Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
0d93effa4d
|
|
@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumInsertionQuals);
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
|
||||
public byte averageDeletionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
|
||||
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,26 +41,26 @@ import java.util.Map;
|
|||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
|
|
@ -69,14 +69,14 @@ import java.util.Map;
|
|||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
|
|
@ -84,52 +84,48 @@ import java.util.Map;
|
|||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(byte base) {
|
||||
public int getCount(final byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(BaseIndex base) {
|
||||
public int getCount(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(byte base) {
|
||||
public long getSumQuals(final byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(BaseIndex base) {
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(byte base) {
|
||||
public byte averageQuals(final byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(BaseIndex base) {
|
||||
public byte averageQuals(final BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfMostCommonBase() {
|
||||
return counts.get(baseIndexWithMostCounts());
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfMostCommonBase() {
|
||||
return sumQuals.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfMostCommonBase() {
|
||||
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
|
||||
public byte averageQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (sumQualsOfBase(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -149,7 +145,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(byte base) {
|
||||
public double baseCountProportion(final byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
}
|
||||
|
||||
|
|
@ -160,7 +156,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(BaseIndex baseIndex) {
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
|
|
@ -177,22 +173,28 @@ import java.util.Map;
|
|||
return b.toString();
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex i : counts.keySet())
|
||||
if (hasHigherCount(i, maxI))
|
||||
maxI = i;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide() && hasHigherCount(index, mostCounts))
|
||||
mostCounts = index;
|
||||
return mostCounts;
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
|
||||
|
|
@ -201,6 +203,30 @@ import java.util.Map;
|
|||
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
|
||||
}
|
||||
|
||||
public byte baseWithMostProbability() {
|
||||
return baseIndexWithMostProbability().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
|
|
@ -218,8 +244,8 @@ import java.util.Map;
|
|||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(BaseIndex index) {
|
||||
int total = totalCountWithoutIndels();
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex index) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
|
|
|
|||
|
|
@ -182,7 +182,7 @@ public class HeaderElement {
|
|||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ public class SlidingWindow {
|
|||
|
||||
private final int nContigs;
|
||||
|
||||
private boolean allowPolyploidReduction;
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
|
||||
/**
|
||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||
|
|
@ -117,7 +117,7 @@ public class SlidingWindow {
|
|||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.nContigs = nContigs;
|
||||
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -207,8 +207,9 @@ public class SlidingWindow {
|
|||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < getStartLocation(windowHeader)) {
|
||||
final int windowHeaderStartLoc = getStartLocation(windowHeader);
|
||||
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < windowHeaderStartLoc) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
|
|
@ -291,7 +292,7 @@ public class SlidingWindow {
|
|||
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
|
||||
|
||||
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
|
||||
addToFilteredData(header, start, endOfFilteredData, isNegativeStrand);
|
||||
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
|
||||
|
||||
if (endOfFilteredData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
|
||||
|
|
@ -418,7 +419,9 @@ public class SlidingWindow {
|
|||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
|
||||
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
|
||||
|
|
@ -434,8 +437,15 @@ public class SlidingWindow {
|
|||
if (!headerElement.hasFilteredData())
|
||||
throw new ReviewedStingException("No filtered data in " + index);
|
||||
|
||||
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
|
||||
result.add(finalizeFilteredDataConsensus());
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
}
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -472,15 +482,15 @@ public class SlidingWindow {
|
|||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
BaseIndex base = baseCounts.baseIndexWithMostCounts();
|
||||
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfMostCommonBase();
|
||||
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
|
||||
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
|
||||
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
|
||||
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfBase(base);
|
||||
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> compressVariantRegion(int start, int stop) {
|
||||
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// Try to compress into a polyploid consensus
|
||||
|
|
@ -490,7 +500,8 @@ public class SlidingWindow {
|
|||
boolean foundEvent = false;
|
||||
Object[] header = windowHeader.toArray();
|
||||
|
||||
if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction
|
||||
// foundEvent will remain false if we don't allow polyploid reduction
|
||||
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
|
||||
for (int i = start; i<=stop; i++) {
|
||||
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
|
||||
if (nHaplotypes > nContigs) {
|
||||
|
|
@ -512,9 +523,6 @@ public class SlidingWindow {
|
|||
}
|
||||
}
|
||||
|
||||
int refStart = windowHeader.get(start).getLocation();
|
||||
int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
// Try to compress the variant region
|
||||
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
|
||||
if (canCompress && foundEvent) {
|
||||
|
|
@ -524,6 +532,9 @@ public class SlidingWindow {
|
|||
// Return all reads that overlap the variant region and remove them from the window header entirely
|
||||
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
|
||||
else {
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) {
|
||||
if (read.getSoftStart() <= refStop) {
|
||||
|
|
@ -549,8 +560,8 @@ public class SlidingWindow {
|
|||
* @return all reads contained in the variant region plus any adjacent synthetic reads
|
||||
*/
|
||||
@Requires("start <= stop")
|
||||
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
|
||||
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop);
|
||||
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
|
||||
|
||||
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
|
||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
|
|
@ -570,7 +581,7 @@ public class SlidingWindow {
|
|||
if (stop < 0 && forceClose)
|
||||
stop = windowHeader.size() - 1;
|
||||
if (stop >= 0) {
|
||||
allReads.addAll(closeVariantRegion(start, stop));
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
lastStop = stop;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ public class SyntheticRead {
|
|||
private String contig;
|
||||
private int contigIndex;
|
||||
private String readName;
|
||||
private Integer refStart;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private boolean isNegativeStrand = false;
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ public class SyntheticRead {
|
|||
* @param refStart the alignment start (reference based)
|
||||
* @param readTag the reduce reads tag for the synthetic read
|
||||
*/
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
final int initialCapacity = 10000;
|
||||
bases = new ArrayList<BaseIndex>(initialCapacity);
|
||||
counts = new ArrayList<Byte>(initialCapacity);
|
||||
|
|
@ -80,7 +80,7 @@ public class SyntheticRead {
|
|||
this.isNegativeStrand = isNegativeRead;
|
||||
}
|
||||
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
this.bases = bases;
|
||||
this.counts = counts;
|
||||
this.quals = quals;
|
||||
|
|
@ -115,11 +115,15 @@ public class SyntheticRead {
|
|||
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
||||
public BaseIndex getBase(int readCoordinate) {
|
||||
public BaseIndex getBase(final int readCoordinate) {
|
||||
return bases.get(readCoordinate);
|
||||
}
|
||||
|
||||
/**
|
||||
public int getRefStart() {
|
||||
return refStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
|
||||
*
|
||||
* Invalid reads are :
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
*
|
||||
*
|
||||
*/
|
||||
protected static class SumIterator {
|
||||
public static class SumIterator {
|
||||
private int[] currentState;
|
||||
private final int[] finalState;
|
||||
private final int restrictSumTo;
|
||||
|
|
@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
|
||||
// and we repeat until queue is empty
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(likelihoodDim);
|
||||
// add AC=0 to the queue
|
||||
final int[] zeroCounts = new int[nAlleles];
|
||||
zeroCounts[0] = numChromosomes;
|
||||
|
||||
AlleleFrequencyCalculationModel.ExactACset zeroSet =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
|
||||
ExactACset zeroSet =
|
||||
new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
// clean up memory
|
||||
indexesToACset.remove(ACset.ACcounts);
|
||||
indexesToACset.remove(ACset.getACcounts());
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
int plIdx = 0;
|
||||
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
AlleleFrequencyCalculationModel.ExactACset ACset =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
|
||||
ExactACset ACset =
|
||||
new ExactACset(1, new ExactACcounts(iterator.getCurrentVector()));
|
||||
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
|
||||
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
|
||||
|
||||
setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
|
||||
setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]);
|
||||
iterator.next();
|
||||
}
|
||||
}
|
||||
|
|
@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
}
|
||||
|
||||
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
|
||||
private double calculateACConformationAndUpdateQueue(final ExactACset set,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final double maxLog10L,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
|
||||
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts,
|
||||
ExactACset> indexesToACset,
|
||||
final ReadBackedPileup pileup) {
|
||||
// compute likelihood of set
|
||||
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
|
||||
final double log10LofK = set.log10Likelihoods[0];
|
||||
final double log10LofK = set.getLog10Likelihoods()[0];
|
||||
|
||||
// log result in PL vector
|
||||
int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
|
||||
int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes);
|
||||
setLogPLs(idx, log10LofK);
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
|
||||
final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
|
||||
final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0];
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
|
||||
// add conformations for other cases
|
||||
for ( int allele = 1; allele < nAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// is this a valid conformation?
|
||||
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
|
||||
|
|
@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
* @param numObservations Number of observations for each allele
|
||||
* @param pileup Read backed pileup in case it's necessary
|
||||
*/
|
||||
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public abstract void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
ExactACset newSet = new ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -188,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final ReadBackedPileup pileup) {
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
|
||||
double p1 = 0.0;
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
|
|
@ -218,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
}
|
||||
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
|
||||
}
|
||||
ACset.log10Likelihoods[0] = p1;
|
||||
ACset.getLog10Likelihoods()[0] = p1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
|
|
@ -12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
|
@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final ReadBackedPileup pileup) {
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length);
|
||||
final int[] ac = new int[BaseUtils.BASES.length];
|
||||
|
||||
for (int k=0; k < BaseUtils.BASES.length; k++ )
|
||||
|
|
@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
final byte qual = qualToUse(elt, true, true, mbq);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
final double acc[] = new double[ACset.ACcounts.counts.length];
|
||||
final double acc[] = new double[ACset.getACcounts().getCounts().length];
|
||||
for (int k=0; k < acc.length; k++ )
|
||||
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
|
||||
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]]
|
||||
- LOG10_PLOIDY;
|
||||
p1 += MathUtils.log10sumLog10(acc);
|
||||
}
|
||||
|
|
@ -264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
|
||||
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
|
||||
}
|
||||
ACset.log10Likelihoods[0] = p1;
|
||||
ACset.getLog10Likelihoods()[0] = p1;
|
||||
/* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
|
||||
System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,277 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.TTCCLayout;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 10/2/12
|
||||
* Time: 10:25 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class AFCalcPerformanceTest {
|
||||
final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);
|
||||
|
||||
private static abstract class Analysis {
|
||||
final GATKReport report;
|
||||
|
||||
public Analysis(final String name, final List<String> columns) {
|
||||
report = GATKReport.newSimpleReport(name, columns);
|
||||
}
|
||||
|
||||
public abstract void run(final AFCalcTestBuilder testBuilder,
|
||||
final List<Object> coreColumns);
|
||||
|
||||
public String getName() {
|
||||
return getTable().getTableName();
|
||||
}
|
||||
|
||||
public GATKReportTable getTable() {
|
||||
return report.getTables().iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByACAndPL extends Analysis {
|
||||
public AnalyzeByACAndPL(final List<String> columns) {
|
||||
super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
int otherAC = 0;
|
||||
int nAltSeg = 0;
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
nAltSeg += ACs[i] > 0 ? 1 : 0;
|
||||
if ( i > 0 ) otherAC += ACs[i];
|
||||
}
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
|
||||
if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
|
||||
|
||||
final List<int[]> ACs = new LinkedList<int[]>();
|
||||
|
||||
final List<Integer> ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1); //Arrays.asList(0, 1, 2, 3, 6, 10, 20, 40, 60, 100, 200, 400, 600, 1000, 2000, 4000, 6000, 10000, 100000);
|
||||
|
||||
for ( int i : ACsToTry ) {
|
||||
if ( i < nChrom ) {
|
||||
if ( nAltAlleles == 1 ) {
|
||||
ACs.add(new int[]{i});
|
||||
} else if ( nAltAlleles == 2 ) {
|
||||
for ( int j : ACsToTry ) {
|
||||
if ( j < nChrom - i )
|
||||
ACs.add(new int[]{i, j});
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException("cannot get here");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ACs;
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeBySingletonPosition extends Analysis {
|
||||
public AnalyzeBySingletonPosition(final List<String> columns) {
|
||||
super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL);
|
||||
|
||||
for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
|
||||
Collections.rotate(genotypes, position);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByNonInformative extends Analysis {
|
||||
public AnalyzeByNonInformative(final List<String> columns) {
|
||||
super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
|
||||
for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class ModelParams {
|
||||
final AFCalcFactory.Calculation modelType;
|
||||
final int maxBiNSamples, maxTriNSamples;
|
||||
|
||||
private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) {
|
||||
this.modelType = modelType;
|
||||
this.maxBiNSamples = maxBiNSamples;
|
||||
this.maxTriNSamples = maxTriNSamples;
|
||||
}
|
||||
|
||||
public boolean meetsConstraints(final int nAltAlleles, final int nSamples) {
|
||||
if ( nAltAlleles == 1 )
|
||||
return nSamples <= maxBiNSamples;
|
||||
else if ( nAltAlleles == 2 )
|
||||
return nSamples <= maxTriNSamples;
|
||||
else
|
||||
throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles);
|
||||
}
|
||||
}
|
||||
|
||||
public enum Operation {
|
||||
ANALYZE,
|
||||
SINGLE
|
||||
}
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final TTCCLayout layout = new TTCCLayout();
|
||||
layout.setThreadPrinting(false);
|
||||
layout.setCategoryPrefixing(false);
|
||||
layout.setContextPrinting(false);
|
||||
logger.addAppender(new ConsoleAppender(layout));
|
||||
|
||||
final Operation op = Operation.valueOf(args[0]);
|
||||
|
||||
switch ( op ) {
|
||||
case ANALYZE: analyze(args); break;
|
||||
case SINGLE: profileBig(args); break;
|
||||
default: throw new IllegalAccessException("unknown operation " + op);
|
||||
}
|
||||
}
|
||||
|
||||
private static void profileBig(final String[] args) throws Exception {
|
||||
final int nSamples = Integer.valueOf(args[1]);
|
||||
final int ac = Integer.valueOf(args[2]);
|
||||
|
||||
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(nSamples, 1,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final VariantContext vc = testBuilder.makeACTest(new int[]{ac}, 0, 100);
|
||||
|
||||
final SimpleTimer timer = new SimpleTimer().start();
|
||||
final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vc, testBuilder.makePriors());
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
logger.info("result " + resultTracker.getLog10PosteriorOfAFGT0());
|
||||
logger.info("runtime " + runtime);
|
||||
}
|
||||
|
||||
private static void analyze(final String[] args) throws Exception {
|
||||
final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
|
||||
"exact.model", "prior.type", "runtime", "n.evaluations");
|
||||
|
||||
final PrintStream out = new PrintStream(new FileOutputStream(args[1]));
|
||||
|
||||
final List<ModelParams> modelParams = Arrays.asList(
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
|
||||
// new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100),
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));
|
||||
|
||||
final boolean ONLY_HUMAN_PRIORS = false;
|
||||
final List<AFCalcTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
|
||||
? Arrays.asList(AFCalcTestBuilder.PriorType.values())
|
||||
: Arrays.asList(AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final List<Analysis> analyzes = new ArrayList<Analysis>();
|
||||
analyzes.add(new AnalyzeByACAndPL(coreColumns));
|
||||
analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
|
||||
//analyzes.add(new AnalyzeByNonInformative(coreColumns));
|
||||
|
||||
for ( int iteration = 0; iteration < 1; iteration++ ) {
|
||||
for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
for ( final ModelParams modelToRun : modelParams) {
|
||||
if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) {
|
||||
for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) {
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType);
|
||||
|
||||
for ( final Analysis analysis : analyzes ) {
|
||||
logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName())));
|
||||
final List<?> values = Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType);
|
||||
analysis.run(testBuilder, (List<Object>)values);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final GATKReport report = new GATKReport();
|
||||
for ( final Analysis analysis : analyzes )
|
||||
report.addTable(analysis.getTable());
|
||||
report.print(out);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class AFCalcTestBuilder {
|
||||
final static Allele A = Allele.create("A", true);
|
||||
final static Allele C = Allele.create("C");
|
||||
final static Allele G = Allele.create("G");
|
||||
final static Allele T = Allele.create("T");
|
||||
final static Allele AA = Allele.create("AA");
|
||||
final static Allele AT = Allele.create("AT");
|
||||
final static Allele AG = Allele.create("AG");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
|
||||
final int nSamples;
|
||||
final int numAltAlleles;
|
||||
final AFCalcFactory.Calculation modelType;
|
||||
final PriorType priorType;
|
||||
|
||||
public AFCalcTestBuilder(final int nSamples, final int numAltAlleles,
|
||||
final AFCalcFactory.Calculation modelType, final PriorType priorType) {
|
||||
this.nSamples = nSamples;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.modelType = modelType;
|
||||
this.priorType = priorType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType);
|
||||
}
|
||||
|
||||
public enum PriorType {
|
||||
flat,
|
||||
human
|
||||
}
|
||||
|
||||
public int getnSamples() {
|
||||
return nSamples;
|
||||
}
|
||||
|
||||
public AFCalc makeModel() {
|
||||
return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2);
|
||||
}
|
||||
|
||||
public double[] makePriors() {
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
|
||||
switch ( priorType ) {
|
||||
case flat:
|
||||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
}
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final List<Integer> ACs, final int nNonInformative, final int nonTypePL) {
|
||||
return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) {
|
||||
final int nChrom = nSamples * 2;
|
||||
|
||||
final int[] nhet = new int[numAltAlleles];
|
||||
final int[] nhomvar = new int[numAltAlleles];
|
||||
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
final double p = ACs[i] / (1.0 * nChrom);
|
||||
nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p);
|
||||
nhet[i] = ACs[i] - 2 * nhomvar[i];
|
||||
|
||||
if ( nhet[i] < 0 )
|
||||
throw new IllegalStateException("Bug! nhet[i] < 0");
|
||||
}
|
||||
|
||||
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
|
||||
if ( calcAC != MathUtils.sum(ACs) )
|
||||
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
|
||||
|
||||
return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) {
|
||||
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
|
||||
|
||||
for ( int altI = 0; altI < nhet.length; altI++ ) {
|
||||
for ( int i = 0; i < nhet[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
|
||||
for ( int i = 0; i < nhomvar[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
|
||||
}
|
||||
|
||||
final Genotype nonInformative = makeNonInformative();
|
||||
samples.addAll(Collections.nCopies(nNonInformative, nonInformative));
|
||||
|
||||
final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0);
|
||||
samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0)));
|
||||
|
||||
samples = samples.subList(0, nSamples);
|
||||
|
||||
if ( samples.size() > nSamples )
|
||||
throw new IllegalStateException("too many samples");
|
||||
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
|
||||
vcb.genotypes(samples);
|
||||
return vcb.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
|
||||
switch (type) {
|
||||
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
|
||||
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
|
||||
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
|
||||
default: throw new IllegalArgumentException("Unexpected type " + type);
|
||||
}
|
||||
}
|
||||
|
||||
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private int numPLs() {
|
||||
return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
|
||||
}
|
||||
|
||||
public Genotype makeNonInformative() {
|
||||
final int[] nonInformativePLs = new int[GenotypeLikelihoods.numLikelihoods(numAltAlleles, 2)];
|
||||
return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs);
|
||||
}
|
||||
|
||||
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(getAlleles(type, altI));
|
||||
|
||||
final int[] pls = new int[numPLs()];
|
||||
Arrays.fill(pls, nonTypePL);
|
||||
|
||||
int index = 0;
|
||||
switch ( type ) {
|
||||
case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
|
||||
case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
|
||||
case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
|
||||
}
|
||||
pls[index] = 0;
|
||||
gb.PL(pls);
|
||||
|
||||
return gb.make();
|
||||
}
|
||||
}
|
||||
|
|
@ -23,56 +23,55 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
ploidy = UAC.samplePloidy;
|
||||
this.UAC = UAC;
|
||||
|
||||
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
@Override
|
||||
protected VariantContext reduceScope(VariantContext vc) {
|
||||
final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
if ( vc.getAlternateAlleles().size() > maxAltAlleles) {
|
||||
logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
final List<Allele> alleles = new ArrayList<Allele>(maxAltAlleles + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy));
|
||||
|
||||
|
||||
GLs = subsetAlleles(vc, alleles, false, ploidy);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
|
||||
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
|
||||
return resultFromTracker(vc, log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple wrapper class to hold values of combined pool likelihoods.
|
||||
|
|
@ -94,8 +93,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
public void add(ExactACset set) {
|
||||
alleleCountSetList.add(set);
|
||||
conformationMap.put(set.ACcounts, set);
|
||||
final double likelihood = set.log10Likelihoods[0];
|
||||
conformationMap.put(set.getACcounts(), set);
|
||||
final double likelihood = set.getLog10Likelihoods()[0];
|
||||
|
||||
if (likelihood > maxLikelihood )
|
||||
maxLikelihood = likelihood;
|
||||
|
|
@ -108,11 +107,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
}
|
||||
|
||||
public double getLikelihoodOfConformation(int[] ac) {
|
||||
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
|
||||
return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
|
||||
}
|
||||
|
||||
public double getGLOfACZero() {
|
||||
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
|
||||
return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
|
|
@ -136,7 +135,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
|
|
@ -171,15 +170,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param numAlleles Number of alternate alleles
|
||||
* @param ploidyPerPool Number of samples per pool
|
||||
* @param log10AlleleFrequencyPriors Frequency priors
|
||||
* @param result object to fill with output values
|
||||
* @param resultTracker object to fill with output values
|
||||
*/
|
||||
protected static void combineSinglePools(final GenotypesContext GLs,
|
||||
final int numAlleles,
|
||||
final int ploidyPerPool,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
|
||||
|
||||
|
||||
int combinedPloidy = 0;
|
||||
|
|
@ -190,21 +189,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// first element: zero ploidy, e.g. trivial degenerate distribution
|
||||
final int[] zeroCounts = new int[numAlleles];
|
||||
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
set.log10Likelihoods[0] = 0.0;
|
||||
set.getLog10Likelihoods()[0] = 0.0;
|
||||
|
||||
combinedPoolLikelihoods.add(set);
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
result.reset();
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, result);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
}
|
||||
|
||||
if ( genotypeLikelihoods.size() <= 1 ) {
|
||||
// no meaningful GLs at all, just set the tracker to non poly values
|
||||
resultTracker.reset(); // just mimic-ing call below
|
||||
resultTracker.setLog10LikelihoodOfAFzero(0.0);
|
||||
} else {
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, resultTracker);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
|
||||
|
||||
|
|
@ -221,23 +226,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
StateTracker stateTracker = new StateTracker();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
resultTracker.incNEvaluations();
|
||||
// compute log10Likelihoods
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
|
||||
maxLikelihoodSeen.update(log10LofKs, ACset.ACcounts);
|
||||
if ( log10LofKs > stateTracker.getMaxLog10L())
|
||||
stateTracker.update(log10LofKs, ACset.getACcounts());
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(ACset.ACcounts);
|
||||
indexesToACset.remove(ACset.getACcounts());
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
|
||||
|
||||
}
|
||||
return newPool;
|
||||
|
|
@ -253,8 +259,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param log10AlleleFrequencyPriors Prior object
|
||||
* @param originalPloidy Total ploidy of original combined pool
|
||||
* @param newGLPloidy Ploidy of GL vector
|
||||
* @param result AFResult object
|
||||
* @param maxLikelihoodSeen max likelihood observed so far
|
||||
* @param resultTracker AFResult object
|
||||
* @param stateTracker max likelihood observed so far
|
||||
* @param ACqueue Queue of conformations to compute
|
||||
* @param indexesToACset AC indices of objects in queue
|
||||
* @return max log likelihood
|
||||
|
|
@ -266,15 +272,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double[] log10AlleleFrequencyPriors,
|
||||
final int originalPloidy,
|
||||
final int newGLPloidy,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final MaxLikelihoodSeen maxLikelihoodSeen,
|
||||
final AFCalcResultTracker resultTracker,
|
||||
final StateTracker stateTracker,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
|
||||
// compute likeihood in "set" of new set based on original likelihoods
|
||||
final int numAlleles = set.ACcounts.counts.length;
|
||||
final int numAlleles = set.getACcounts().getCounts().length;
|
||||
final int newPloidy = set.getACsum();
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
|
||||
|
||||
|
||||
// add to new pool
|
||||
|
|
@ -282,24 +288,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
newPool.add(set);
|
||||
|
||||
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
|
||||
//if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
|
||||
if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
|
||||
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLikelihoodSeen.maxLog10L);
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
|
||||
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
|
||||
final int ACwiggle = set.ACcounts.counts[0];
|
||||
final int ACwiggle = set.getACcounts().getCounts()[0];
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
|
||||
// add conformations for other cases
|
||||
for ( int allele = 1; allele < numAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// is this a valid conformation?
|
||||
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
|
||||
|
|
@ -329,11 +335,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param ploidy2 Ploidy of second pool
|
||||
* @param numAlleles Number of alleles
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @param resultTracker Af calculation result object
|
||||
*/
|
||||
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
/*
|
||||
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
|
||||
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
|
||||
|
|
@ -387,7 +393,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param numAlleles Number of alleles (including ref)
|
||||
* @param ploidy1 Ploidy of original pool (combined)
|
||||
* @param ploidy2 Ploidy of new pool
|
||||
* @param result AFResult object
|
||||
* @param resultTracker AFResult object
|
||||
* @return log-likehood of requested conformation
|
||||
*/
|
||||
private static double computeLofK(final ExactACset set,
|
||||
|
|
@ -395,7 +401,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double[] secondGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int numAlleles, final int ploidy1, final int ploidy2,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
|
|
@ -404,17 +410,17 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
if (newPloidy != totalAltK)
|
||||
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
|
||||
|
||||
totalAltK -= set.ACcounts.counts[0];
|
||||
totalAltK -= set.getACcounts().getCounts()[0];
|
||||
// totalAltK has sum of alt alleles of conformation now
|
||||
|
||||
|
||||
// special case for k = 0 over all k
|
||||
if ( totalAltK == 0 ) { // all-ref case
|
||||
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
|
||||
set.log10Likelihoods[0] = log10Lof0;
|
||||
set.getLog10Likelihoods()[0] = log10Lof0;
|
||||
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return log10Lof0;
|
||||
|
||||
} else {
|
||||
|
|
@ -423,12 +429,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
|
||||
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
|
||||
|
||||
int[] currentCount = set.ACcounts.getCounts();
|
||||
int[] currentCount = set.getACcounts().getCounts();
|
||||
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
|
||||
|
||||
// for current conformation, get all possible ways to break vector K into two components G1 and G2
|
||||
final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
|
||||
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
|
||||
set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
|
||||
while (innerIterator.hasNext()) {
|
||||
// check if breaking current conformation into g1 and g2 is feasible.
|
||||
final int[] acCount2 = innerIterator.getCurrentVector();
|
||||
|
|
@ -444,27 +450,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
|
||||
final double sum = firstGL + gl2 + num1 + num2;
|
||||
|
||||
set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
|
||||
set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
|
||||
}
|
||||
}
|
||||
innerIterator.next();
|
||||
}
|
||||
|
||||
set.log10Likelihoods[0] += denom;
|
||||
set.getLog10Likelihoods()[0] += denom;
|
||||
}
|
||||
|
||||
double log10LofK = set.log10Likelihoods[0];
|
||||
double log10LofK = set.getLog10Likelihoods()[0];
|
||||
|
||||
// update the MLE if necessary
|
||||
final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
|
||||
result.updateMLEifNeeded(log10LofK, altCounts);
|
||||
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
|
||||
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for (final int ACcount : altCounts ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
result.updateMAPifNeeded(log10LofK, altCounts);
|
||||
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
|
@ -496,12 +502,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @param resultTracker Af calculation result object
|
||||
* @return Combined likelihood vector
|
||||
*/
|
||||
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
|
||||
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
|
|
@ -526,8 +532,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
|
||||
final double log10Lof0 = x[0]+y[0];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
|
||||
double maxElement = log10Lof0;
|
||||
int maxElementIdx = 0;
|
||||
|
|
@ -569,8 +575,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
}
|
||||
|
||||
alleleCounts[0] = k;
|
||||
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -622,7 +628,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// create the new likelihoods array from the alleles we are allowed to use
|
||||
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
|
||||
double[] newLikelihoods;
|
||||
if ( numOriginalAltAlleles == numNewAltAlleles) {
|
||||
|
||||
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
|
||||
// and subsetting
|
||||
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
|
||||
newLikelihoods = originalLikelihoods;
|
||||
} else {
|
||||
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
|
||||
|
|
@ -283,7 +283,7 @@ public class GenotypingEngine {
|
|||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
|
|
@ -308,9 +308,20 @@ public class GenotypingEngine {
|
|||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
|
||||
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call != null ) {
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleHashMap = alleleHashMapTrim;
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ import java.util.*;
|
|||
public class LikelihoodCalculationEngine {
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
|
@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine {
|
|||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine {
|
|||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
//final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
|
||||
|
|
@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine {
|
|||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() )
|
||||
likelihoodMap.add(read,a,0.0);
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
|
|||
|
||||
String name = String.format("Test-%s", params.bases);
|
||||
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
|
||||
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
|
||||
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
|
||||
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -60,31 +60,31 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","a636ae291a27843107294f3e7940b98a");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","6d60d9f3dfe8e1580214be0d170b0fff");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","738fa68a3fc838b4bbad5c257f3e96fe");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","30abf3c1868a61145edbc08fe35c8150");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9bcf1f2c204a251ee2b0b6f17ed59a61");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","ef99bc0513d3267f43b84cb88a324376");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c73678eeaad574af9ed45045074828fa");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","8ca07270717641385fe5d2e07e530782");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "d1b48f6f3a175fcba9aec6d427005a45");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "f8ea18ec6a717a77fdf8c5f2482d8d8d");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,603 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class AFCalcUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
|
||||
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
|
||||
final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true); // flat priors
|
||||
|
||||
final private static boolean INCLUDE_BIALLELIC = true;
|
||||
final private static boolean INCLUDE_TRIALLELIC = true;
|
||||
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
|
||||
final private static boolean DEBUG_ONLY = false;
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
|
||||
AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
|
||||
BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
|
||||
NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
|
||||
|
||||
AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
|
||||
AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
|
||||
BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
|
||||
AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
|
||||
BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
|
||||
CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
|
||||
NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
final AFCalc calc;
|
||||
final int[] expectedACs;
|
||||
final double[] priors;
|
||||
final String priorName;
|
||||
|
||||
private GetGLsTest(final AFCalc calc, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
|
||||
super(GetGLsTest.class);
|
||||
GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.calc = calc;
|
||||
this.priors = priors;
|
||||
this.priorName = priorName;
|
||||
|
||||
expectedACs = new int[numAltAlleles+1];
|
||||
for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
|
||||
expectedACs[alleleI] = 0;
|
||||
final Allele allele = getAlleles().get(alleleI);
|
||||
for ( Genotype g : arg ) {
|
||||
expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public AFCalcResult execute() {
|
||||
return getCalc().getLog10PNonRef(getVC(), getPriors());
|
||||
}
|
||||
|
||||
public AFCalcResult executeRef() {
|
||||
final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles());
|
||||
return ref.getLog10PNonRef(getVC(), getPriors());
|
||||
}
|
||||
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
public AFCalc getCalc() {
|
||||
return calc;
|
||||
}
|
||||
|
||||
public VariantContext getVC() {
|
||||
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
|
||||
builder.genotypes(GLs);
|
||||
return builder.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(Allele.create("A", true),
|
||||
Allele.create("C"),
|
||||
Allele.create("G"),
|
||||
Allele.create("T")).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public int getExpectedAltAC(final int alleleI) {
|
||||
return expectedACs[alleleI+1];
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
|
||||
priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "wellFormedGLs")
|
||||
public Object[][] createSimpleGLsData() {
|
||||
final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
|
||||
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
|
||||
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
|
||||
|
||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||
for ( AFCalc model : calcs ) {
|
||||
final String priorName = priors == humanPriors ? "human" : "flat";
|
||||
|
||||
// bi-allelic
|
||||
if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 1, genotypes, priors, priorName);
|
||||
|
||||
// tri-allelic
|
||||
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@DataProvider(name = "badGLs")
|
||||
public Object[][] createBadGLs() {
|
||||
final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
|
||||
final int nSamples = genotypes.size();
|
||||
|
||||
final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
for ( AFCalc model : Arrays.asList(indCalc) ) {
|
||||
final String priorName = "flat";
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
|
||||
public void testBiallelicGLs(GetGLsTest cfg) {
|
||||
if ( cfg.getAlleles().size() == 2 )
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
|
||||
public void testTriallelicGLs(GetGLsTest cfg) {
|
||||
if ( cfg.getAlleles().size() > 2 )
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "badGLs")
|
||||
public void testBadGLs(GetGLsTest cfg) {
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
private static class NonInformativeData {
|
||||
final Genotype nonInformative;
|
||||
final List<Genotype> called;
|
||||
final int nAltAlleles;
|
||||
|
||||
private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
|
||||
this.called = called;
|
||||
this.nonInformative = nonInformative;
|
||||
this.nAltAlleles = nAltAlleles;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "GLsWithNonInformative")
|
||||
public Object[][] makeGLsWithNonInformative() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));
|
||||
|
||||
for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
|
||||
for ( final NonInformativeData testData : nonInformativeTests ) {
|
||||
final List<Genotype> samples = new ArrayList<Genotype>();
|
||||
samples.addAll(testData.called);
|
||||
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
|
||||
|
||||
final int nSamples = samples.size();
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors
|
||||
|
||||
for ( AFCalc model : calcs ) {
|
||||
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
|
||||
|
||||
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
|
||||
Collections.rotate(samples, 1);
|
||||
final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
|
||||
tests.add(new Object[]{onlyInformative, withNonInformative});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
|
||||
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
|
||||
final AFCalcResult expected = onlyInformative.execute();
|
||||
final AFCalcResult actual = withNonInformative.execute();
|
||||
|
||||
testResultSimple(withNonInformative);
|
||||
compareAFCalcResults(actual, expected, onlyInformative.getCalc(), true);
|
||||
}
|
||||
|
||||
private void testResultSimple(final GetGLsTest cfg) {
|
||||
final AFCalcResult refResultTracker = cfg.executeRef();
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);
|
||||
|
||||
Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
|
||||
Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
|
||||
|
||||
for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
|
||||
int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
|
||||
int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI];
|
||||
|
||||
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
|
||||
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
|
||||
}
|
||||
}
|
||||
|
||||
private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) {
|
||||
// note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here
|
||||
final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results
|
||||
|
||||
if ( ! onlyPosteriorsShouldBeEqual ) {
|
||||
Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
|
||||
Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0");
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0");
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0");
|
||||
}
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0");
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0");
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs");
|
||||
Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping");
|
||||
|
||||
for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
|
||||
if ( ! a.isReference() ) {
|
||||
Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
|
||||
// TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
|
||||
// if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
|
||||
// // TODO -- delete when general ploidy works properly with multi-allelics
|
||||
// Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testLargeGLs(final ExactAFCalc calc) {
|
||||
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMLE()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testMismatchedGLs(final ExactAFCalc calc) {
|
||||
final Genotype AB = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000);
|
||||
final Genotype AC = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[0], 1);
|
||||
Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[1], 1);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to test that the pNonRef value is meaningful
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private static class PNonRefData {
|
||||
final Genotype g;
|
||||
final double pNonRef, tolerance;
|
||||
final boolean canScale;
|
||||
final List<AFCalcFactory.Calculation> badModels;
|
||||
final VariantContext vc;
|
||||
|
||||
private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) {
|
||||
this(vc, g, pNonRef, tolerance, canScale, Collections.<AFCalcFactory.Calculation>emptyList());
|
||||
}
|
||||
|
||||
private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List<AFCalcFactory.Calculation> badModels) {
|
||||
this.g = g;
|
||||
this.pNonRef = pNonRef;
|
||||
this.tolerance = tolerance;
|
||||
this.canScale = canScale;
|
||||
this.badModels = badModels;
|
||||
this.vc = vc;
|
||||
}
|
||||
|
||||
public PNonRefData scale(final int scaleFactor) {
|
||||
if ( canScale ) {
|
||||
final int[] PLs = new int[g.getPL().length];
|
||||
for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1);
|
||||
final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make();
|
||||
final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor);
|
||||
return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "PNonRef")
|
||||
public Object[][] makePNonRefTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Allele> AA = Arrays.asList(A, A);
|
||||
final List<Allele> AC = Arrays.asList(A, C);
|
||||
final List<Allele> CC = Arrays.asList(C, C);
|
||||
final List<Allele> AG = Arrays.asList(A, G);
|
||||
final List<Allele> GG = Arrays.asList(G, G);
|
||||
final List<Allele> CG = Arrays.asList(C, G);
|
||||
|
||||
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
|
||||
final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;
|
||||
|
||||
final List<AFCalcFactory.Calculation> constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED);
|
||||
|
||||
final double TOLERANCE = 0.5;
|
||||
|
||||
final List<PNonRefData> initialPNonRefData = Arrays.asList(
|
||||
// bi-allelic sites
|
||||
new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
|
||||
new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),
|
||||
|
||||
// tri-allelic sites -- cannot scale because of the naivety of our scaling estimator
|
||||
new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit inaccurate
|
||||
new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false)
|
||||
);
|
||||
|
||||
for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) {
|
||||
for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) {
|
||||
for ( final PNonRefData rootData : initialPNonRefData ) {
|
||||
for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) {
|
||||
if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) {
|
||||
final PNonRefData data = rootData.scale(plScale);
|
||||
tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRef")
|
||||
private void testPNonRef(final VariantContext vcRoot,
|
||||
AFCalcFactory.Calculation modelType,
|
||||
AFCalcTestBuilder.PriorType priorType,
|
||||
final List<Genotype> genotypes,
|
||||
final double expectedPNonRef,
|
||||
final double tolerance,
|
||||
final int nNonInformative) {
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType);
|
||||
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
|
||||
|
||||
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance,
|
||||
"Actual pNonRef not within tolerance " + tolerance + " of expected");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test priors
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "Models")
|
||||
public Object[][] makeModels() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) {
|
||||
if ( calc.usableForParams(2, 4) )
|
||||
tests.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true & ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testBiallelicPriors(final AFCalc model) {
|
||||
|
||||
for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
|
||||
|
||||
for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) {
|
||||
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
|
||||
final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true);
|
||||
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];
|
||||
|
||||
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
|
||||
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
|
||||
final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
|
||||
final double log10NonRefPost = Math.log10(nonRefPost);
|
||||
|
||||
if ( ! Double.isInfinite(log10NonRefPost) )
|
||||
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);
|
||||
|
||||
if ( nonRefPost >= 0.9 )
|
||||
Assert.assertTrue(resultTracker.isPolymorphic(C, -1));
|
||||
|
||||
final int expectedMLEAC = 1; // the MLE is independent of the prior
|
||||
Assert.assertEquals(actualAC, expectedMLEAC,
|
||||
"actual AC with priors " + log10NonRefPrior + " not expected "
|
||||
+ expectedMLEAC + " priors " + Utils.join(",", priors));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test that polymorphic sites (bi and tri) are properly called
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "polyTestProvider")
|
||||
public Object[][] makePolyTestProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// list of all high-quality models in the system
|
||||
final List<AFCalcFactory.Calculation> models = Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT,
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT);
|
||||
|
||||
// note that we cannot use small PLs here or the thresholds are hard to set
|
||||
for ( final int nonTypePLs : Arrays.asList(100, 1000) ) {
|
||||
for ( final AFCalcFactory.Calculation model : models ) {
|
||||
for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
// for ( final int nonTypePLs : Arrays.asList(10) ) {
|
||||
// for ( final AFCalcFactory.Calculation model : models ) {
|
||||
// for ( final int allele1AC : Arrays.asList(100) ) {
|
||||
// for ( final int nSamples : Arrays.asList(1000) ) {
|
||||
if ( nSamples < allele1AC ) continue;
|
||||
|
||||
final double pPerSample = Math.pow(10, nonTypePLs / -10.0);
|
||||
final double errorFreq = pPerSample * nSamples;
|
||||
final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30;
|
||||
|
||||
// bi-allelic tests
|
||||
{
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human);
|
||||
final List<Integer> ACs = Arrays.asList(allele1AC);
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)});
|
||||
}
|
||||
|
||||
// multi-allelic tests
|
||||
for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) {
|
||||
if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1)
|
||||
continue;
|
||||
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, 2, model, AFCalcTestBuilder.PriorType.human);
|
||||
final List<Integer> ACs = Arrays.asList(allele1AC, allele2AC);
|
||||
final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90;
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider")
|
||||
public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
|
||||
testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
|
||||
}
|
||||
|
||||
@DataProvider(name = "polyTestProviderLotsOfAlleles")
|
||||
public Object[][] makepolyTestProviderLotsOfAlleles() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// list of all high-quality models in the system
|
||||
final List<AFCalcFactory.Calculation> models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT);
|
||||
|
||||
final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20);
|
||||
|
||||
final int nonTypePLs = 1000;
|
||||
final int nAlleles = 4;
|
||||
for ( final AFCalcFactory.Calculation model : models ) {
|
||||
for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) {
|
||||
final List<Boolean> isPoly = new ArrayList<Boolean>(ACs.size());
|
||||
for ( final int ac : ACs ) isPoly.add(ac > 0);
|
||||
|
||||
final double acSum = MathUtils.sum(ACs);
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100) ) {
|
||||
if ( nSamples < acSum ) continue;
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human);
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, isPoly});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles")
|
||||
public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
|
||||
testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
|
||||
}
|
||||
|
||||
private void testCalling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
|
||||
final AFCalcResult result = calc.getLog10PNonRef(vc, priors);
|
||||
|
||||
boolean anyPoly = false;
|
||||
for ( final boolean onePoly : expectedPoly ) anyPoly = anyPoly || onePoly;
|
||||
|
||||
if ( anyPoly )
|
||||
Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1);
|
||||
|
||||
for ( int altI = 1; altI < result.getAllelesUsedInGenotyping().size(); altI++ ) {
|
||||
final int i = altI - 1;
|
||||
final Allele alt = result.getAllelesUsedInGenotyping().get(altI);
|
||||
|
||||
// must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
|
||||
Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
|
||||
Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class ConstrainedAFCalculationModelUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
return AFCalcUnitTest.makePL(expectedGT, pls);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MaxACsToVisit")
|
||||
public Object[][] makeMaxACsToVisit() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final int nSamples = 10;
|
||||
|
||||
for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) {
|
||||
final int nChrom = (nSamples - nNonInformative) * 2;
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
// bi-allelic
|
||||
tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
|
||||
|
||||
// tri-allelic
|
||||
for ( int j = 0; j < (nChrom - i); j++)
|
||||
tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "MaxACsToVisit")
|
||||
public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) {
|
||||
final int nAlts = requestedACs.size();
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAlts, modelType,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
|
||||
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
|
||||
|
||||
testExpectedACs(vc, maxACsToVisit);
|
||||
}
|
||||
|
||||
private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
|
||||
// this is necessary because cannot ensure that the tester gives us back the
|
||||
// requested ACs due to rounding errors
|
||||
final List<Integer> ACs = new ArrayList<Integer>();
|
||||
for ( final Allele a : vc.getAlternateAlleles() )
|
||||
ACs.add(vc.getCalledChrCount(a));
|
||||
|
||||
for ( int i = 0; i < maxACsToVisit.length; i++ ) {
|
||||
Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MaxACsGenotypes")
|
||||
public Object[][] makeMaxACsForGenotype() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Allele> AA = Arrays.asList(A, A);
|
||||
final List<Allele> AC = Arrays.asList(A, C);
|
||||
final List<Allele> CC = Arrays.asList(C, C);
|
||||
final List<Allele> AG = Arrays.asList(A, G);
|
||||
final List<Allele> GG = Arrays.asList(G, G);
|
||||
final List<Allele> CG = Arrays.asList(C, G);
|
||||
|
||||
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
|
||||
|
||||
tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
|
||||
tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});
|
||||
|
||||
// make sure non-informative => 0
|
||||
tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});
|
||||
|
||||
// multi-allelics
|
||||
tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});
|
||||
|
||||
// deal with non-informatives third alleles
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
|
||||
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
|
||||
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
|
||||
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
|
||||
|
||||
testExpectedACs(vc, maxACsToVisit);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -137,15 +138,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
|
|||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
|
||||
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
|
||||
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
// SEE private/R/pls.R if you want the truth output for these tests
|
||||
public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
|
||||
@DataProvider(name = "TestCombineGLs")
|
||||
public Object[][] makeTestCombineGLs() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)});
|
||||
tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)});
|
||||
tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)});
|
||||
|
||||
// AA AB BB AC BC CC => AA AB+BC CC
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@DataProvider(name = "TestCombineGLsWithDrops")
|
||||
public Object[][] makeTestCombineGLsWithDrops() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Set<Integer> noDrops = Collections.emptySet();
|
||||
final Set<Integer> drop1 = Collections.singleton(1);
|
||||
final Set<Integer> drop2 = Collections.singleton(2);
|
||||
|
||||
// AA AB BB AC BC CC
|
||||
// drop1 (B): AA AC CC
|
||||
// drop2 (C): AA AB BB
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private Genotype makePL(final int ... PLs) {
|
||||
return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestCombineGLs")
|
||||
private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
|
||||
testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.<Integer>emptySet());
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestCombineGLsWithDrops")
|
||||
private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set<Integer> allelesToDrop) {
|
||||
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
|
||||
final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts);
|
||||
|
||||
Assert.assertEquals(combined.getPL(), expected.getPL(),
|
||||
"Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
|
||||
}
|
||||
|
||||
|
||||
// Alleles shared by the context-decomposition tests below: A is the reference,
// C and G are alternates.
// NOTE(review): these are mutable static fields — presumably never reassigned,
// but confirm before tightening to static final.
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
|
||||
|
||||
/**
 * Data provider for testMakeAlleleConditionalContexts.
 *
 * Each case pairs an input VariantContext with the ordered list of biallelic
 * contexts expected from decomposing it, one per alternate allele.
 */
@DataProvider(name = "TestMakeAlleleConditionalContexts")
public Object[][] makeTestMakeAlleleConditionalContexts() {
    List<Object[]> tests = new ArrayList<Object[]>();

    // all contexts share the same source/contig/position; only allele sets differ
    final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
    final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
    final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
    final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
    final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));

    // tri-allelic input PLs and the biallelic PLs expected after decomposition
    final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
    final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
    final Genotype gACcombined = makePL(0, 2, 5);
    final Genotype gAGcombined = makePL(0, 4, 9);
    final Genotype gACdropped = makePL(0, 1, 2);
    final Genotype gAGdropped = makePL(0, 3, 5);

    // biallelic: decomposing a biallelic context is the identity
    tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});

    // tri-allelic: one biallelic context per alt, in the input allele order
    tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())});
    tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())});

    return tests.toArray(new Object[][]{});
}
|
||||
|
||||
|
||||
@Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts")
|
||||
private void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
|
||||
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
|
||||
final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);
|
||||
|
||||
Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size());
|
||||
|
||||
for ( int i = 0; i < biAllelicVCs.size(); i++ ) {
|
||||
final VariantContext actual = biAllelicVCs.get(i);
|
||||
final VariantContext expected = expectedVCs.get(i);
|
||||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
|
||||
|
||||
for ( int j = 0; j < actual.getNSamples(); j++ )
|
||||
Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "ThetaNTests")
|
||||
public Object[][] makeThetaNTests() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Double> log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0);
|
||||
|
||||
for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) {
|
||||
for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) {
|
||||
for ( List<Double> permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) {
|
||||
tests.add(new Object[]{permutations, Math.pow(10, log10pRef)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
 * Verifies applyMultiAllelicPriors: for a list of biallelic AFCalcResults it
 * should (a) leave each result's own prior at log10(1-pRef), (b) assign
 * theta^N-style priors that scale linearly in log space with the result's
 * rank (log10pNonRef * (i+1)), and (c) return the results sorted by
 * decreasing posterior of AF > 0.
 */
@Test(dataProvider = "ThetaNTests")
public void testThetaNTests(final List<Double> log10LAlleles, final double pRef) {
    // biallelic
    final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef});

    final double log10pNonRef = Math.log10(1-pRef);

    // one biallelic A/C result per supplied likelihood, plus the expected
    // rank-scaled prior for position i: log10pNonRef * (i+1)
    final List<AFCalcResult> originalPriors = new LinkedList<AFCalcResult>();
    final List<Double> pNonRefN = new LinkedList<Double>();
    for ( int i = 0; i < log10LAlleles.size(); i++ ) {
        final double log10LAllele1 = log10LAlleles.get(i);
        final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
        final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
        originalPriors.add(result1);
        pNonRefN.add(log10pNonRef*(i+1));
    }

    final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2);
    final List<AFCalcResult> thetaNPriors = calc.applyMultiAllelicPriors(originalPriors);

    double prevPosterior = 0.0;
    for ( int i = 0; i < log10LAlleles.size(); i++ ) {
        final AFCalcResult thetaN = thetaNPriors.get(i);

        // match each theta^N result back to its original by allele identity,
        // since applyMultiAllelicPriors may have reordered the list
        AFCalcResult orig = null;
        for ( final AFCalcResult x : originalPriors )
            if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping()))
                orig = x;

        Assert.assertNotNull(orig, "couldn't find original AFCalc");

        // original prior unchanged; theta^N prior scaled by rank
        Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6);
        Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6);

        // results must come back sorted by decreasing posterior of AF > 0
        Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0());
        prevPosterior = orig.getLog10PosteriorOfAFGT0();
    }
}
|
||||
}
|
||||
|
|
@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "5b751474ad0aef4cdb53f094e605f97c");
|
||||
HCTest(CEUTRIO_BAM, "", "a305107a5ec889152aa2efbe90b249d7");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "60efcd2d2722087e900f6365985d18bf");
|
||||
HCTest(NA12878_BAM, "", "0c2217ec81f19790a6d1f98ebf8cf70d");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "0396c7352ab8ab98b03dca36299a0ddf");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1");
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "2cfb7d830d5a7eb7bc754b5f688a27a5");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8043b0451a4384e678a93600b34afce7");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d511848a46083c0d0b2495f65f162c2e");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -64,21 +64,36 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ea6539e05faf10ffaf76f2d16907c47a");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7e112ea4623617f1f7f8f562f54aa2aa"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c29e61810c056b52a47baae0696931ea"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c642dcd93771f6f084d55de31f180d1b"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Regression test: HaplotypeCaller calling on a reduced-reads BAM over a
 * single chr1 interval. The md5 is the expected-output checksum — presumably
 * regenerated whenever calling behavior legitimately changes; confirm against
 * the test framework.
 */
@Test
public void HCTestReducedBam() {
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
            Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f"));
    executeTest("HC calling on a ReducedRead BAM", spec);
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -114,6 +114,9 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
|
||||
// Fragments of Picard error messages used to recognize BAMs with malformed
// headers that Picard treats as textual SAM files.
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
// Filesystem failure messages that should surface as a NoSpaceOnDevice user
// error rather than a raw stack trace (see checkForMaskedUserErrors).
public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device";
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
|
||||
|
||||
private static void checkForMaskedUserErrors(final Throwable t) {
|
||||
final String message = t.getMessage();
|
||||
if ( message == null )
|
||||
|
|
@ -133,9 +136,9 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
|
||||
|
||||
// disk is full
|
||||
if ( message.contains("No space left on device") )
|
||||
if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") )
|
||||
if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
|
||||
// masked out of memory error
|
||||
|
|
|
|||
|
|
@ -140,6 +140,9 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
|
||||
public boolean nonDeterministicRandomSeed = false;
|
||||
|
||||
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
|
||||
public boolean disableRandomization = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Downsampling Arguments
|
||||
|
|
|
|||
|
|
@ -1,13 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection {
|
|||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
|
||||
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
|
||||
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
|
||||
// multiple contigs
|
||||
filePointer.setIsMonolithic(true);
|
||||
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
|
||||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
|
|
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
|
|
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
|
|
|
|||
|
|
@ -50,10 +50,28 @@ public class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*/
|
||||
private boolean isMonolithic = false;
|
||||
|
||||
/**
|
||||
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
|
||||
*/
|
||||
private Integer contigIndex = null;
|
||||
|
||||
|
||||
public FilePointer( List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
this.isRegionUnmapped = checkUnmappedStatus();
|
||||
validateLocations();
|
||||
|
||||
validateAllLocations();
|
||||
if ( locations.size() > 0 ) {
|
||||
contigIndex = locations.get(0).getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public FilePointer( final GenomeLoc... locations ) {
|
||||
|
|
@ -80,8 +98,9 @@ public class FilePointer {
|
|||
return foundUnmapped;
|
||||
}
|
||||
|
||||
private void validateLocations() {
|
||||
if ( isRegionUnmapped ) {
|
||||
private void validateAllLocations() {
|
||||
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
|
||||
if ( isRegionUnmapped || isMonolithic ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -89,13 +108,22 @@ public class FilePointer {
|
|||
|
||||
for ( GenomeLoc location : locations ) {
|
||||
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
|
||||
throw new ReviewedStingException("File pointers must contain intervals from at most one contig");
|
||||
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
|
||||
}
|
||||
|
||||
previousContigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Validates that a single location may legally be added to this FilePointer.
 *
 * Two invariants are enforced: the location's mapped/unmapped status must
 * match this FilePointer's status, and — unless this FilePointer is unmapped
 * or monolithic — the location must lie on the same contig as the locations
 * already added (tracked via contigIndex).
 *
 * @throws ReviewedStingException if either invariant is violated
 */
private void validateLocation( GenomeLoc location ) {
    if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
        throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
    }
    // contigIndex is null until the first location is added, so the first
    // location always passes the contig check
    if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
        throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
    }
}
|
||||
|
||||
/**
|
||||
* Returns an immutable view of this FilePointer's file spans
|
||||
*
|
||||
|
|
@ -123,6 +151,29 @@ public class FilePointer {
|
|||
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*
|
||||
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
|
||||
*/
|
||||
public boolean isMonolithic() {
|
||||
return isMonolithic;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
|
||||
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
|
||||
* FP may contain intervals from more than one contig.
|
||||
*
|
||||
* @param isMonolithic set this FP's monolithic status to this value
|
||||
*/
|
||||
public void setIsMonolithic( boolean isMonolithic ) {
|
||||
this.isMonolithic = isMonolithic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if(!(other instanceof FilePointer))
|
||||
|
|
@ -151,15 +202,12 @@ public class FilePointer {
|
|||
}
|
||||
|
||||
public void addLocation(final GenomeLoc location) {
|
||||
this.locations.add(location);
|
||||
checkUnmappedStatus();
|
||||
validateLocations();
|
||||
}
|
||||
validateLocation(location);
|
||||
|
||||
public void addLocations( final List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
checkUnmappedStatus();
|
||||
validateLocations();
|
||||
this.locations.add(location);
|
||||
if ( contigIndex == null ) {
|
||||
contigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
|
||||
|
|
|
|||
|
|
@ -215,19 +215,29 @@ public class ReadShard extends Shard {
|
|||
int start = Integer.MAX_VALUE;
|
||||
int stop = Integer.MIN_VALUE;
|
||||
String contig = null;
|
||||
boolean foundMapped = false;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
if ( contig != null && ! read.getReferenceName().equals(contig) )
|
||||
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
|
||||
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
|
||||
contig = read.getReferenceName();
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
|
||||
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
|
||||
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
|
||||
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
|
||||
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
|
||||
// with unmapped mates.
|
||||
if ( ! read.getReadUnmappedFlag() ) {
|
||||
foundMapped = true;
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
}
|
||||
}
|
||||
|
||||
assert contig != null;
|
||||
|
||||
if ( contig.equals("*") ) // all reads are unmapped
|
||||
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
|
||||
return GenomeLoc.UNMAPPED;
|
||||
else
|
||||
return parser.createGenomeLoc(contig, start, stop);
|
||||
|
|
|
|||
|
|
@ -1008,6 +1008,12 @@ public class SAMDataSource {
|
|||
} catch ( SAMFormatException e ) {
|
||||
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
|
||||
}
|
||||
// Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
|
||||
// Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
|
||||
// just in case we want to change this behavior later.
|
||||
catch ( RuntimeException e ) {
|
||||
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
|
||||
}
|
||||
reader.setSAMRecordFactory(factory);
|
||||
reader.enableFileSource(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
|
|
|
|||
|
|
@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Type;
|
||||
import java.util.List;
|
||||
|
|
@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool<RMDTrack,LocationAwareS
|
|||
} else {
|
||||
return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator());
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found");
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Unable to create iterator for rod named " + fileDescriptor.getName(),e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.filters;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Filter out reads with low mapping qualities.
|
||||
* Filter out reads whose mate maps to a different contig.
|
||||
*
|
||||
* @author ebanks
|
||||
* @version 0.1
|
||||
|
|
|
|||
|
|
@ -186,7 +186,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
|
|||
// that way we don't assume it's a specific type
|
||||
final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file);
|
||||
if ( fd == null )
|
||||
throw new ReviewedStingException("Unexpectedly couldn't find valid codec for temporary output file " + file);
|
||||
throw new UserException.LocalParallelizationProblem(file);
|
||||
|
||||
final FeatureCodec<VariantContext> codec = fd.getCodec();
|
||||
final AbstractFeatureReader<VariantContext> source =
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
|
|||
if ( perReadAlleleLikelihoodMap.size() == 0 )
|
||||
return null;
|
||||
|
||||
for ( Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perReadAlleleLikelihoodMap.entrySet() )
|
||||
depth += sample.getValue().getNumberOfStoredElements();
|
||||
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( alleleCounts.containsKey(p.getBase()) )
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
|
||||
}
|
||||
|
||||
// we need to add counts in the correct order
|
||||
|
|
@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
alleleCounts.put(allele, 0);
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
continue; // read is non-informative
|
||||
if (!vc.getAlleles().contains(a))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a,alleleCounts.get(a)+1);
|
||||
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
}
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
}
|
||||
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
|
||||
// either SNP with no alignment context, or indels: per-read likelihood map needed
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
|
||||
return pValueForBestTable(table, null);
|
||||
}
|
||||
else
|
||||
|
|
@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
* allele2 # #
|
||||
* @return a 2x2 contingency table
|
||||
*/
|
||||
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
|
||||
final Allele ref, final Allele alt) {
|
||||
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
final Allele ref = vc.getReference();
|
||||
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
|
||||
int[][] table = new int[2][2];
|
||||
|
||||
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
if ( el.getKey().isReducedRead() ) // ignore reduced reads
|
||||
continue;
|
||||
final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true);
|
||||
final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true);
|
||||
|
||||
|
|
@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
int row = matchesRef ? 0 : 1;
|
||||
int column = isFW ? 0 : 1;
|
||||
|
||||
table[row][column]++;
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (PileupElement p : sample.getValue().getBasePileup()) {
|
||||
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
|
||||
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
|
||||
continue;
|
||||
|
||||
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
|
||||
|
|
@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
int row = matchesRef ? 0 : 1;
|
||||
int column = isFW ? 0 : 1;
|
||||
|
||||
table[row][column]++;
|
||||
table[row][column] += p.getRepresentativeCount();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -30,6 +29,7 @@ import java.util.Map;
|
|||
*/
|
||||
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
|
||||
static final boolean DEBUG = false;
|
||||
private boolean useDithering = true;
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
if (refQuals.isEmpty() && altQuals.isEmpty())
|
||||
return null;
|
||||
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
|
||||
for (final Double qual : altQuals) {
|
||||
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
|
||||
}
|
||||
|
|
@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
|
||||
* engine randomization is turned off, and if so does not dither.
|
||||
* @param walker
|
||||
* @param toolkit
|
||||
* @param headerLines
|
||||
*/
|
||||
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
useDithering = ! toolkit.getArguments().disableRandomization;
|
||||
}
|
||||
}
|
||||
|
|
@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
int numReadGroups = 0;
|
||||
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
|
||||
numReadGroups += header.getReadGroups().size();
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups);
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG);
|
||||
|
||||
recalibrationEngine = initializeRecalibrationEngine();
|
||||
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
|
||||
|
|
@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
}
|
||||
}
|
||||
|
||||
private boolean readHasBeenSkipped(GATKSAMRecord read) {
|
||||
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
|
||||
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
|
||||
}
|
||||
|
||||
private boolean isLowQualityBase(GATKSAMRecord read, int offset) {
|
||||
return read.getBaseQualities()[offset] < minimumQToUse;
|
||||
private boolean isLowQualityBase( final PileupElement p ) {
|
||||
return p.getQual() < minimumQToUse;
|
||||
}
|
||||
|
||||
private boolean readNotSeen(GATKSAMRecord read) {
|
||||
private boolean readNotSeen( final GATKSAMRecord read ) {
|
||||
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
|
||||
}
|
||||
|
||||
|
|
@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
final int offset = p.getOffset();
|
||||
|
||||
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
|
||||
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
|
||||
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
|
||||
continue;
|
||||
|
||||
if (readNotSeen(read)) {
|
||||
|
|
|
|||
|
|
@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
|
||||
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
|
||||
|
||||
public File existingRecalibrationReport = null;
|
||||
|
||||
public GATKReportTable generateReportTable(final String covariateNames) {
|
||||
|
|
|
|||
|
|
@ -1,231 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate a genotype given the priors and a pile
|
||||
* of bases and quality scores
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||
|
||||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT
|
||||
}
|
||||
|
||||
protected int N;
|
||||
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
|
||||
this.N = N;
|
||||
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
|
||||
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper class that compares two likelihoods associated with two alleles
|
||||
*/
|
||||
protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
|
||||
public double sum = 0.0;
|
||||
public Allele allele;
|
||||
|
||||
public LikelihoodSum(Allele allele) { this.allele = allele; }
|
||||
|
||||
public int compareTo(LikelihoodSum other) {
|
||||
final double diff = sum - other.sum;
|
||||
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unpack GenotypesContext into arraylist of doubel values
|
||||
* @param GLs Input genotype context
|
||||
* @return ArrayList of doubles corresponding to GL vectors
|
||||
*/
|
||||
protected static ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
|
||||
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
|
||||
genotypeLikelihoods.add(gls);
|
||||
}
|
||||
}
|
||||
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store likelihoods results
|
||||
* @return the alleles used for genotyping
|
||||
*/
|
||||
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// protected classes used to store exact model matrix columns
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
|
||||
|
||||
// a wrapper around the int array so that we can make it hashable
|
||||
protected static final class ExactACcounts {
|
||||
|
||||
protected final int[] counts;
|
||||
private int hashcode = -1;
|
||||
|
||||
public ExactACcounts(final int[] counts) {
|
||||
this.counts = counts;
|
||||
}
|
||||
|
||||
public int[] getCounts() {
|
||||
return counts;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if ( hashcode == -1 )
|
||||
hashcode = Arrays.hashCode(counts);
|
||||
return hashcode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(counts[0]);
|
||||
for ( int i = 1; i < counts.length; i++ ) {
|
||||
sb.append("/");
|
||||
sb.append(counts[i]);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
// This class represents a column in the Exact AC calculation matrix
|
||||
protected static final class ExactACset {
|
||||
|
||||
// the counts of the various alternate alleles which this column represents
|
||||
final ExactACcounts ACcounts;
|
||||
|
||||
// the column of the matrix
|
||||
final double[] log10Likelihoods;
|
||||
|
||||
int sum = -1;
|
||||
|
||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||
this.ACcounts = ACcounts;
|
||||
log10Likelihoods = new double[size];
|
||||
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// sum of all the non-reference alleles
|
||||
public int getACsum() {
|
||||
if ( sum == -1 ) {
|
||||
sum = 0;
|
||||
for ( int count : ACcounts.getCounts() )
|
||||
sum += count;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
protected static final class MaxLikelihoodSeen {
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
ExactACcounts ACs = null;
|
||||
|
||||
public MaxLikelihoodSeen() {}
|
||||
|
||||
public void update(final double maxLog10L, final ExactACcounts ACs) {
|
||||
this.maxLog10L = maxLog10L;
|
||||
this.ACs = ACs;
|
||||
}
|
||||
|
||||
// returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
|
||||
public boolean isLowerAC(final ExactACcounts otherACs) {
|
||||
final int[] myACcounts = this.ACs.getCounts();
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
|
||||
for ( int i = 0; i < myACcounts.length; i++ ) {
|
||||
if ( myACcounts[i] > otherACcounts[i] )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,150 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
|
||||
private double log10MLE;
|
||||
private double log10MAP;
|
||||
private final int[] alleleCountsOfMLE;
|
||||
private final int[] alleleCountsOfMAP;
|
||||
|
||||
// The posteriors seen, not including that of AF=0
|
||||
private static final int POSTERIORS_CACHE_SIZE = 5000;
|
||||
private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
|
||||
private int currentPosteriorsCacheIndex = 0;
|
||||
private Double log10PosteriorMatrixSum = null;
|
||||
|
||||
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
|
||||
private double log10LikelihoodOfAFzero;
|
||||
private double log10PosteriorOfAFzero;
|
||||
|
||||
|
||||
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
|
||||
alleleCountsOfMLE = new int[maxAltAlleles];
|
||||
alleleCountsOfMAP = new int[maxAltAlleles];
|
||||
reset();
|
||||
}
|
||||
|
||||
public double getLog10MLE() {
|
||||
return log10MLE;
|
||||
}
|
||||
|
||||
public double getLog10MAP() {
|
||||
return log10MAP;
|
||||
}
|
||||
|
||||
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
|
||||
if ( log10PosteriorMatrixSum == null ) {
|
||||
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
|
||||
}
|
||||
return log10PosteriorMatrixSum;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
|
||||
alleleCountsOfMLE[i] = 0;
|
||||
alleleCountsOfMAP[i] = 0;
|
||||
}
|
||||
currentPosteriorsCacheIndex = 0;
|
||||
log10PosteriorMatrixSum = null;
|
||||
}
|
||||
|
||||
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMLE[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToPosteriorsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MAP ) {
|
||||
log10MAP = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMAP[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
private void addToPosteriorsCache(final double log10LofK) {
|
||||
// add to the cache
|
||||
log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;
|
||||
|
||||
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
|
||||
if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
|
||||
final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
|
||||
log10PosteriorMatrixValues[0] = temporarySum;
|
||||
currentPosteriorsCacheIndex = 1;
|
||||
}
|
||||
}
|
||||
|
||||
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMLE, 0);
|
||||
}
|
||||
}
|
||||
|
||||
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,481 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
|
||||
// private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
|
||||
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
|
||||
}
|
||||
|
||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != 0 )
|
||||
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Multi-allelic implementation.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
|
||||
|
||||
// add AC=0 to the queue
|
||||
int[] zeroCounts = new int[numAlternateAlleles];
|
||||
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
|
||||
maxLikelihoodSeen.update(log10LofKs, set.ACcounts);
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class DependentSet {
|
||||
public final int[] ACcounts;
|
||||
public final int PLindex;
|
||||
|
||||
public DependentSet(final int[] ACcounts, final int PLindex) {
|
||||
this.ACcounts = ACcounts;
|
||||
this.PLindex = PLindex;
|
||||
}
|
||||
}
|
||||
|
||||
private static double calculateAlleleCountConformation(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final MaxLikelihoodSeen maxLikelihoodSeen,
|
||||
final int numChr,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
|
||||
|
||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
final int ACwiggle = numChr - set.getACsum();
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||
|
||||
// add conformations for the k+1 case
|
||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// to get to this conformation, a sample would need to be AB (remember that ref=0)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
|
||||
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
if ( ACwiggle > 1 ) {
|
||||
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
|
||||
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
|
||||
|
||||
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
|
||||
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele_i]++;
|
||||
ACcountsClone[allele_j]++;
|
||||
|
||||
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
|
||||
if ( allele_i == allele_j )
|
||||
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
else
|
||||
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
}
|
||||
}
|
||||
|
||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||
for ( DependentSet dependent : differentAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
for ( DependentSet dependent : sameAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also pushes its value to the given callingSetIndex.
|
||||
private static void updateACset(final int[] newSetCounts,
|
||||
final int numChr,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||
indexesToACset.put(index, set);
|
||||
ACqueue.add(set);
|
||||
}
|
||||
|
||||
// push data from the dependency to the new set
|
||||
//if ( DEBUG )
|
||||
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
|
||||
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
private static void computeLofK(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
||||
// special case for k = 0 over all k
|
||||
if ( totalK == 0 ) {
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
|
||||
|
||||
final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
// if we got here, then k > 0 for at least one k.
|
||||
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
|
||||
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK < 2*j-1 ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
|
||||
}
|
||||
|
||||
double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// update the MLE if necessary
|
||||
result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for ( final int ACcount : set.ACcounts.getCounts() ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
|
||||
}
|
||||
|
||||
private static void pushData(final ExactACset targetSet,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final int totalK = targetSet.getACsum();
|
||||
|
||||
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double conformationValue =
|
||||
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
|
||||
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
// AA: (2j - totalK) * (2j - totalK - 1)
|
||||
// AB: 2k_b * (2j - totalK)
|
||||
// AC: 2k_c * (2j - totalK)
|
||||
// BB: k_b * (k_b - 1)
|
||||
// BC: 2 * k_b * k_c
|
||||
// CC: k_c * (k_c - 1)
|
||||
|
||||
// find the 2 alleles that are represented by this PL index
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
|
||||
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
|
||||
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
|
||||
|
||||
// the AX het case
|
||||
if ( alleles.alleleIndex1 == 0 )
|
||||
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
final int k_i = ACcounts[alleles.alleleIndex1-1];
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[alleles.alleleIndex2-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
return coeff;
|
||||
}
|
||||
|
||||
public GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy) {
|
||||
return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
|
||||
* for the exact model calculation
|
||||
*/
|
||||
/*
|
||||
private final static class ExactACCache {
|
||||
double[] kMinus2, kMinus1, kMinus0;
|
||||
|
||||
private final static double[] create(int n) {
|
||||
return new double[n];
|
||||
}
|
||||
|
||||
public ExactACCache(int n) {
|
||||
kMinus2 = create(n);
|
||||
kMinus1 = create(n);
|
||||
kMinus0 = create(n);
|
||||
}
|
||||
|
||||
final public void rotate() {
|
||||
double[] tmp = kMinus2;
|
||||
kMinus2 = kMinus1;
|
||||
kMinus1 = kMinus0;
|
||||
kMinus0 = tmp;
|
||||
}
|
||||
|
||||
final public double[] getkMinus2() {
|
||||
return kMinus2;
|
||||
}
|
||||
|
||||
final public double[] getkMinus1() {
|
||||
return kMinus1;
|
||||
}
|
||||
|
||||
final public double[] getkMinus0() {
|
||||
return kMinus0;
|
||||
}
|
||||
}
|
||||
|
||||
public int linearExact(GenotypesContext GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[][] log10AlleleFrequencyLikelihoods,
|
||||
double[][] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
boolean done = false;
|
||||
int lastK = -1;
|
||||
|
||||
for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
final double[] kMinus0 = logY.getkMinus0();
|
||||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
|
||||
log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
log10Max = approximateLog10SumLog10(aa, ab);
|
||||
}
|
||||
|
||||
// finally, update the L(j,k) value
|
||||
kMinus0[j] = log10Max - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
// update the posteriors vector
|
||||
final double log10LofK = kMinus0[numSamples];
|
||||
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
|
||||
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
done = true;
|
||||
}
|
||||
|
||||
logY.rotate();
|
||||
}
|
||||
|
||||
return lastK;
|
||||
}
|
||||
|
||||
final static double approximateLog10SumLog10(double a, double b, double c) {
|
||||
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
||||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
|
@ -41,7 +42,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT;
|
||||
|
||||
/**
|
||||
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
|
||||
|
|
@ -75,10 +76,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
|
||||
public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
|
||||
|
|
@ -160,7 +157,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
|
||||
*/
|
||||
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
|
||||
int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
|
||||
public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false)
|
||||
|
|
@ -186,7 +183,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
@ -212,7 +208,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
|
|
@ -224,6 +220,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
uac.exactCallsLog = exactCallsLog;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
|
@ -239,8 +236,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
|
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
|
||||
}
|
||||
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
|
||||
if (UAC.AFmodel != AFCalc.Model.POOL)
|
||||
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,6 +34,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
|
|
@ -78,11 +81,7 @@ public class UnifiedGenotyperEngine {
|
|||
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
|
||||
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
|
||||
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
|
||||
private ThreadLocal<AFCalc> afcm = new ThreadLocal<AFCalc>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private final double[] log10AlleleFrequencyPriorsSNPs;
|
||||
|
|
@ -105,8 +104,6 @@ public class UnifiedGenotyperEngine {
|
|||
private final GenomeLocParser genomeLocParser;
|
||||
private final boolean BAQEnabledOnCMDLine;
|
||||
|
||||
protected static final double SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL;
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Public interface functions
|
||||
|
|
@ -355,11 +352,8 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( afcm.get() == null ) {
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
|
||||
posteriorsArray.set(new double[2]);
|
||||
afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
|
||||
}
|
||||
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( vc.getNSamples() == 0 ) {
|
||||
|
|
@ -370,8 +364,7 @@ public class UnifiedGenotyperEngine {
|
|||
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
|
||||
}
|
||||
|
||||
AFresult.reset();
|
||||
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||
boolean bestGuessIsRef = true;
|
||||
|
|
@ -380,50 +373,43 @@ public class UnifiedGenotyperEngine {
|
|||
final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
|
||||
final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
|
||||
myAlleles.add(vc.getReference());
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
final Allele alternateAllele = vc.getAlternateAllele(i);
|
||||
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
|
||||
// the genotyping model may have stripped it out
|
||||
if ( indexOfAllele == -1 )
|
||||
for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) {
|
||||
final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
|
||||
if ( alternateAllele.isReference() )
|
||||
continue;
|
||||
|
||||
final int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1];
|
||||
// we are non-ref if the probability of being non-ref > the emit confidence.
|
||||
// the emit confidence is phred-scaled, say 30 => 10^-3.
|
||||
// the posterior AF > 0 is log10: -5 => 10^-5
|
||||
// we are non-ref if 10^-5 < 10^-3 => -5 < -3
|
||||
final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);
|
||||
|
||||
// if the most likely AC is not 0, then this is a good alternate allele to use
|
||||
if ( indexOfBestAC != 0 ) {
|
||||
if ( isNonRef ) {
|
||||
myAlleles.add(alternateAllele);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
|
||||
bestGuessIsRef = false;
|
||||
}
|
||||
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
|
||||
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
myAlleles.add(alternateAllele);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]);
|
||||
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
|
||||
}
|
||||
}
|
||||
|
||||
// calculate p(f>0):
|
||||
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
|
||||
final double PofF = 1.0 - normalizedPosteriors[0];
|
||||
final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
}
|
||||
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
|
||||
final double phredScaledConfidence =
|
||||
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? -10 * AFresult.getLog10PosteriorOfAFEq0()
|
||||
: -10 * AFresult.getLog10PosteriorOfAFGT0());
|
||||
|
||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
|
||||
}
|
||||
|
||||
// start constructing the resulting VC
|
||||
|
|
@ -439,7 +425,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
final HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
|
@ -470,27 +456,25 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the overall lod
|
||||
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
|
||||
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
|
||||
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
|
||||
|
||||
List<Allele> allAllelesToUse = builder.make().getAlleles();
|
||||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
|
||||
AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
|
||||
double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
|
||||
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
|
||||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
|
||||
AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
|
||||
double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
|
||||
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
|
||||
|
||||
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
|
||||
|
|
@ -513,10 +497,10 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext )
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
|
||||
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
|
||||
if ( annotationEngine != null && !limitedContext ) {
|
||||
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
|
||||
|
|
@ -524,13 +508,7 @@ public class UnifiedGenotyperEngine {
|
|||
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||
}
|
||||
|
||||
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
|
||||
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
|
||||
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
return MathUtils.normalizeFromLog10(normalizedPosteriors);
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
|
||||
}
|
||||
|
||||
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
|
@ -633,8 +611,6 @@ public class UnifiedGenotyperEngine {
|
|||
AFline.append(i + "/" + N + "\t");
|
||||
AFline.append(String.format("%.2f\t", ((float)i)/N));
|
||||
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
|
||||
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE()));
|
||||
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP()));
|
||||
verboseWriter.println(AFline.toString());
|
||||
}
|
||||
|
||||
|
|
@ -700,7 +676,7 @@ public class UnifiedGenotyperEngine {
|
|||
return models;
|
||||
}
|
||||
|
||||
protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
|
||||
|
||||
double sum = 0.0;
|
||||
|
||||
|
|
@ -754,34 +730,6 @@ public class UnifiedGenotyperEngine {
|
|||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
|
||||
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
|
||||
|
||||
// user-specified name
|
||||
String afModelName = UAC.AFmodel.name();
|
||||
|
||||
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
|
||||
afModelName = GPSTRING + afModelName;
|
||||
|
||||
for (int i = 0; i < afClasses.size(); i++) {
|
||||
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
|
||||
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
|
||||
if (afModelName.equalsIgnoreCase(key)) {
|
||||
try {
|
||||
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
|
||||
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
|
||||
|
||||
return (AlleleFrequencyCalculationModel)c.newInstance(args);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
if ( tracker == null || ref == null || logger == null )
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
|
||||
*
|
||||
*/
|
||||
public abstract class AFCalc implements Cloneable {
|
||||
private final static Logger defaultLogger = Logger.getLogger(AFCalc.class);
|
||||
|
||||
protected final int nSamples;
|
||||
protected final int maxAlternateAllelesToGenotype;
|
||||
protected final int maxAlternateAllelesForIndels;
|
||||
|
||||
protected Logger logger = defaultLogger;
|
||||
|
||||
private SimpleTimer callTimer = new SimpleTimer();
|
||||
private PrintStream callReport = null;
|
||||
private final AFCalcResultTracker resultTracker;
|
||||
|
||||
protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
|
||||
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
|
||||
if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy);
|
||||
|
||||
this.nSamples = nSamples;
|
||||
this.maxAlternateAllelesToGenotype = maxAltAlleles;
|
||||
this.maxAlternateAllelesForIndels = maxAltAllelesForIndels;
|
||||
this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
|
||||
}
|
||||
|
||||
public void enableProcessLog(final File exactCallsLog) {
|
||||
initializeOutputFile(exactCallsLog);
|
||||
}
|
||||
|
||||
public void setLogger(Logger logger) {
|
||||
this.logger = logger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
|
||||
*
|
||||
* @param vc the VariantContext holding the alleles and sample information
|
||||
* @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
|
||||
* @return result (for programming convenience)
|
||||
*/
|
||||
public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
|
||||
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
|
||||
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
|
||||
if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
|
||||
|
||||
// reset the result, so we can store our new result there
|
||||
resultTracker.reset();
|
||||
|
||||
final VariantContext vcWorking = reduceScope(vc);
|
||||
|
||||
callTimer.start();
|
||||
final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors);
|
||||
final long nanoTime = callTimer.getElapsedTimeNano();
|
||||
|
||||
if ( callReport != null )
|
||||
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
|
||||
resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
|
||||
return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Abstract methods that should be implemented by concrete implementations
|
||||
// to actually calculate the AF
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
|
||||
*
|
||||
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
|
||||
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
|
||||
* may decide to only genotype the best 2 of them.
|
||||
*
|
||||
* @param vc the initial VC provided by the caller to this AFcalculation
|
||||
* @return a potentially simpler VC that's more tractable to genotype
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
@Ensures("result != null")
|
||||
protected abstract VariantContext reduceScope(final VariantContext vc);
|
||||
|
||||
/**
|
||||
* Actually carry out the log10PNonRef calculation on vc, storing results in results
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @return a AFCalcResult object describing the results of this calculation
|
||||
*/
|
||||
// TODO -- add consistent requires among args
|
||||
protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
public abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// accessors
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
public int getMaxAltAlleles() {
|
||||
return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels);
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Print information about the call to the calls log
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
private void initializeOutputFile(final File outputFile) {
|
||||
try {
|
||||
if (outputFile != null) {
|
||||
callReport = new PrintStream( new FileOutputStream(outputFile) );
|
||||
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void printCallInfo(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final long runtimeNano,
|
||||
final double log10PosteriorOfAFzero) {
|
||||
printCallElement(vc, "type", "ignore", vc.getType());
|
||||
|
||||
int allelei = 0;
|
||||
for ( final Allele a : vc.getAlleles() )
|
||||
printCallElement(vc, "allele", allelei++, a.getDisplayString());
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
|
||||
|
||||
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
|
||||
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
|
||||
|
||||
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
|
||||
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
|
||||
|
||||
callReport.flush();
|
||||
}
|
||||
|
||||
private void printCallElement(final VariantContext vc,
|
||||
final Object variable,
|
||||
final Object key,
|
||||
final Object value) {
|
||||
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
|
||||
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
|
||||
}
|
||||
|
||||
public AFCalcResultTracker getResultTracker() {
|
||||
return resultTracker;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,226 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory to make AFCalculations
|
||||
*/
|
||||
public class AFCalcFactory {
|
||||
/**
|
||||
* Enumeration of usable AF calculation, their constraints (i.e. ploidy).
|
||||
*
|
||||
* Note that the order these occur in the enum is the order of preference, so
|
||||
* the first value is taken over the second when multiple calculations satisfy
|
||||
* the needs of the request (i.e., considering ploidy).
|
||||
*/
|
||||
public enum Calculation {
|
||||
/** The default implementation */
|
||||
EXACT(ReferenceDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** reference implementation of multi-allelic EXACT model */
|
||||
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** expt. implementation */
|
||||
@Deprecated
|
||||
EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** expt. implementation -- for testing only */
|
||||
EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** original biallelic exact model, for testing only */
|
||||
EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),
|
||||
|
||||
/** implementation that supports any sample ploidy */
|
||||
EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1);
|
||||
|
||||
/**
|
||||
* Must be a name because we look this up dynamically
|
||||
*/
|
||||
public final String className;
|
||||
public final int maxAltAlleles;
|
||||
public final int requiredPloidy;
|
||||
|
||||
private Calculation(final String className, final int requiredPloidy, final int maxAltAlleles) {
|
||||
this.className = className;
|
||||
this.requiredPloidy = requiredPloidy;
|
||||
this.maxAltAlleles = maxAltAlleles;
|
||||
}
|
||||
|
||||
private Calculation(final Class clazz, final int requiredPloidy, final int maxAltAlleles) {
|
||||
this(clazz.getSimpleName(), requiredPloidy, maxAltAlleles);
|
||||
}
|
||||
|
||||
public boolean usableForParams(final int requestedPloidy, final int requestedMaxAltAlleles) {
|
||||
return (requiredPloidy == -1 || requiredPloidy == requestedPloidy)
|
||||
&& (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, Class<? extends AFCalc>> afClasses;
|
||||
static {
|
||||
afClasses = new PluginManager<AFCalc>(AFCalc.class).getPluginsByName();
|
||||
}
|
||||
|
||||
private AFCalcFactory() {
|
||||
|
||||
}
|
||||
|
||||
private static Class<? extends AFCalc> getClassByName(final String name) {
|
||||
for ( final Class<? extends AFCalc> clazz : afClasses.values() ) {
|
||||
if ( clazz.getSimpleName().contains(name) ) {
|
||||
return clazz;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AFCalc based on the parameters in the UAC
|
||||
*
|
||||
* @param UAC the UnifiedArgumentCollection containing the command-line parameters for the caller
|
||||
* @param nSamples the number of samples we will be using
|
||||
* @param logger an optional (can be null) logger to override the default in the model
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC,
|
||||
final int nSamples,
|
||||
final Logger logger) {
|
||||
final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS);
|
||||
if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) {
|
||||
logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");
|
||||
final List<Calculation> supportingCalculations = new LinkedList<Calculation>();
|
||||
for ( final Calculation calc : Calculation.values() ) {
|
||||
if ( calc.usableForParams(UAC.samplePloidy, maxAltAlleles) )
|
||||
supportingCalculations.add(calc);
|
||||
}
|
||||
|
||||
if ( supportingCalculations.isEmpty() )
|
||||
throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.samplePloidy + " and max alt alleles " + maxAltAlleles);
|
||||
else if ( supportingCalculations.size() > 1 )
|
||||
logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily");
|
||||
else
|
||||
UAC.AFmodel = supportingCalculations.get(0);
|
||||
logger.info("Selecting model " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy);
|
||||
|
||||
if ( logger != null ) calc.setLogger(logger);
|
||||
if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);
|
||||
|
||||
return calc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AFCalc, choosing the best implementation based on the given parameters, assuming
|
||||
* that we will only be requesting bi-allelic variants to diploid genotypes
|
||||
*
|
||||
* @param nSamples the number of samples we'll be using
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final int nSamples) {
|
||||
return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AFCalc that supports maxAltAlleles for all variants and diploid genotypes
|
||||
*
|
||||
* @param calc the calculation we'd like to use
|
||||
* @param nSamples the number of samples we'll be using
|
||||
* @param maxAltAlleles the max. alt alleles for both SNPs and indels
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) {
|
||||
return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AFCalc, choosing the best implementation based on the given parameters
|
||||
*
|
||||
* @param nSamples the number of samples we'll be using
|
||||
* @param maxAltAlleles the max. alt alleles to consider for SNPs
|
||||
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
|
||||
* @param ploidy the sample ploidy. Must be consistent with the calc
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
|
||||
return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Choose the best calculation for nSamples and ploidy
|
||||
*
|
||||
* @param nSamples
|
||||
* @param ploidy
|
||||
* @param maxAltAlleles
|
||||
* @return
|
||||
*/
|
||||
private static Calculation chooseBestCalculation(final int nSamples, final int ploidy, final int maxAltAlleles) {
|
||||
for ( final Calculation calc : Calculation.values() ) {
|
||||
if ( calc.usableForParams(ploidy, maxAltAlleles) ) {
|
||||
return calc;
|
||||
}
|
||||
}
|
||||
|
||||
throw new IllegalStateException("no calculation found that supports nSamples " + nSamples + " ploidy " + ploidy + " and maxAltAlleles " + maxAltAlleles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new AFCalc
|
||||
*
|
||||
* @param calc the calculation to use
|
||||
* @param nSamples the number of samples we'll be using
|
||||
* @param maxAltAlleles the max. alt alleles to consider for SNPs
|
||||
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
|
||||
* @param ploidy the sample ploidy. Must be consistent with the calc
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null");
|
||||
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
|
||||
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
|
||||
if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy);
|
||||
|
||||
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
|
||||
if ( ! calc.usableForParams(ploidy, maxAlt) )
|
||||
throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy);
|
||||
|
||||
final Class<? extends AFCalc> afClass = getClassByName(calc.className);
|
||||
if ( afClass == null )
|
||||
throw new IllegalArgumentException("Unexpected AFCalc " + calc);
|
||||
|
||||
try {
|
||||
Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy};
|
||||
Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class);
|
||||
return (AFCalc)c.newInstance(args);
|
||||
} catch (Exception e) {
|
||||
throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e);
|
||||
}
|
||||
}
|
||||
|
||||
protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
final List<AFCalc> AFCalcs = new LinkedList<AFCalc>();
|
||||
|
||||
for ( final Calculation calc : calcs )
|
||||
AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy));
|
||||
|
||||
return AFCalcs;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,334 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Describes the results of the AFCalc
|
||||
*
|
||||
* Only the bare essentials are represented here, as all AFCalc models must return meaningful results for
|
||||
* all of these fields.
|
||||
*
|
||||
* Note that all of the values -- i.e. priors -- are checked now that they are meaningful, which means
|
||||
* that users of this code can rely on the values coming out of these functions.
|
||||
*/
|
||||
public class AFCalcResult {
|
||||
private final static int AF0 = 0;
|
||||
private final static int AF1p = 1;
|
||||
private final static int LOG_10_ARRAY_SIZES = 2;
|
||||
|
||||
private final double[] log10LikelihoodsOfAC;
|
||||
private final double[] log10PriorsOfAC;
|
||||
private final double[] log10PosteriorsOfAC;
|
||||
|
||||
private final Map<Allele, Double> log10pNonRefByAllele;
|
||||
|
||||
/**
|
||||
* The AC values for all ALT alleles at the MLE
|
||||
*/
|
||||
private final int[] alleleCountsOfMLE;
|
||||
|
||||
int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
|
||||
* Create a results object capability of storing results for calls with up to maxAltAlleles
|
||||
*/
|
||||
public AFCalcResult(final int[] alleleCountsOfMLE,
|
||||
final int nEvaluations,
|
||||
final List<Allele> allelesUsedInGenotyping,
|
||||
final double[] log10LikelihoodsOfAC,
|
||||
final double[] log10PriorsOfAC,
|
||||
final Map<Allele, Double> log10pNonRefByAllele) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping);
|
||||
if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null");
|
||||
if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size());
|
||||
if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations);
|
||||
if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2");
|
||||
if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2");
|
||||
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
|
||||
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
|
||||
if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
|
||||
|
||||
this.alleleCountsOfMLE = alleleCountsOfMLE;
|
||||
this.nEvaluations = nEvaluations;
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
|
||||
this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES);
|
||||
this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES);
|
||||
this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC);
|
||||
this.log10pNonRefByAllele = new HashMap<Allele, Double>(log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new AFCalcResult with a new prior probability
|
||||
*
|
||||
* @param log10PriorsOfAC
|
||||
* @return
|
||||
*/
|
||||
public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) {
|
||||
return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*
|
||||
* @return a vector with allele counts, not all of which may be meaningful
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the AC of allele a la #getAlleleCountsOfMLE
|
||||
*
|
||||
* @param allele the allele whose AC we want to know. Error if its not in allelesUsedInGenotyping
|
||||
* @throws IllegalStateException if allele isn't in allelesUsedInGenotyping
|
||||
* @return the AC of allele
|
||||
*/
|
||||
public int getAlleleCountAtMLE(final Allele allele) {
|
||||
return getAlleleCountsOfMLE()[altAlleleIndex(allele)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
|
||||
*
|
||||
* @return the number of evaluations required to produce the answer for this AF calculation
|
||||
*/
|
||||
public int getnEvaluations() {
|
||||
return nEvaluations;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of alleles actually used in genotyping.
|
||||
*
|
||||
* Due to computational / implementation constraints this may be smaller than
|
||||
* the actual list of alleles requested
|
||||
*
|
||||
* @return a non-empty list of alleles used during genotyping
|
||||
*/
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
public List<Allele> getAllelesUsedInGenotyping() {
|
||||
return allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 normalized -- across all ACs -- posterior probability of AC == 0 for all alleles
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
public double getLog10PosteriorOfAFEq0() {
|
||||
return log10PosteriorsOfAC[AF0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 normalized -- across all ACs -- posterior probability of AC > 0 for any alleles
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
public double getLog10PosteriorOfAFGT0() {
|
||||
return log10PosteriorsOfAC[AF1p];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 unnormalized -- across all ACs -- likelihood of AC == 0 for all alleles
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
public double getLog10LikelihoodOfAFEq0() {
|
||||
return log10LikelihoodsOfAC[AF0];
|
||||
}
|
||||
|
||||
/**
 * Get the log10 unnormalized -- across all ACs -- likelihood of AC > 0 for any alleles
 *
 * @return the log10 likelihood stored at the AF1p (AC >= 1) index
 */
@Ensures({"goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFGT0() {
    return log10LikelihoodsOfAC[AF1p];
}
|
||||
|
||||
/**
 * Get the log10 unnormalized -- across all ACs -- prior probability of AC == 0 for all alleles
 *
 * @return the log10 prior stored at the AF0 index
 */
@Ensures({"goodLog10Probability(result)"})
public double getLog10PriorOfAFEq0() {
    return log10PriorsOfAC[AF0];
}
|
||||
|
||||
/**
 * Get the log10 unnormalized -- across all ACs -- prior probability of AC > 0
 *
 * @return the log10 prior stored at the AF1p (AC >= 1) index
 */
@Ensures({"goodLog10Probability(result)"})
public double getLog10PriorOfAFGT0() {
    return log10PriorsOfAC[AF1p];
}
|
||||
|
||||
/**
 * Are we sufficiently confident in being non-ref that the site is considered polymorphic?
 *
 * We are non-ref if the probability of being non-ref >= the emit confidence (often an argument).
 * Suppose log10minPNonRef is -3, i.e. the site must be non-ref with probability at least 10^-3.
 * If the log10 posterior of AF > 0 for this allele is -2, then 10^-2 >= 10^-3 => -2 >= -3,
 * so the site is considered polymorphic; if it were -5, then -5 < -3 and it is not.
 *
 * NOTE(review): the original javadoc example claimed -5 < -3 implied polymorphic, which
 * contradicts the >= comparison performed below; the example above matches the code.
 *
 * @param allele the allele whose segregation probability is tested; must be in getAllelesUsedInGenotyping
 * @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic
 *
 * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0
 */
public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) {
    return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef;
}
|
||||
|
||||
/**
|
||||
* Returns the log10 probability that allele is segregating
|
||||
*
|
||||
* Unlike the sites-level annotation, this calculation is specific to allele, and can be
|
||||
* used to separately determine how much evidence there is that allele is independently
|
||||
* segregating as opposed to the site being polymorphic with any allele. In the bi-allelic
|
||||
* case these are obviously the same but for multiple alt alleles there can be lots of
|
||||
* evidence for one allele but not so much for any other allele
|
||||
*
|
||||
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
|
||||
* @return the log10 probability that allele is segregating at this site
|
||||
*/
|
||||
@Ensures("goodLog10Probability(result)")
|
||||
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
|
||||
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
|
||||
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
|
||||
return log10pNonRef;
|
||||
}
|
||||
|
||||
/**
 * Returns the log10 normalized posteriors given the log10 likelihoods and priors
 *
 * @param log10LikelihoodsOfAC the unnormalized log10 likelihoods, indexed by AC category
 * @param log10PriorsOfAC the unnormalized log10 priors, same indexing and length
 *
 * @return freshly allocated log10 normalized posteriors vector
 */
@Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length")
@Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) {
    // unnormalized posterior = likelihood * prior, i.e. a sum in log space
    final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length];
    for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ )
        log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i];

    // necessary because the posteriors may be so skewed that the log-space normalized value isn't
    // good, so we have to try both log-space normalization as well as the real-space normalization if the
    // result isn't good
    final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true);
    if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) )
        return logNormalized;
    else
        return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
}
|
||||
|
||||
/**
|
||||
* Check that the log10 prob vector vector is well formed
|
||||
*
|
||||
* @param vector
|
||||
* @param expectedSize
|
||||
* @param shouldSumToOne
|
||||
*
|
||||
* @return true if vector is well-formed, false otherwise
|
||||
*/
|
||||
private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
|
||||
if ( vector.length != expectedSize ) return false;
|
||||
|
||||
for ( final double pr : vector ) {
|
||||
if ( ! goodLog10Probability(pr) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 )
|
||||
return false;
|
||||
|
||||
return true; // everything is good
|
||||
}
|
||||
|
||||
/**
 * Computes the offset into linear vectors indexed by alt allele for allele
 *
 * Things like our MLE allele count vector are indexed by alt allele index, with
 * the first alt allele being 0, the second 1, etc. This function computes the index
 * associated with allele.
 *
 * NOTE(review): the original javadoc said "greater than 0", but the first alt
 * allele maps to index 0, consistent with the @Ensures "result >= 0" below.
 *
 * @param allele the allele whose alt index we'd like to know
 * @throws IllegalArgumentException if allele isn't in allelesUsedInGenotyping or is the reference allele
 * @return an index value >= 0 suitable for indexing into the MLE and other alt allele indexed arrays
 */
@Requires("allele != null")
@Ensures({"result >= 0", "result < allelesUsedInGenotyping.size() - 1"})
private int altAlleleIndex(final Allele allele) {
    if ( allele.isReference() ) throw new IllegalArgumentException("Cannot get the alt allele index for reference allele " + allele);
    final int index = allelesUsedInGenotyping.indexOf(allele);
    if ( index == -1 )
        throw new IllegalArgumentException("could not find allele " + allele + " in " + allelesUsedInGenotyping);
    else
        return index - 1; // subtract 1 because position 0 of allelesUsedInGenotyping is the reference
}
|
||||
|
||||
/**
|
||||
* Checks that the result is a well-formed log10 probability
|
||||
*
|
||||
* @param result a supposedly well-formed log10 probability value
|
||||
* @return true if result is really well formed
|
||||
*/
|
||||
private static boolean goodLog10Probability(final double result) {
|
||||
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,256 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*
|
||||
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
|
||||
*/
|
||||
class AFCalcResultTracker {
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
|
||||
protected double log10MLE;
|
||||
protected double log10MAP;
|
||||
private final int[] alleleCountsOfMLE;
|
||||
private final int[] alleleCountsOfMAP;
|
||||
|
||||
// The posteriors seen, not including that of AF=0
|
||||
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
|
||||
private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE];
|
||||
private int currentLikelihoodsCacheIndex = 0;
|
||||
protected Double log10LikelihoodsMatrixSum = null;
|
||||
|
||||
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
|
||||
private double log10LikelihoodOfAFzero;
|
||||
private double log10PosteriorOfAFzero;
|
||||
private int[] AClimits;
|
||||
|
||||
int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
|
||||
* Create a results object capability of storing results for calls with up to maxAltAlleles
|
||||
*
|
||||
* @param maxAltAlleles an integer >= 1
|
||||
*/
|
||||
public AFCalcResultTracker(final int maxAltAlleles) {
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
|
||||
|
||||
alleleCountsOfMLE = new int[maxAltAlleles];
|
||||
alleleCountsOfMAP = new int[maxAltAlleles];
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*
|
||||
* @return a vector with allele counts, not all of which may be meaningful
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MAP
|
||||
*
|
||||
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
|
||||
*
|
||||
* @return a non-null vector of ints
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihoods summed across all AC values for AC > 0
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFNotZero() {
|
||||
if ( log10LikelihoodsMatrixSum == null ) {
|
||||
if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
|
||||
log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO;
|
||||
else
|
||||
log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
|
||||
}
|
||||
return log10LikelihoodsMatrixSum;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
|
||||
return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
|
||||
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
|
||||
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
|
||||
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
|
||||
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
|
||||
|
||||
// TODO -- replace with more meaningful computation
|
||||
// TODO -- refactor this calculation into the ref calculation
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
|
||||
for ( int i = 0; i < subACOfMLE.length; i++ ) {
|
||||
final Allele allele = allelesUsedInGenotyping.get(i+1);
|
||||
final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
|
||||
log10pNonRefByAllele.put(allele, log10PNonRef);
|
||||
}
|
||||
|
||||
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Protected mutational methods only for use within the calculation models themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
|
||||
*
|
||||
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
|
||||
*/
|
||||
protected void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED;
|
||||
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
|
||||
alleleCountsOfMLE[i] = 0;
|
||||
alleleCountsOfMAP[i] = 0;
|
||||
}
|
||||
currentLikelihoodsCacheIndex = 0;
|
||||
log10LikelihoodsMatrixSum = null;
|
||||
allelesUsedInGenotyping = null;
|
||||
nEvaluations = 0;
|
||||
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tell this result we used one more evaluation cycle
|
||||
*/
|
||||
protected void incNEvaluations() {
|
||||
nEvaluations++;
|
||||
}
|
||||
|
||||
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToLikelihoodsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMLE[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
if ( log10LofK > log10MAP ) {
|
||||
log10MAP = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMAP[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
private void addToLikelihoodsCache(final double log10LofK) {
|
||||
// add to the cache
|
||||
log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK;
|
||||
|
||||
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
|
||||
if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
|
||||
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
|
||||
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
|
||||
log10LikelihoodsMatrixValues[0] = temporarySum;
|
||||
currentLikelihoodsCacheIndex = 1;
|
||||
}
|
||||
}
|
||||
|
||||
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMLE, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
|
||||
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
|
||||
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
protected void setAClimits(int[] AClimits) {
|
||||
this.AClimits = AClimits;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
@Deprecated
|
||||
public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
}
|
||||
|
||||
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
|
||||
final int[] maxACsToConsider = computeMaxACs(vc);
|
||||
resultTracker.setAClimits(maxACsToConsider);
|
||||
return new StateTracker(maxACsToConsider);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the maximum ACs we need to consider for each alt allele
|
||||
*
|
||||
* Walks over the genotypes in VC, and computes for each alt allele the maximum
|
||||
* AC we need to consider in that alt allele dimension. Does the calculation
|
||||
* based on the PLs in each genotype g, choosing to update the max AC for the
|
||||
* alt alleles corresponding to that PL. Only takes the first lowest PL,
|
||||
* if there are multiple genotype configurations with the same PL value. It
|
||||
* takes values in the order of the alt alleles.
|
||||
*
|
||||
* @param vc the variant context we will compute max alt alleles for
|
||||
* @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the
|
||||
* first alt allele.
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
protected final int[] computeMaxACs(final VariantContext vc) {
|
||||
final int[] maxACs = new int[vc.getNAlleles()-1];
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
updateMaxACs(g, maxACs);
|
||||
|
||||
return maxACs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the maximum achievable allele counts in maxAC according to the PLs in g
|
||||
*
|
||||
* Selects the maximum genotype configuration from the PLs in g, and updates
|
||||
* the maxAC for this configure. For example, if the lowest PL is for 0/1, updates
|
||||
* the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for
|
||||
* many number of alt alleles (determined by length of maxACs).
|
||||
*
|
||||
* If the max PL occurs at 0/0, updates nothing
|
||||
* Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have
|
||||
* the same PL value, then updates the first one.
|
||||
*
|
||||
* Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL,
|
||||
* then only first one (1) will be updated
|
||||
*
|
||||
* @param g the genotype to update
|
||||
* @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele)
|
||||
*/
|
||||
@Requires({
|
||||
"g != null",
|
||||
"maxACs != null",
|
||||
"goodMaxACs(maxACs)"})
|
||||
private void updateMaxACs(final Genotype g, final int[] maxACs) {
|
||||
final int[] PLs = g.getLikelihoods().getAsPLs();
|
||||
|
||||
int minPLi = 0;
|
||||
int minPL = PLs[0];
|
||||
|
||||
for ( int i = 0; i < PLs.length; i++ ) {
|
||||
if ( PLs[i] < minPL ) {
|
||||
minPL = PLs[i];
|
||||
minPLi = i;
|
||||
}
|
||||
}
|
||||
|
||||
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi);
|
||||
updateMaxACs(maxACs, pair.alleleIndex1);
|
||||
updateMaxACs(maxACs, pair.alleleIndex2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref)
|
||||
*
|
||||
* If alleleI == 0 => doesn't update anything
|
||||
* else maxACs[alleleI - 1]++
|
||||
*
|
||||
* @param maxACs array of max alt allele ACs
|
||||
* @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles.
|
||||
*/
|
||||
@Requires({
|
||||
"alleleI >= 0",
|
||||
"(alleleI - 1) < maxACs.length",
|
||||
"goodMaxACs(maxACs)"})
|
||||
private void updateMaxACs(final int[] maxACs, final int alleleI) {
|
||||
if ( alleleI > 0 )
|
||||
maxACs[alleleI-1]++;
|
||||
}
|
||||
|
||||
private static boolean goodMaxACs(final int[] maxACs) {
|
||||
return MathUtils.sum(maxACs) >= 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,350 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
||||
/**
 * Creates an exact AF calculator restricted to diploid samples.
 *
 * @param nSamples number of samples (forwarded to the superclass)
 * @param maxAltAlleles max alternate alleles to genotype (forwarded)
 * @param maxAltAllelesForIndels max alternate alleles for indel contexts (forwarded)
 * @param ploidy must be exactly 2
 * @throws IllegalArgumentException if ploidy != 2
 */
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
    super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
}
|
||||
|
||||
// Factory for the per-calculation StateTracker; subclasses decide what AC limits to impose
// (see its use in computeLog10PNonRef; exact contract inferred from that call site — confirm)
protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker);
|
||||
|
||||
/**
 * Runs the exact AF calculation over vc via a breadth-first exploration of AC conformations.
 *
 * Starts from the AC=0 conformation and expands to higher-AC conformations as they are
 * enqueued by calculateAlleleCountConformation; the StateTracker decides which conformations
 * are still worth evaluating (withinMaxACs) and records the running maximum likelihood.
 *
 * @param vc the variant context to genotype
 * @param log10AlleleFrequencyPriors log10 priors indexed by total AC
 * @return the AFCalcResult assembled from the result tracker
 */
@Override
protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
                                           final double[] log10AlleleFrequencyPriors) {
    final int numAlternateAlleles = vc.getNAlleles() - 1;
    final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
    final int numSamples = genotypeLikelihoods.size()-1;
    final int numChr = 2*numSamples; // diploid: two chromosomes per sample

    // queue of AC conformations to process
    final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();

    // mapping of ExactACset indexes to the objects
    final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);

    // add AC=0 to the queue
    final int[] zeroCounts = new int[numAlternateAlleles];
    ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.getACcounts(), zeroSet);

    // keep processing while we have AC conformations that need to be calculated
    final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker());

    while ( !ACqueue.isEmpty() ) {
        getResultTracker().incNEvaluations(); // keep track of the number of evaluations

        // compute log10Likelihoods
        final ExactACset set = ACqueue.remove();

        if ( stateTracker.withinMaxACs(set.getACcounts()) ) {
            final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker());

            // adjust max likelihood seen if needed
            stateTracker.update(log10LofKs, set.getACcounts());

            // clean up memory
            indexesToACset.remove(set.getACcounts());
            //if ( DEBUG )
            //    System.out.printf(" *** removing used set=%s%n", set.ACcounts);
        }
    }

    return resultFromTracker(vc, log10AlleleFrequencyPriors);
}
|
||||
|
||||
/**
 * Caps the number of alternate alleles to genotype, keeping the most likely ones.
 *
 * If vc has more alternate alleles than the configured maximum (indels use their own
 * limit), logs a warning and returns a new VariantContext restricted to the reference
 * plus the top alleles (genotypes subset accordingly); otherwise returns vc unchanged.
 *
 * @param vc the variant context to (possibly) reduce
 * @return vc itself, or a reduced copy with at most the configured number of alt alleles
 */
@Override
protected VariantContext reduceScope(final VariantContext vc) {
    final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;

    // don't try to genotype too many alternate alleles
    if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
        logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

        VariantContextBuilder builder = new VariantContextBuilder(vc);
        List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
        alleles.add(vc.getReference());
        alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
        builder.alleles(alleles);
        builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
        return builder.make();
    } else {
        return vc;
    }
}
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != 0 )
|
||||
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
/**
 * Immutable pairing of an AC-count vector with the PL index through which a sample
 * can reach that conformation; used to defer queueing of k+2 conformations so the
 * AC queue keeps its required ordering (see calculateAlleleCountConformation).
 */
private static final class DependentSet {
    public final int[] ACcounts;  // the AC conformation being deferred
    public final int PLindex;     // PL index of the genotype that produces this conformation

    public DependentSet(final int[] ACcounts, final int PLindex) {
        this.ACcounts = ACcounts;
        this.PLindex = PLindex;
    }
}
|
||||
|
||||
/**
 * Evaluates the likelihood of the AC conformation in set and enqueues its dependent
 * (higher-AC) conformations.
 *
 * Computes log10 L(K) for set, aborts early if the StateTracker says the likelihoods
 * have become negligible, and otherwise enqueues all k+1 conformations followed by all
 * k+2 conformations (different-allele pairs before same-allele pairs to preserve the
 * queue's required ordering).
 *
 * @param set the AC conformation being evaluated
 * @param genotypeLikelihoods the per-sample GL vectors
 * @param stateTracker tracks the max likelihood and abort/limit conditions
 * @param numChr total chromosome count (2N for diploids)
 * @param ACqueue queue of conformations awaiting evaluation
 * @param indexesToACset lookup from AC counts to their in-flight ExactACset
 * @param log10AlleleFrequencyPriors log10 priors indexed by total AC
 * @param resultTracker accumulator for the calculation's results
 * @return the log10 likelihood of the conformation in set
 */
private double calculateAlleleCountConformation(final ExactACset set,
                                                final ArrayList<double[]> genotypeLikelihoods,
                                                final StateTracker stateTracker,
                                                final int numChr,
                                                final LinkedList<ExactACset> ACqueue,
                                                final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                final double[] log10AlleleFrequencyPriors,
                                                final AFCalcResultTracker resultTracker) {

    //if ( DEBUG )
    //    System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);

    // compute the log10Likelihoods
    computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker);

    final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];

    // can we abort early because the log10Likelihoods are so small?
    if ( stateTracker.abort(log10LofK, set.getACcounts()) ) {
        //if ( DEBUG )
        //    System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    final int ACwiggle = numChr - set.getACsum();
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
        return log10LofK;

    final int numAltAlleles = set.getACcounts().getCounts().length;

    // add conformations for the k+1 case
    for ( int allele = 0; allele < numAltAlleles; allele++ ) {
        final int[] ACcountsClone = set.getACcounts().getCounts().clone();
        ACcountsClone[allele]++;
        // to get to this conformation, a sample would need to be AB (remember that ref=0)
        final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
        updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
    }

    // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
    if ( ACwiggle > 1 ) {
        final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
        final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

        for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
            for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                final int[] ACcountsClone = set.getACcounts().getCounts().clone();
                ACcountsClone[allele_i]++;
                ACcountsClone[allele_j]++;

                // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
                final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
                if ( allele_i == allele_j )
                    sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
                else
                    differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
            }
        }

        // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
        for ( DependentSet dependent : differentAlleles )
            updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        for ( DependentSet dependent : sameAlleles )
            updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
    }

    return log10LofK;
}
|
||||
|
||||
    // Adds the ExactACset represented by newSetCounts to the ACqueue if not already there (creating it
    // and registering it in indexesToACset if needed), then pushes the dependent set's likelihood data
    // into it via pushData().
    //
    // NOTE(review): the stateTracker parameter is never referenced in this method body -- confirm
    // whether it is needed for signature symmetry with callers or can be removed.
    private void updateACset(final StateTracker stateTracker,
                             final int[] newSetCounts,
                             final int numChr,
                             final ExactACset dependentSet,
                             final int PLsetIndex,
                             final Queue<ExactACset> ACqueue,
                             final HashMap<ExactACcounts, ExactACset> indexesToACset,
                             final ArrayList<double[]> genotypeLikelihoods) {
        final ExactACcounts index = new ExactACcounts(newSetCounts);
        if ( !indexesToACset.containsKey(index) ) {
            // likelihood column has one cell per sample plus the j=0 cell (numChr/2 samples for diploids)
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
        }

        // push data from the dependency to the new set
        //if ( DEBUG )
        //    System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
        pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
    }
|
||||
|
||||
    // Computes the log10 likelihood column L(j, k) for this allele-count conformation (set) and
    // records the results in resultTracker.
    //
    // genotypeLikelihoods is indexed 1-based by sample (index 0 is a dummy entry), matching the
    // j indexing of set.getLog10Likelihoods() below.
    private void computeLofK(final ExactACset set,
                             final ArrayList<double[]> genotypeLikelihoods,
                             final double[] log10AlleleFrequencyPriors,
                             final AFCalcResultTracker resultTracker) {

        set.getLog10Likelihoods()[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

        // special case for k = 0 over all k: the column is just the running sum of hom-ref GLs,
        // and we record the AF==0 likelihood/posterior directly rather than via the MLE/MAP updates
        if ( totalK == 0 ) {
            for ( int j = 1; j < set.getLog10Likelihoods().length; j++ )
                set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];

            final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
            resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
            resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
            return;
        }

        // if we got here, then k > 0 for at least one k.
        // the non-AA possible conformations were already dealt with by pushes from dependent sets;
        // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
        for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) {

            // the AA (hom-ref) contribution is only possible when totalK < 2j-1, i.e. there are
            // enough remaining chromosomes for this sample to be hom-ref
            if ( totalK < 2*j-1 ) {
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX];
                set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue);
            }

            // normalize by 2j * (2j - 1), the number of chromosome orderings for j samples
            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
            set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator;
        }

        double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];

        // update the MLE if necessary (pure likelihood, before priors are applied)
        resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());

        // apply the priors over each alternate allele
        for ( final int ACcount : set.getACcounts().getCounts() ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }
        // the MAP is tracked on the prior-adjusted (posterior-proportional) value
        resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
    }
|
||||
|
||||
    // Pushes likelihood mass from a previously computed (dependent) conformation into targetSet:
    // for each sample j, adds the coefficient-weighted contribution of the genotype identified by
    // PLsetIndex, using the dependent set's value at j-1 (one fewer sample).
    //
    // genotypeLikelihoods is indexed 1-based by sample (index 0 is a dummy entry).
    private void pushData(final ExactACset targetSet,
                          final ExactACset dependentSet,
                          final int PLsetIndex,
                          final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();

        for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) {

            if ( totalK <= 2*j ) { // skip impossible conformations (more alt chromosomes than 2j available)
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue =
                        determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex];
                // accumulate in log space
                targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue);
            }
        }
    }
|
||||
|
||||
private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
// AA: (2j - totalK) * (2j - totalK - 1)
|
||||
// AB: 2k_b * (2j - totalK)
|
||||
// AC: 2k_c * (2j - totalK)
|
||||
// BB: k_b * (k_b - 1)
|
||||
// BC: 2 * k_b * k_c
|
||||
// CC: k_c * (k_c - 1)
|
||||
|
||||
// find the 2 alleles that are represented by this PL index
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
|
||||
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
|
||||
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
|
||||
|
||||
// the AX het case
|
||||
if ( alleles.alleleIndex1 == 0 )
|
||||
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
final int k_i = ACcounts[alleles.alleleIndex1-1];
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[alleles.alleleIndex2-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
return coeff;
|
||||
}
|
||||
|
||||
    /**
     * Subsets the genotypes of vc down to the given alleles.
     *
     * NOTE(review): the ploidy argument is ignored here -- this implementation always delegates to
     * the diploid-specific VariantContextUtils.subsetDiploidAlleles; confirm callers never pass a
     * non-diploid ploidy through this path.
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
    }
|
||||
}
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 10/5/12
|
||||
* Time: 2:54 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/ // a wrapper around the int array so that we can make it hashable
|
||||
public final class ExactACcounts {
|
||||
private final int[] counts;
|
||||
private int hashcode = -1;
|
||||
|
||||
public ExactACcounts(final int[] counts) {
|
||||
this.counts = counts;
|
||||
}
|
||||
|
||||
public int[] getCounts() {
|
||||
return counts;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACcounts) && Arrays.equals(getCounts(), ((ExactACcounts) obj).getCounts());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if ( hashcode == -1 )
|
||||
hashcode = Arrays.hashCode(getCounts());
|
||||
return hashcode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(getCounts()[0]);
|
||||
for ( int i = 1; i < getCounts().length; i++ ) {
|
||||
sb.append("/");
|
||||
sb.append(getCounts()[i]);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 10/5/12
|
||||
* Time: 2:53 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/ // This class represents a column in the Exact AC calculation matrix
|
||||
public final class ExactACset {
|
||||
// the counts of the various alternate alleles which this column represents
|
||||
private final ExactACcounts ACcounts;
|
||||
|
||||
// the column of the matrix
|
||||
private final double[] log10Likelihoods;
|
||||
|
||||
int sum = -1;
|
||||
|
||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||
this.ACcounts = ACcounts;
|
||||
log10Likelihoods = new double[size];
|
||||
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* sum of all the non-reference alleles
|
||||
*/
|
||||
public int getACsum() {
|
||||
if ( sum == -1 )
|
||||
sum = (int)MathUtils.sum(getACcounts().getCounts());
|
||||
return sum;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
return (obj instanceof ExactACset) && getACcounts().equals(((ExactACset)obj).getACcounts());
|
||||
}
|
||||
|
||||
public ExactACcounts getACcounts() {
|
||||
return ACcounts;
|
||||
}
|
||||
|
||||
public double[] getLog10Likelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Uses the Exact calculation of Heng Li
|
||||
*/
|
||||
abstract class ExactAFCalc extends AFCalc {
|
||||
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
|
||||
|
||||
protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrapper class that compares two likelihoods associated with two alleles
|
||||
*/
|
||||
protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
|
||||
public double sum = 0.0;
|
||||
public Allele allele;
|
||||
|
||||
public LikelihoodSum(Allele allele) { this.allele = allele; }
|
||||
|
||||
public int compareTo(LikelihoodSum other) {
|
||||
final double diff = sum - other.sum;
|
||||
return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unpack GenotypesContext into arraylist of doubel values
|
||||
* @param GLs Input genotype context
|
||||
* @return ArrayList of doubles corresponding to GL vectors
|
||||
*/
|
||||
protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size() + 1);
|
||||
|
||||
if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL )
|
||||
genotypeLikelihoods.add(gls);
|
||||
}
|
||||
}
|
||||
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,338 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
// Multi-allelic exact AF calculation that treats each alternate allele as an independent
// bi-allelic problem (via the reference diploid exact model) and then combines the
// per-allele results into a single AFCalcResult.
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
    // shared no-call diploid allele list used for all reduced bi-allelic genotypes
    private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);

    // orders results by increasing log10 likelihood of AF > 0
    private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
        @Override
        public int compare(AFCalcResult o1, AFCalcResult o2) {
            return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0());
        }
    }

    private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();

    // the bi-allelic exact model each per-allele sub-computation delegates to (1 alt allele, same ploidy)
    final ReferenceDiploidExactAFCalc refModel;

    protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
        refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
    }

    @Override
    protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
        // delegate to the underlying bi-allelic model's tracker
        return refModel.makeMaxLikelihood(vc, resultTracker);
    }

    // AFCalcResult that additionally retains the per-allele results it was combined from
    private static class MyAFCalcResult extends AFCalcResult {
        final List<AFCalcResult> supporting;

        private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
            super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
            this.supporting = supporting;
        }
    }

    // Pipeline: ref likelihood -> independent per-allele exact results -> multi-allelic priors -> combined result
    @Override
    public AFCalcResult computeLog10PNonRef(final VariantContext vc,
                                            final double[] log10AlleleFrequencyPriors) {
        final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc);
        final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
        final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
        return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors);
    }

    protected final double computelog10LikelihoodOfRef(final VariantContext vc) {
        // this value just the likelihood of AF == 0 in the special constrained multi-allelic calculation
        final List<double[]> allGLs = getGLs(vc.getGenotypes(), false);
        double log10LikelihoodOfHomRef = 0.0;

        // TODO -- can be easily optimized (currently looks at all GLs via getGLs)
        // sum the hom-ref (index 0) log10 GL across all informative samples
        for ( int i = 0; i < allGLs.size(); i++ ) {
            final double[] GLs = allGLs.get(i);
            log10LikelihoodOfHomRef += GLs[0];
            //log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0];
        }

        return log10LikelihoodOfHomRef;
    }

    /**
     * Computes the conditional bi-allelic exact results
     *
     * Suppose vc contains 2 alt allele: A* with C and T. This function first computes:
     *
     * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
     *
     * it then computes the conditional probability on AF_c == 0:
     *
     * (2) P(D | AF_t > 0 && AF_c == 0)
     *
     * Thinking about this visually, we have the following likelihood matrix where each cell is
     * the P(D | AF_c == i && AF_t == j):
     *
     *     0     AF_c > 0
     *  -----------------
     * 0 |  |
     *   |--|-------------
     * a |  |
     * f |  |
     * _ |  |
     * t |  |
     * > |  |
     * 0 |  |
     *
     * What we really want to know how
     *
     * (3) P(D | AF_c == 0 & AF_t == 0)
     *
     * compares with
     *
     * (4) P(D | AF_c > 0 || AF_t > 0)
     *
     * This is effectively asking for the value in the upper left vs. the sum of all cells.
     *
     * The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the
     * band at the top where AF_t > 0 and AF_c == 0
     *
     * So (4) is actually (1) + (2).
     *
     * (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating
     *
     * (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything]
     * (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use]
     *
     * This function implements the conditional likelihoods summation for any number of alt
     * alleles (not just the tri-allelic case), where each subsequent variant context is
     * further constrained such that each already considered allele x has AF_x == 0 in the
     * compute.
     *
     * @param vc                          the multi-allelic variant context to decompose
     * @param log10AlleleFrequencyPriors  priors passed through to the bi-allelic model
     * @return one AFCalcResult per alternate allele, in allele order
     */
    protected List<AFCalcResult> computeAlleleConditionalExact(final VariantContext vc,
                                                               final double[] log10AlleleFrequencyPriors) {
        final List<AFCalcResult> results = new LinkedList<AFCalcResult>();

        for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) {
            final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
            results.add(resultTracker);
        }

        return results;
    }

    // Builds one reduced bi-allelic VariantContext per alternate allele (ref + that allele).
    protected List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
        final int nAltAlleles = vc.getNAlleles() - 1;
        final List<VariantContext> vcs = new LinkedList<VariantContext>();

        final List<Allele> afZeroAlleles = new LinkedList<Allele>();
        for ( int altI = 0; altI < nAltAlleles; altI++ ) {
            final Allele altAllele = vc.getAlternateAllele(altI);
            final List<Allele> biallelic = Arrays.asList(vc.getReference(), altAllele);
            vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1));
            // NOTE(review): the accumulation of already-processed alleles is disabled, so
            // afZeroAlleles stays empty and nothing is discarded -- confirm this is intentional
            //afZeroAlleles.add(altAllele);
        }

        return vcs;
    }

    // Returns rootVC unchanged when already bi-allelic; otherwise rebuilds it with the two given
    // alleles and per-sample genotypes whose GLs are combined down to the bi-allelic case.
    protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List<Allele> biallelic, final List<Allele> afZeroAlleles, final int allele2) {
        if ( rootVC.isBiallelic() ) {
            if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles);
            return rootVC;
        } else {
            final Set<Integer> allelesToDiscard = new HashSet<Integer>(rootVC.getAlleleIndices(afZeroAlleles));
            final int nAlts = rootVC.getNAlleles() - 1;
            final List<Genotype> biallelicGenotypes = new ArrayList<Genotype>(rootVC.getNSamples());
            for ( final Genotype g : rootVC.getGenotypes() )
                biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts));

            final VariantContextBuilder vcb = new VariantContextBuilder(rootVC);
            vcb.alleles(biallelic);
            vcb.genotypes(biallelicGenotypes);
            return vcb.make();
        }
    }

    /**
     * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case
     *
     * This is handled in the following way:
     *
     * Suppose we have for a A/B/C site the following GLs:
     *
     * AA AB BB AC BC CC
     *
     * and we want to get the bi-allelic GLs for X/B, where X is everything not B
     *
     * XX = AA + AC + CC (since X = A or C)
     * XB = AB + BC
     * BB = BB
     *
     * Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is
     * useful in the case where you want to drop alleles (not combine them), such as above:
     *
     * AA AB BB AC BC CC
     *
     * and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2)
     *
     * XX = AA (since X = A and C is dropped)
     * XB = AB
     * BB = BB
     *
     * This allows us to recover partial GLs the correspond to any allele in allelesToDiscard having strictly
     * AF == 0.
     *
     * @param original the original multi-allelic genotype
     * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0
     * @param nAlts the total number of alt alleles
     * @return a new biallelic genotype with appropriate PLs
     */
    @Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"})
    @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
    protected Genotype combineGLs(final Genotype original, final int altIndex, final Set<Integer> allelesToDiscard, final int nAlts ) {
        if ( original.isNonInformative() )
            return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make();

        if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts);

        // work in normalized probability (real) space so bins can be summed directly
        final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector());
        final double[] biAllelicPr = new double[3];

        for ( int index = 0; index < normalizedPr.length; index++ ) {
            final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);

            // just continue if we shouldn't include the pair because it's in the discard set
            if ( discardAllelePair(pair, allelesToDiscard) )
                continue;

            if ( pair.alleleIndex1 == altIndex ) {
                if ( pair.alleleIndex2 == altIndex )
                    // hom-alt case
                    // NOTE(review): plain "=" rather than "+=" -- only one PL index maps to alt/alt,
                    // so assignment and accumulation coincide here
                    biAllelicPr[2] = normalizedPr[index];
                else
                    // het-alt case
                    biAllelicPr[1] += normalizedPr[index];
            } else {
                if ( pair.alleleIndex2 == altIndex )
                    // het-alt case
                    biAllelicPr[1] += normalizedPr[index];
                else
                    // hom-non-alt
                    biAllelicPr[0] += normalizedPr[index];
            }
        }

        // back to log10 space for the reduced 3-element GL vector
        final double[] GLs = new double[3];
        for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]);

        return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
    }

    // true iff either allele of the pair is in the discard set
    protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set<Integer> allelesToDiscard) {
        return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2);
    }

    // Re-binds each per-allele result with a theta^N-style prior: the i-th most likely allele
    // (0-based) gets prior (i+1) * log10(single-allele prior of AF > 0).
    protected List<AFCalcResult> applyMultiAllelicPriors(final List<AFCalcResult> conditionalPNonRefResults) {
        final ArrayList<AFCalcResult> sorted = new ArrayList<AFCalcResult>(conditionalPNonRefResults);

        // sort the results, so the most likely allele is first
        Collections.sort(sorted, compareAFCalcResultsByPNonRef);

        final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0();

        for ( int i = 0; i < sorted.size(); i++ ) {
            final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
            // complement in real space: P(AF == 0) = 1 - P(AF > 0)
            final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0));
            final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 };

            // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
            sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true)));
        }

        return sorted;
    }


    /**
     * Take the independent estimates of pNonRef for each alt allele and combine them into a single result
     *
     * @param vc                            the original multi-allelic variant context
     * @param log10LikelihoodsOfACEq0       the joint log10 likelihood of AF == 0 for all alleles
     * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
     */
    protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
                                                      final double log10LikelihoodsOfACEq0,
                                                      final List<AFCalcResult> sortedResultsWithThetaNPriors) {
        int nEvaluations = 0;
        final int nAltAlleles = sortedResultsWithThetaNPriors.size();
        final int[] alleleCountsOfMLE = new int[nAltAlleles];
        final double[] log10PriorsOfAC = new double[2];
        final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);

        // this value is a sum in real space so we need to store values to sum up later
        final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles];

        for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
            // each sub-result is bi-allelic: index 1 of its alleles is the single alt allele
            final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
            final int altI = vc.getAlleles().indexOf(altAllele) - 1;

            // MLE of altI allele is simply the MLE of this allele in altAlleles
            alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);

            log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
            log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();

            // the AF > 0 case requires us to store the normalized likelihood for later summation
            log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0();

            // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
            log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());

            // trivial -- update the number of evaluations
            nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
        }

        // the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles
        final double[] log10LikelihoodsOfAC = new double[]{
                log10LikelihoodsOfACEq0,
                MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)};

        return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
                MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0
                MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized
                log10pNonRefByAllele, sortedResultsWithThetaNPriors);
    }
}
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Original bi-allelic ~O(N) implementation. Kept here for posterity and reference
|
||||
*/
|
||||
public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
    // Simple pass-through constructor; all configuration is handled by the DiploidExactAFCalc base.
    protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    }
|
||||
|
||||
    // Creates a fresh, default StateTracker; vc and resultTracker are unused by this implementation.
    protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
        return new StateTracker();
    }
|
||||
|
||||
@Override
|
||||
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
|
||||
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
|
||||
final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
|
||||
final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
|
||||
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
|
||||
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
|
||||
|
||||
final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
|
||||
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
|
||||
|
||||
return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
    /**
     * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
     * for the exact model calculation.
     *
     * rotate() recycles the oldest buffer (kMinus2) as the new current buffer (kMinus0) so no
     * reallocation happens between iterations; note the recycled buffer is NOT cleared here --
     * callers overwrite it.
     */
    private final static class ExactACCache {
        double[] kMinus2, kMinus1, kMinus0;

        private static double[] create(int n) {
            return new double[n];
        }

        public ExactACCache(int n) {
            kMinus2 = create(n);
            kMinus1 = create(n);
            kMinus0 = create(n);
        }

        final public void rotate() {
            // shift the window: current becomes prev, prev becomes prev-prev,
            // and the old prev-prev array is reused as the new current
            double[] tmp = kMinus2;
            kMinus2 = kMinus1;
            kMinus1 = kMinus0;
            kMinus0 = tmp;
        }

        final public double[] getkMinus2() {
            return kMinus2;
        }

        final public double[] getkMinus1() {
            return kMinus1;
        }

        final public double[] getkMinus0() {
            return kMinus0;
        }
    }
|
||||
|
||||
public int linearExact(final VariantContext vc,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyLikelihoods,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
boolean done = false;
|
||||
int lastK = -1;
|
||||
|
||||
for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
final double[] kMinus0 = logY.getkMinus0();
|
||||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
|
||||
log10Max = MathUtils.approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
log10Max = MathUtils.approximateLog10SumLog10(aa, ab);
|
||||
}
|
||||
|
||||
// finally, update the L(j,k) value
|
||||
kMinus0[j] = log10Max - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
// update the posteriors vector
|
||||
final double log10LofK = kMinus0[numSamples];
|
||||
log10AlleleFrequencyLikelihoods[k] = log10LofK;
|
||||
log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
done = true;
|
||||
}
|
||||
|
||||
logY.rotate();
|
||||
}
|
||||
|
||||
return lastK;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
}
|
||||
|
||||
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
|
||||
return new StateTracker();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,96 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
/**
|
||||
* Keeps track of the best state seen by the exact model and the max states to visit
|
||||
* allowing us to abort the search before we visit the entire matrix of AC x samples
|
||||
*/
|
||||
final class StateTracker {
|
||||
public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
final private int[] maxACsToConsider;
|
||||
|
||||
private ExactACcounts ACsAtMax = null;
|
||||
private double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
|
||||
public StateTracker() {
|
||||
this(null);
|
||||
}
|
||||
|
||||
public StateTracker(final int[] maxACsToConsider) {
|
||||
this.maxACsToConsider = maxACsToConsider;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state
|
||||
*
|
||||
* @param log10LofKs the likelihood of our current configuration state
|
||||
*/
|
||||
public void update(final double log10LofKs, final ExactACcounts ACs) {
|
||||
if ( log10LofKs > getMaxLog10L()) {
|
||||
this.setMaxLog10L(log10LofKs);
|
||||
this.ACsAtMax = ACs;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the likelihood of configuration K too low to consider, related to the
|
||||
* maximum likelihood seen already?
|
||||
*
|
||||
* @param log10LofK the log10 likelihood of the configuration we're considering analyzing
|
||||
* @return true if the configuration cannot meaningfully contribute to our likelihood sum
|
||||
*/
|
||||
public boolean tooLowLikelihood(final double log10LofK) {
|
||||
return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider?
|
||||
*
|
||||
* @param otherACs the set of otherACs that we want to know if we should consider analyzing
|
||||
* @return true if otherACs is a state worth considering, or false otherwise
|
||||
*/
|
||||
public boolean withinMaxACs(final ExactACcounts otherACs) {
|
||||
if ( maxACsToConsider == null )
|
||||
return true;
|
||||
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
|
||||
for ( int i = 0; i < maxACsToConsider.length; i++ ) {
|
||||
// consider one more than the max AC to collect a bit more likelihood mass
|
||||
if ( otherACcounts[i] > maxACsToConsider[i] + 1 )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
|
||||
*/
|
||||
public boolean isLowerAC(final ExactACcounts otherACs) {
|
||||
if ( ACsAtMax == null )
|
||||
return true;
|
||||
|
||||
final int[] myACcounts = this.ACsAtMax.getCounts();
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
|
||||
for ( int i = 0; i < myACcounts.length; i++ ) {
|
||||
if ( myACcounts[i] > otherACcounts[i] )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean abort( final double log10LofK, final ExactACcounts ACs ) {
|
||||
return tooLowLikelihood(log10LofK) && isLowerAC(ACs);
|
||||
}
|
||||
|
||||
public double getMaxLog10L() {
|
||||
return maxLog10L;
|
||||
}
|
||||
|
||||
public void setMaxLog10L(double maxLog10L) {
|
||||
this.maxLog10L = maxLog10L;
|
||||
}
|
||||
}
|
||||
|
|
@ -23,8 +23,9 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
|
@ -32,7 +33,9 @@ import java.util.TreeSet;
|
|||
|
||||
public class GLBasedSampleSelector extends SampleSelector {
|
||||
double[] flatPriors = null;
|
||||
double referenceLikelihood;
|
||||
final double referenceLikelihood;
|
||||
AFCalc AFCalculator;
|
||||
|
||||
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
|
||||
super(sm);
|
||||
referenceLikelihood = refLik;
|
||||
|
|
@ -49,11 +52,11 @@ public class GLBasedSampleSelector extends SampleSelector {
|
|||
// do we want to apply a prior? maybe user-spec?
|
||||
if ( flatPriors == null ) {
|
||||
flatPriors = new double[1+2*samples.size()];
|
||||
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2);
|
||||
}
|
||||
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
|
||||
final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors);
|
||||
// do we want to let this qual go up or down?
|
||||
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
|
||||
if ( result.getLog10LikelihoodOfAFEq0() < referenceLikelihood ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
|
|||
boolean failed = false;
|
||||
byte[] recordRef = vc.getReference().getBases();
|
||||
for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) {
|
||||
if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) {
|
||||
if ( recordRef[i] != ref[i] ) {
|
||||
failed = true;
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
|
||||
/**
|
||||
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
|
||||
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
|
||||
*/
|
||||
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
|
||||
protected int numRandom = 0;
|
||||
|
||||
/**
|
||||
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
|
||||
*/
|
||||
|
|
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
|
||||
|
||||
|
||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
||||
private static class RandomVariantStructure {
|
||||
private VariantContext vc;
|
||||
|
||||
RandomVariantStructure(VariantContext vcP) {
|
||||
vc = vcP;
|
||||
}
|
||||
|
||||
public void set (VariantContext vcP) {
|
||||
vc = vcP;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public enum NumberAlleleRestriction {
|
||||
ALL,
|
||||
BIALLELIC,
|
||||
|
|
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
|
||||
/* variables used by the SELECT RANDOM modules */
|
||||
private boolean SELECT_RANDOM_NUMBER = false;
|
||||
private boolean SELECT_RANDOM_FRACTION = false;
|
||||
private int variantNumber = 0;
|
||||
private int nVariantsAdded = 0;
|
||||
private int positionToAdd = 0;
|
||||
private RandomVariantStructure [] variantArray;
|
||||
|
||||
//Random number generator for the genotypes to remove
|
||||
private Random randomGenotypes = new Random();
|
||||
|
|
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
|
||||
}
|
||||
|
||||
SELECT_RANDOM_NUMBER = numRandom > 0;
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
logger.info("Selecting " + numRandom + " variants at random from the variant track");
|
||||
variantArray = new RandomVariantStructure[numRandom];
|
||||
}
|
||||
|
||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||
|
||||
|
|
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
break;
|
||||
}
|
||||
}
|
||||
if ( !failedJexlMatch ) {
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
randomlyAddVariant(++variantNumber, sub);
|
||||
}
|
||||
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
|
||||
if ( ! justRead )
|
||||
vcfWriter.add(sub);
|
||||
}
|
||||
if ( !failedJexlMatch &&
|
||||
!justRead &&
|
||||
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
|
||||
vcfWriter.add(sub);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info(result + " records processed.");
|
||||
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
int positionToPrint = positionToAdd;
|
||||
for (int i=0; i<numRandom; i++) {
|
||||
vcfWriter.add(variantArray[positionToPrint].vc);
|
||||
positionToPrint = nextCircularPosition(positionToPrint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -746,9 +701,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
GenotypesContext newGC = sub.getGenotypes();
|
||||
|
||||
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
|
||||
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
|
||||
if ( vc.getAlleles().size() != sub.getAlleles().size() )
|
||||
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
|
||||
newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes());
|
||||
|
||||
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
|
||||
if ( vc.getNSamples() != sub.getNSamples() ) {
|
||||
|
|
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if ( sawDP )
|
||||
builder.attribute("DP", depth);
|
||||
}
|
||||
|
||||
private void randomlyAddVariant(int rank, VariantContext vc) {
|
||||
if (nVariantsAdded < numRandom)
|
||||
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
|
||||
|
||||
else {
|
||||
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
double t = (1.0/(rank-numRandom+1));
|
||||
if ( v < t) {
|
||||
variantArray[positionToAdd].set(vc);
|
||||
nVariantsAdded++;
|
||||
positionToAdd = nextCircularPosition(positionToAdd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int nextCircularPosition(int cur) {
|
||||
if ((cur + 1) == variantArray.length)
|
||||
return 0;
|
||||
return cur + 1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -7,29 +9,28 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Yet another VCF to Ped converter. The world actually does need one that will
|
||||
* work efficiently on large VCFs (or at least give a progress bar). This
|
||||
* produces a binary ped file in individual major mode.
|
||||
* Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam)
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -40,29 +41,33 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
/**
|
||||
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
|
||||
* is what Plink describes as a fam file. An example fam file is (note that there is no header):
|
||||
*
|
||||
* CEUTrio NA12878 NA12891 NA12892 2 -9
|
||||
* CEUTrio NA12891 UNKN1 UNKN2 2 -9
|
||||
* CEUTrio NA12892 UNKN3 UNKN4 1 -9
|
||||
*
|
||||
* <p><p>
|
||||
* CEUTrio NA12878 NA12891 NA12892 2 -9</p><p>
|
||||
* CEUTrio NA12891 UNKN1 UNKN2 2 -9</p><p>
|
||||
* CEUTrio NA12892 UNKN3 UNKN4 1 -9</p><p>
|
||||
* </p>
|
||||
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
|
||||
*
|
||||
* <p>
|
||||
* An alternate format is a two-column key-value file
|
||||
*
|
||||
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
|
||||
* NA12891 fid=CEUTrio;sex=2;phenotype=-9
|
||||
* NA12892 fid=CEUTrio;sex=1;phenotype=-9
|
||||
*
|
||||
* </p><p><p>
|
||||
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9</p><p>
|
||||
* NA12891 fid=CEUTrio;sex=2;phenotype=-9</p><p>
|
||||
* NA12892 fid=CEUTrio;sex=1;phenotype=-9</p><p>
|
||||
* </p><p>
|
||||
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
|
||||
*
|
||||
* </p><p>
|
||||
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
|
||||
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
|
||||
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
|
||||
* </p>
|
||||
*/
|
||||
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
|
||||
"(in which case it will be copied to the file you provide as fam output).")
|
||||
File metaDataFile;
|
||||
|
||||
@Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)")
|
||||
OutputMode mode = OutputMode.INDIVIDUAL_MAJOR;
|
||||
|
||||
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
|
||||
PrintStream outBed;
|
||||
|
||||
|
|
@ -78,7 +83,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||
boolean majorAlleleFirst = false;
|
||||
|
||||
private ValidateVariants vv = new ValidateVariants();
|
||||
enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR }
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
||||
|
|
@ -89,6 +94,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
||||
|
||||
private static final String PLINK_DELETION_MARKER = "-";
|
||||
|
||||
// note that HET and NO_CALL are flipped from the documentation: that's because
|
||||
// plink actually reads these in backwards; and we want to use a shift operator
|
||||
// to put these in the appropriate location
|
||||
|
|
@ -99,9 +106,10 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
private int genotypeCount = 0;
|
||||
private int byteCount = 0;
|
||||
private List<String> famOrder = new ArrayList<String>();
|
||||
private long totalByteCount = 0l;
|
||||
private long totalGenotypeCount = 0l;
|
||||
|
||||
public void initialize() {
|
||||
initializeValidator();
|
||||
writeBedHeader();
|
||||
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
|
||||
// create temporary output streams and buffers
|
||||
|
|
@ -136,36 +144,43 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
throw new UserException("No metadata provided for sample "+sample);
|
||||
}
|
||||
}
|
||||
try {
|
||||
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
|
||||
printMap.put(sample,new PrintStream(temp));
|
||||
tempFiles.put(sample,temp);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error creating temporary file",e);
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
// only need to instantiate the files and buffers if in individual major.
|
||||
// Cut down on memory.
|
||||
try {
|
||||
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
|
||||
printMap.put(sample,new PrintStream(temp));
|
||||
tempFiles.put(sample,temp);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error creating temporary file",e);
|
||||
}
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
}
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
famOrder.add(sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
||||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
||||
if ( tracker == null ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
|
||||
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
vv.map(tracker,ref,context);
|
||||
} catch (UserException e) {
|
||||
validateVariantSite(vc,ref,context);
|
||||
} catch (TribbleException e) {
|
||||
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||
"Please run ValidateVariants for more detailed information.");
|
||||
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||
String refOut;
|
||||
String altOut;
|
||||
String vcRef = getReferenceAllele(vc);
|
||||
String vcAlt = getAlternateAllele(vc);
|
||||
boolean altMajor;
|
||||
if ( majorAlleleFirst ) {
|
||||
// want to use the major allele as ref
|
||||
|
|
@ -174,29 +189,42 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
||||
}
|
||||
if ( getAF(ats.get("AF")) > 0.5 ) {
|
||||
refOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altOut = vc.getReference().getBaseString();
|
||||
refOut = vcAlt;
|
||||
altOut = vcRef;
|
||||
altMajor = true;
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
// write an entry into the map file
|
||||
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
|
||||
refOut,altOut);
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
writeIndividualMajor(vc,altMajor);
|
||||
} else {
|
||||
writeSNPMajor(vc,altMajor);
|
||||
}
|
||||
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public void writeIndividualMajor(VariantContext vc, boolean altMajor) {
|
||||
// store genotypes per sample into the buffer
|
||||
for ( Genotype g : vc.getGenotypes() ) {
|
||||
++totalGenotypeCount;
|
||||
String sample = g.getSampleName();
|
||||
byte[] samBuf = genotypeBuffer.get(sample);
|
||||
byte enc = getEncoding(g,genotypeCount,altMajor);
|
||||
samBuf[byteCount] |= enc;
|
||||
}
|
||||
|
||||
genotypeCount++;
|
||||
if ( genotypeCount % 4 == 0 ) {
|
||||
byteCount++;
|
||||
|
|
@ -217,8 +245,30 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
genotypeCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
public void writeSNPMajor(VariantContext vc, boolean altMajor) {
|
||||
// for each sample, write the genotype into the bed file, in the
|
||||
// order of the fam file
|
||||
genotypeCount = 0;
|
||||
byteCount = 0;
|
||||
byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2
|
||||
for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) {
|
||||
byte enc = getEncoding(g,genotypeCount,altMajor);
|
||||
bytes[byteCount] |= enc;
|
||||
genotypeCount++;
|
||||
if ( genotypeCount % 4 == 0 ) {
|
||||
byteCount++;
|
||||
genotypeCount = 0;
|
||||
}
|
||||
}
|
||||
totalGenotypeCount += famOrder.size();
|
||||
totalByteCount += bytes.length;
|
||||
try {
|
||||
outBed.write(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error writing to output bed file",e);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduce(Integer m, Integer r) {
|
||||
|
|
@ -230,7 +280,15 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer numSites) {
|
||||
logger.info(String.format("%d sites processed!",numSites));
|
||||
logger.info(String.format("%d sites processed for a total of %d genotypes encoded in %d bytes",numSites,totalGenotypeCount,totalByteCount));
|
||||
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
mergeGenotypeTempFiles(numSites);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void mergeGenotypeTempFiles(int numSites) {
|
||||
// push out the remaining genotypes and close stream
|
||||
for ( String sample : printMap.keySet() ) {
|
||||
try {
|
||||
|
|
@ -262,18 +320,19 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
byte[] readGenotypes = new byte[BUFFER_SIZE];
|
||||
inStream.read(readGenotypes);
|
||||
outBed.write(readGenotypes);
|
||||
totalByteCount += BUFFER_SIZE;
|
||||
}
|
||||
if ( ttr > 0 ) {
|
||||
byte[] readGenotypes = new byte[ttr];
|
||||
inStream.read(readGenotypes);
|
||||
outBed.write(readGenotypes);
|
||||
totalByteCount += ttr;
|
||||
}
|
||||
inStream.close();
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error reading form temp file for input.",e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private byte getEncoding(Genotype g, int offset, boolean altMajor) {
|
||||
|
|
@ -286,8 +345,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private byte getStandardEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
|
||||
b = NO_CALL;
|
||||
if ( ! checkGQIsGood(g) ) {
|
||||
b = NO_CALL;
|
||||
} else if ( g.isHomRef() ) {
|
||||
b = HOM_REF;
|
||||
} else if ( g.isHomVar() ) {
|
||||
|
|
@ -322,10 +381,11 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
if ( genotype.hasGQ() ) {
|
||||
return genotype.getGQ() >= minGenotypeQuality;
|
||||
} else if ( genotype.hasLikelihoods() ) {
|
||||
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
|
||||
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
|
||||
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
|
||||
}
|
||||
|
||||
return false;
|
||||
return minGenotypeQuality <= 0;
|
||||
}
|
||||
|
||||
private static String getID(VariantContext v) {
|
||||
|
|
@ -346,17 +406,10 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void initializeValidator() {
|
||||
vv.variantCollection = variantCollection;
|
||||
vv.dbsnp = dbsnp;
|
||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||
vv.type = ValidateVariants.ValidationType.REF;
|
||||
}
|
||||
|
||||
private void writeBedHeader() {
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 0x0 : 0x1)});
|
||||
// ultimately, the bed will be in individual-major mode
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("error writing to output file.");
|
||||
|
|
@ -410,4 +463,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
return metaValues;
|
||||
}
|
||||
|
||||
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
|
||||
final Allele reportedRefAllele = vc.getReference();
|
||||
final int refLength = reportedRefAllele.length();
|
||||
if ( refLength > 100 ) {
|
||||
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
|
||||
return;
|
||||
}
|
||||
|
||||
final byte[] observedRefBases = new byte[refLength];
|
||||
System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
|
||||
final Allele observedRefAllele = Allele.create(observedRefBases);
|
||||
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||
vc.validateAlternateAlleles();
|
||||
}
|
||||
|
||||
private String getReferenceAllele(VariantContext vc) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
|
||||
return "1";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getReference().getBaseString().substring(1);
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getReference().getBaseString();
|
||||
}
|
||||
|
||||
private String getAlternateAllele(VariantContext vc ) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getAlternateAllele(0).getBaseString().substring(1);
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
|
||||
return "2";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getAlternateAllele(0).getBaseString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return records;
|
||||
}
|
||||
|
||||
private static void addFieldValue(Object val, List<List<String>> result) {
|
||||
private static void addFieldValue(final Object val, final List<List<String>> result) {
|
||||
final int numResultRecords = result.size();
|
||||
|
||||
// if we're trying to create a single output record, add it
|
||||
if ( numResultRecords == 1 ) {
|
||||
result.get(0).add(val.toString());
|
||||
result.get(0).add(prettyPrintObject(val));
|
||||
}
|
||||
// if this field is a list of the proper size, add the appropriate entry to each record
|
||||
else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
|
||||
|
|
@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private static String prettyPrintObject(final Object val) {
|
||||
if ( val instanceof List )
|
||||
return prettyPrintObject(((List)val).toArray());
|
||||
|
||||
if ( !val.getClass().isArray() )
|
||||
return val.toString();
|
||||
|
||||
final int length = Array.getLength(val);
|
||||
if ( length == 0 )
|
||||
return "";
|
||||
|
||||
final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0)));
|
||||
for ( int i = 1; i < length; i++ ) {
|
||||
sb.append(",");
|
||||
sb.append(prettyPrintObject(Array.get(val, i)));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
return extractFields(vc, fields, null, null, allowMissingData, false);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils;
|
|||
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
|
@ -198,7 +199,9 @@ public class BaseUtils {
|
|||
* @param base [AaCcGgTt]
|
||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
||||
*/
|
||||
static public int simpleBaseToBaseIndex(byte base) {
|
||||
static public int simpleBaseToBaseIndex(final byte base) {
|
||||
if ( base < 0 || base >= 256 )
|
||||
throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)");
|
||||
return baseIndexMap[base];
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,6 +58,12 @@ public class MathUtils {
|
|||
private static final int MAXN = 50000;
|
||||
private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
|
||||
|
||||
/**
|
||||
* The smallest log10 value we'll emit from normalizeFromLog10 and other functions
|
||||
* where the real-space value is 0.0.
|
||||
*/
|
||||
public final static double LOG10_P_OF_ZERO = -1000000.0;
|
||||
|
||||
static {
|
||||
log10Cache = new double[LOG10_CACHE_SIZE];
|
||||
log10FactorialCache = new double[LOG10_CACHE_SIZE];
|
||||
|
|
@ -572,16 +578,26 @@ public class MathUtils {
|
|||
return normalizeFromLog10(array, takeLog10OfOutput, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space
|
||||
*
|
||||
* @param array
|
||||
* @param takeLog10OfOutput
|
||||
* @param keepInLogSpace
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) {
|
||||
|
||||
// for precision purposes, we need to add (or really subtract, since they're
|
||||
// all negative) the largest value; also, we need to convert to normal-space.
|
||||
double maxValue = arrayMax(array);
|
||||
|
||||
// we may decide to just normalize in log space without converting to linear space
|
||||
if (keepInLogSpace) {
|
||||
for (int i = 0; i < array.length; i++)
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
array[i] -= maxValue;
|
||||
array[i] = Math.max(array[i], LOG10_P_OF_ZERO);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
|
|
@ -598,7 +614,8 @@ public class MathUtils {
|
|||
for (int i = 0; i < array.length; i++) {
|
||||
double x = normalized[i] / sum;
|
||||
if (takeLog10OfOutput)
|
||||
x = Math.log10(x);
|
||||
x = Math.max(Math.log10(x), LOG10_P_OF_ZERO);
|
||||
|
||||
normalized[i] = x;
|
||||
}
|
||||
|
||||
|
|
@ -1666,4 +1683,36 @@ public class MathUtils {
|
|||
return result;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a series of integer values between start and stop, inclusive,
|
||||
* expontentially distributed between the two. That is, if there are
|
||||
* ten values between 0-10 there will be 10 between 10-100.
|
||||
*
|
||||
* WARNING -- BADLY TESTED
|
||||
* @param start
|
||||
* @param stop
|
||||
* @param eps
|
||||
* @return
|
||||
*/
|
||||
public static List<Integer> log10LinearRange(final int start, final int stop, final double eps) {
|
||||
final LinkedList<Integer> values = new LinkedList<Integer>();
|
||||
final double log10range = Math.log10(stop - start);
|
||||
|
||||
if ( start == 0 )
|
||||
values.add(0);
|
||||
|
||||
double i = 0.0;
|
||||
while ( i <= log10range ) {
|
||||
final int index = (int)Math.round(Math.pow(10, i)) + start;
|
||||
if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) )
|
||||
values.add(index);
|
||||
i += eps;
|
||||
}
|
||||
|
||||
if ( values.peekLast() == null || values.peekLast() != stop )
|
||||
values.add(stop);
|
||||
|
||||
return values;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -236,6 +236,33 @@ public class Utils {
|
|||
}
|
||||
}
|
||||
|
||||
public static <T> List<T> append(final List<T> left, T ... elts) {
|
||||
final List<T> l = new LinkedList<T>(left);
|
||||
l.addAll(Arrays.asList(elts));
|
||||
return l;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the values in joined by separator, such as A,B,C
|
||||
*
|
||||
* @param separator
|
||||
* @param doubles
|
||||
* @return
|
||||
*/
|
||||
public static String join(String separator, double[] doubles) {
|
||||
if ( doubles == null || doubles.length == 0)
|
||||
return "";
|
||||
else {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(doubles[0]);
|
||||
for (int i = 1; i < doubles.length; ++i) {
|
||||
ret.append(separator);
|
||||
ret.append(doubles[i]);
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
|
||||
* elti objects (note there's no actual space between sep and the elti elements). Returns
|
||||
|
|
|
|||
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.collections;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put())
|
||||
* to the provided output stream. For testing/debugging purposes.
|
||||
*
|
||||
* Log entries are of the following form (fields are tab-separated):
|
||||
* LABEL VALUE KEY1 KEY2 ... KEY_N
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {
|
||||
|
||||
private PrintStream log;
|
||||
private String logEntryLabel;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param log output stream to which to log update operations
|
||||
* @param logEntryLabel String that should be prefixed to each log entry
|
||||
* @param dimensions
|
||||
*/
|
||||
public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) {
|
||||
super(dimensions);
|
||||
|
||||
if ( log == null ) {
|
||||
throw new ReviewedStingException("Log output stream must not be null");
|
||||
}
|
||||
this.log = log;
|
||||
this.logEntryLabel = logEntryLabel != null ? logEntryLabel : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put( final T value, final int... keys ) {
|
||||
super.put(value, keys);
|
||||
|
||||
StringBuilder logEntry = new StringBuilder();
|
||||
|
||||
logEntry.append(logEntryLabel);
|
||||
logEntry.append("\t");
|
||||
logEntry.append(value);
|
||||
for ( int key : keys ) {
|
||||
logEntry.append("\t");
|
||||
logEntry.append(key);
|
||||
}
|
||||
|
||||
// PrintStream methods all use synchronized blocks internally, so our logging is thread-safe
|
||||
log.println(logEntry.toString());
|
||||
}
|
||||
}
|
||||
|
|
@ -129,6 +129,12 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
}
|
||||
|
||||
public static class LocalParallelizationProblem extends UserException {
|
||||
public LocalParallelizationProblem(final File file) {
|
||||
super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath()));
|
||||
}
|
||||
}
|
||||
|
||||
public static class NotEnoughMemory extends UserException {
|
||||
public NotEnoughMemory() {
|
||||
super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java"));
|
||||
|
|
@ -160,6 +166,10 @@ public class UserException extends ReviewedStingException {
|
|||
super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message));
|
||||
}
|
||||
|
||||
public CouldNotReadInputFile(String file, String message) {
|
||||
super(String.format("Couldn't read file %s because %s", file, message));
|
||||
}
|
||||
|
||||
public CouldNotReadInputFile(File file, String message, Exception e) {
|
||||
super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e)));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,9 +25,12 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Utility class to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
|
|
@ -52,19 +55,31 @@ public class RecalibrationTables {
|
|||
private final NestedIntegerArray[] tables;
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates) {
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1);
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final PrintStream log) {
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) {
|
||||
this(covariates, numReadGroups, null);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) {
|
||||
tables = new NestedIntegerArray[covariates.length];
|
||||
|
||||
final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1;
|
||||
final int eventDimension = EventType.values().length;
|
||||
|
||||
tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension);
|
||||
tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension);
|
||||
tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, "READ_GROUP_TABLE", numReadGroups, eventDimension);
|
||||
tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension);
|
||||
for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++)
|
||||
tables[i] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
|
||||
tables[i] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1),
|
||||
numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
|
||||
}
|
||||
|
||||
public NestedIntegerArray<RecalDatum> getReadGroupTable() {
|
||||
|
|
|
|||
|
|
@ -288,6 +288,24 @@ public abstract class Genotype implements Comparable<Genotype> {
|
|||
return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are all likelihoods for this sample non-informative?
|
||||
*
|
||||
* Returns true if all PLs are 0 => 0,0,0 => true
|
||||
* 0,0,0,0,0,0 => true
|
||||
* 0,10,100 => false
|
||||
*
|
||||
* @return true if all samples PLs are equal and == 0
|
||||
*/
|
||||
public boolean isNonInformative() {
|
||||
for ( final int PL : getPL() ) {
|
||||
if ( PL != 0 )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsafe low-level accessor the PL field itself, may be null.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1517,15 +1517,32 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
return best;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup the index of allele in this variant context
|
||||
*
|
||||
* @param allele the allele whose index we want to get
|
||||
* @return the index of the allele into getAlleles(), or -1 if it cannot be found
|
||||
*/
|
||||
public int getAlleleIndex(final Allele allele) {
|
||||
return getAlleles().indexOf(allele);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the allele index #getAlleleIndex for each allele in alleles
|
||||
*
|
||||
* @param alleles the alleles we want to look up
|
||||
* @return a list of indices for each allele, in order
|
||||
*/
|
||||
public List<Integer> getAlleleIndices(final Collection<Allele> alleles) {
|
||||
final List<Integer> indices = new LinkedList<Integer>();
|
||||
for ( final Allele allele : alleles )
|
||||
indices.add(getAlleleIndex(allele));
|
||||
return indices;
|
||||
}
|
||||
|
||||
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
|
||||
|
||||
int index = 1;
|
||||
for ( Allele allele : getAlternateAlleles() ) {
|
||||
if ( allele.equals(targetAllele) )
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
|
||||
final int index = getAlleleIndex(targetAllele);
|
||||
if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this);
|
||||
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -157,11 +157,8 @@ public class VariantContextUtils {
|
|||
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
|
||||
}
|
||||
|
||||
public static Genotype removePLs(Genotype g) {
|
||||
if ( g.hasLikelihoods() )
|
||||
return new GenotypeBuilder(g).noPL().make();
|
||||
else
|
||||
return g;
|
||||
public static Genotype removePLsAndAD(final Genotype g) {
|
||||
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
|
||||
}
|
||||
|
||||
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
|
||||
|
|
@ -573,7 +570,7 @@ public class VariantContextUtils {
|
|||
}
|
||||
|
||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
if (vc.alleles.size() == 1)
|
||||
continue;
|
||||
|
|
@ -581,7 +578,7 @@ public class VariantContextUtils {
|
|||
if ( ! genotypes.isEmpty() )
|
||||
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
||||
genotypes = stripPLs(genotypes);
|
||||
genotypes = stripPLsAndAD(genotypes);
|
||||
// this will remove stale AC,AF attributed from vc
|
||||
calculateChromosomeCounts(vc, attributes, true);
|
||||
break;
|
||||
|
|
@ -672,11 +669,11 @@ public class VariantContextUtils {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
||||
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
|
||||
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
||||
|
||||
for ( final Genotype g : genotypes ) {
|
||||
newGs.add(g.hasLikelihoods() ? removePLs(g) : g);
|
||||
newGs.add(removePLsAndAD(g));
|
||||
}
|
||||
|
||||
return newGs;
|
||||
|
|
@ -1343,10 +1340,7 @@ public class VariantContextUtils {
|
|||
|
||||
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
|
||||
|
||||
// TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
|
||||
|
||||
// see whether we need to trim common reference base from all alleles
|
||||
|
||||
final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false);
|
||||
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
|
||||
return inputVC;
|
||||
|
|
|
|||
|
|
@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
else if ( val instanceof List ) {
|
||||
result = formatVCFField(((List)val).toArray());
|
||||
} else if ( val.getClass().isArray() ) {
|
||||
int length = Array.getLength(val);
|
||||
final int length = Array.getLength(val);
|
||||
if ( length == 0 )
|
||||
return formatVCFField(null);
|
||||
StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0)));
|
||||
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
|
||||
for ( int i = 1; i < length; i++) {
|
||||
sb.append(",");
|
||||
sb.append(formatVCFField(Array.get(val, i)));
|
||||
|
|
|
|||
|
|
@ -1,127 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static final int numSamples = 3;
|
||||
static double[] priors = new double[2*numSamples+1]; // flat priors
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
AA1 = new double[]{0.0, -20.0, -20.0};
|
||||
AB1 = new double[]{-20.0, 0.0, -20.0};
|
||||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
|
||||
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls) {
|
||||
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic case
|
||||
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
|
||||
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
|
||||
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
|
||||
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
|
||||
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
|
||||
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
|
||||
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
|
||||
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
|
||||
|
||||
// tri-allelic case
|
||||
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
|
||||
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
|
||||
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
|
||||
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeGLs() {
|
||||
|
||||
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMismatchedGLs() {
|
||||
|
||||
final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
|
||||
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
|
||||
}
|
||||
}
|
||||
|
|
@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("cafd404f1b4f53586f7aa7a7084b91da"));
|
||||
Arrays.asList("fe9c0e9e4b4ee4677145748cdd2285ff"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("9a760dffbb299bda4934bcb4f7aad42a"));
|
||||
Arrays.asList("bc15123620e1134f799005d71d1180fe"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testWithAllelesPassedIn2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
|
||||
Arrays.asList("8391146877aa7801ffdb3aa954bf2965"));
|
||||
Arrays.asList("1ba7afccc8552f20d72d0b62237558e3"));
|
||||
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("85b79ff7910f218dd59595d03ffe6ccc"));
|
||||
Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc"));
|
||||
executeTest("test SingleSample Pilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("cceb34ffbd2dbc45b8821f86ea255284"));
|
||||
Arrays.asList("772e14d8c908044c04053d204bad69ef"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("00f54a0097e710c0f7b001444c237e32"));
|
||||
Arrays.asList("1fb69aa3857e921191997daa73f1b687"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("b3fae6bf4c620458f4259dbc93125e37"));
|
||||
Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "712e87db5e278e92bd36e96d377303c6";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
|
||||
|
||||
String md5 = "306943dd63111e2e64388cd2e2de6c01";
|
||||
String md5 = "360d1274c1072a1ae9868e4e106c2650";
|
||||
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
|
||||
|
|
@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinBaseQualityScore() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
|
||||
Arrays.asList("f73dec2e77f14c170f7b6a8eee5793ff"));
|
||||
Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c"));
|
||||
executeTest("test min_base_quality_score 26", spec);
|
||||
}
|
||||
|
||||
|
|
@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSLOD() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("da7a5a3aa1c9f401896c34199c535954"));
|
||||
Arrays.asList("c7429e670ba477bf9a6bbee2fb41c5a9"));
|
||||
executeTest("test SLOD", spec);
|
||||
}
|
||||
|
||||
|
|
@ -163,7 +163,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNDA() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("07f5962f790673a1299f3a0f56579b65"));
|
||||
Arrays.asList("abd8e33e649cc11b55e200d3940cc7e2"));
|
||||
executeTest("test NDA", spec);
|
||||
}
|
||||
|
||||
|
|
@ -171,23 +171,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testCompTrack() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
|
||||
Arrays.asList("22037eac40a3b1df3086c2d7b27f0d5f"));
|
||||
Arrays.asList("8a9b424e00cdbe6b5e73d517335b2186"));
|
||||
executeTest("test using comp track", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterSitesOnly() {
|
||||
testOutputParameters("-sites_only", "92db524b334f1416e595c711abc2d798");
|
||||
testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
|
||||
Arrays.asList("7326eb84d8418546a408b68839a0a47e"));
|
||||
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
|
||||
executeTest("test confidence 1", spec1);
|
||||
}
|
||||
|
||||
|
|
@ -209,7 +209,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testConfidence2() {
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
|
||||
Arrays.asList("7326eb84d8418546a408b68839a0a47e"));
|
||||
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
|
||||
executeTest("test confidence 2", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testHeterozyosity1() {
|
||||
testHeterozosity( 0.01, "7aed8361e692eff559e6bca88752db0d" );
|
||||
testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeterozyosity2() {
|
||||
testHeterozosity( 1.0 / 1850, "989e65bb7337117d31cd615163a8ac84" );
|
||||
testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" );
|
||||
}
|
||||
|
||||
private void testHeterozosity(final double arg, final String md5) {
|
||||
|
|
@ -249,7 +249,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("c155587aa0410f43d7ccc57e1ae09a68"));
|
||||
Arrays.asList("5d19e3077e0cabbb364f68676a09ebe0"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -268,7 +268,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("0748a711c6154f8d85847afb79aead94"));
|
||||
Arrays.asList("8a1931095f70523ad11cb99b30df7b84"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("6aa034f669ec09ac4f5a28624cbe1830"));
|
||||
Arrays.asList("64a491b5276fd5d1cd04260ea3e63cf7"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("ba7a011d0c665acc4455d58a6ab28716"));
|
||||
Arrays.asList("f63a8b8061e6c5999408d34798061895"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("4f7d80f4f53ef0f0959414cb30097482"));
|
||||
Arrays.asList("c9d684ff2f2a9083480db6e962d612a9"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -325,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("95986d0c92436d3b9c1f1be9c768a368"));
|
||||
Arrays.asList("833fd97c6f32d7af6c9c088a78e51f68"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
|
|
@ -335,7 +335,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("cecd3e35a817e299e97e8f7bbf083d2c"));
|
||||
Arrays.asList("95b73c24c68dc475516571d9f49dfb1e"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
|
|
@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSampleIndels1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("af04b81f0548ca22b8d1f6bf223b336e"));
|
||||
Arrays.asList("3bdbf48de30bac58f3bcbc5bf3aa63aa"));
|
||||
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("c7792e27477ecf99893a76ecbac5c2f9"));
|
||||
Arrays.asList("beee9457d7cea42006ac45400db5e873"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 20:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("59ff26d7e5ca2503ebe9f74902251551"));
|
||||
Arrays.asList("945a2f994eaced8efdf8de24b58f2680"));
|
||||
|
||||
executeTest(String.format("test UG with base indel quality scores"), spec);
|
||||
}
|
||||
|
|
@ -405,7 +405,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction0() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
|
||||
Arrays.asList("f99f9a917529bfef717fad97f725d5df"));
|
||||
Arrays.asList("ba4fafec383fb988f20c8cf53dd3e9a0"));
|
||||
executeTest("test minIndelFraction 0.0", spec);
|
||||
}
|
||||
|
||||
|
|
@ -413,7 +413,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction25() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
|
||||
Arrays.asList("eac2cd649bd5836068350eb4260aaea7"));
|
||||
Arrays.asList("4c57a88de275105156aaafc6f9041365"));
|
||||
executeTest("test minIndelFraction 0.25", spec);
|
||||
}
|
||||
|
||||
|
|
@ -435,7 +435,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testNsInCigar() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1,
|
||||
Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29"));
|
||||
Arrays.asList("e8ebfaac0804b782f22ab8ea35152735"));
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("bbf16e1873e525ee5975021cfb8988cf"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d"));
|
||||
executeTest("test hg18 to hg19, unsorted", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLiftoverFilteringOfIndels() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("0909a953291a5e701194668c9b8833ab"));
|
||||
executeTest("test liftover filtering of indels", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4")
|
||||
Arrays.asList("549321a2543608f214ab4893ab478be6")
|
||||
);
|
||||
|
||||
executeTest("testRegenotype--" + testFile, spec);
|
||||
|
|
@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4")
|
||||
Arrays.asList("549321a2543608f214ab4893ab478be6")
|
||||
);
|
||||
|
||||
executeTest("testRemoveMLEAndRegenotype--" + testFile, spec);
|
||||
|
|
@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823")
|
||||
Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea")
|
||||
);
|
||||
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) {
|
||||
return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode +
|
||||
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
|
||||
" -bim %s -fam %s -bed %s";
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878Alone() {
|
||||
String testName = "testNA12878Alone";
|
||||
|
|
@ -52,6 +59,72 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878AloneSNPMajor() {
|
||||
String testName = "testNA12878AloneSNPMajor";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"),
|
||||
3,
|
||||
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446")
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878HighGQ() {
|
||||
String testName = "testNA12878HighGQ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCFMismatchReference() {
|
||||
String testName = "testVCFMismatchReference";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
UserException.class
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000GWithIndels() {
|
||||
String testName = "test1000GWithIndels";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000GWithIndelsSNPMajor() {
|
||||
String testName = "test1000GWithIndelsSNPMajor";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"),
|
||||
3,
|
||||
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000G_Symbolic() {
|
||||
String testName = "test1000G_Symbolic";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCEUTrio() {
|
||||
String testName = "testCEUTrio";
|
||||
|
|
@ -112,6 +185,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testMultiAllelicOneRecord() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
|
||||
Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
|
||||
Arrays.asList("0ff49c08690f61a38614606a090f23ea"));
|
||||
executeTest("testMultiAllelicOneRecord", spec);
|
||||
}
|
||||
|
||||
|
|
@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
|||
executeTest("testGenotypeFieldsWithInline", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testListFields() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" --variant " + privateTestDir + "vcfexample.withMLE.vcf" +
|
||||
" -T VariantsToTable" +
|
||||
" -GF PL" +
|
||||
" -o %s",
|
||||
1,
|
||||
Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e"));
|
||||
executeTest("testGenotypeFields", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMoltenOutput() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
|
|||
for ( final int nct : Arrays.asList(1, 2) ) {
|
||||
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
|
||||
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "081d077786ac0af24e9f97259a55209c", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct });
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript {
|
|||
var bamFile: File = _
|
||||
|
||||
def script() {
|
||||
val ug = new UnifiedGenotyper with RetryMemoryLimit
|
||||
// First run with 1m
|
||||
ug.memoryLimit = .001
|
||||
// On retry run with 1g
|
||||
ug.retryMemoryFunction = (d => d * 1000)
|
||||
ug.reference_sequence = referenceFile
|
||||
ug.input_file = Seq(bamFile)
|
||||
add(ug)
|
||||
for (scatterCount <- 1 to 2) {
|
||||
val ug = new UnifiedGenotyper with RetryMemoryLimit
|
||||
// First run with 1m
|
||||
ug.memoryLimit = .001
|
||||
// On retry run with 1g
|
||||
ug.retryMemoryFunction = (d => d * 1000)
|
||||
ug.reference_sequence = referenceFile
|
||||
ug.input_file = Seq(bamFile)
|
||||
ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount))
|
||||
ug.scatterCount = scatterCount
|
||||
add(ug)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
|
|||
private def createQueueHeader() : Seq[String] = {
|
||||
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
|
||||
"Copyright (c) 2012 The Broad Institute",
|
||||
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
"For support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
}
|
||||
|
||||
private def getQueueVersion : String = {
|
||||
|
|
|
|||
|
|
@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk
|
|||
|
||||
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
|
||||
import org.broadinstitute.sting.queue.function.QFunction
|
||||
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor
|
||||
|
||||
/**
|
||||
* Merges BAM files using net.sf.picard.sam.MergeSamFiles.
|
||||
*/
|
||||
class BamGatherFunction extends GatherFunction with PicardBamFunction {
|
||||
class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit {
|
||||
this.javaMainClass = "net.sf.picard.sam.MergeSamFiles"
|
||||
this.assumeSorted = Some(true)
|
||||
protected def inputBams = gatherParts
|
||||
protected def outputBam = originalOutput
|
||||
|
||||
override def freezeFieldValues {
|
||||
override def freezeFieldValues() {
|
||||
val originalGATK = originalFunction.asInstanceOf[CommandLineGATK]
|
||||
|
||||
// Whatever the original function can handle, merging *should* do less.
|
||||
|
|
|
|||
|
|
@ -25,13 +25,13 @@
|
|||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
|
||||
import org.broadinstitute.sting.queue.function.QFunction
|
||||
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
|
||||
|
||||
/**
|
||||
* Merges a vcf text file.
|
||||
*/
|
||||
class VcfGatherFunction extends CombineVariants with GatherFunction {
|
||||
class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit {
|
||||
this.assumeIdenticalSamples = true
|
||||
this.suppressCommandLineHeader = true
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun
|
|||
override def freezeFieldValues() {
|
||||
super.freezeFieldValues()
|
||||
if (outputIndex == null && output != null)
|
||||
outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
|
||||
outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
package org.broadinstitute.sting.queue.function
|
||||
|
||||
import org.broadinstitute.sting.queue.util._
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
|
||||
/**
|
||||
* A command line that will be run in a pipeline.
|
||||
|
|
@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging {
|
|||
def commandLine: String
|
||||
|
||||
/** Upper memory limit */
|
||||
@Argument(doc="Memory limit", required=false)
|
||||
var memoryLimit: Option[Double] = None
|
||||
|
||||
/** Resident memory limit */
|
||||
@Argument(doc="Resident memory limit", required=false)
|
||||
var residentLimit: Option[Double] = None
|
||||
|
||||
/** Resident memory request */
|
||||
@Argument(doc="Resident memory request", required=false)
|
||||
var residentRequest: Option[Double] = None
|
||||
|
||||
/** the number of SMP cores this job wants */
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
|
|||
/**
|
||||
* Memory limit for the java executable, or if None will use the default memoryLimit.
|
||||
*/
|
||||
@Argument(doc="Java memory limit", required=false)
|
||||
var javaMemoryLimit: Option[Double] = None
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport {
|
|||
var jobErrorFile: File = _
|
||||
|
||||
/** Errors (if any) from the last failed run of jobErrorFiles. */
|
||||
@Argument(doc="Job error lines", required=false)
|
||||
var jobErrorLines: Seq[String] = Nil
|
||||
|
||||
/**
|
||||
* The number of times this function has previously been run.
|
||||
*/
|
||||
@Argument(doc="Job retries", required=false)
|
||||
var retries = 0
|
||||
|
||||
/** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */
|
||||
|
|
@ -541,4 +543,11 @@ object QFunction {
|
|||
classFields
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Seq of fields for a QFunction class.
|
||||
* @param clazz Class to retrieve fields for.
|
||||
* @return the fields of the class.
|
||||
*/
|
||||
def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,17 +24,26 @@
|
|||
|
||||
package org.broadinstitute.sting.queue.function
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
|
||||
object RetryMemoryLimit {
|
||||
private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ )
|
||||
private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
|
||||
}
|
||||
|
||||
/** A mixin that on retry increases the memory limit when certain text is found. */
|
||||
trait RetryMemoryLimit extends CommandLineFunction {
|
||||
|
||||
/** How to increase the memory. By default doubles the memory. */
|
||||
var retryMemoryFunction: (Double => Double) = (2 * _)
|
||||
var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction
|
||||
|
||||
/** Once the threshold is passed, no more memory will be added to memory limit. */
|
||||
@Argument(doc="threshold to stop doubling the memory", required=false)
|
||||
var memoryLimitThreshold: Option[Double] = None
|
||||
|
||||
/** Various strings to look for to determine we ran out of memory. */
|
||||
var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
|
||||
@Argument(doc="text to look for in the errors", required = false)
|
||||
var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText
|
||||
|
||||
override def freezeFieldValues() {
|
||||
super.freezeFieldValues()
|
||||
|
|
@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction {
|
|||
this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold
|
||||
}
|
||||
|
||||
|
||||
override def copySettingsTo(function: QFunction) {
|
||||
super.copySettingsTo(function)
|
||||
function match {
|
||||
case retryMemoryLimit: RetryMemoryLimit =>
|
||||
if (retryMemoryLimit.memoryLimitThreshold.isEmpty)
|
||||
retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold
|
||||
if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction)
|
||||
retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction
|
||||
if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText)
|
||||
retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText
|
||||
case _ => /* ignore */
|
||||
}
|
||||
}
|
||||
|
||||
override def setupRetry() {
|
||||
super.setupRetry()
|
||||
if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) {
|
||||
|
|
|
|||
|
|
@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction}
|
|||
/**
|
||||
* Shadow clones another command line function.
|
||||
*/
|
||||
object CloneFunction {
|
||||
private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction])
|
||||
}
|
||||
|
||||
class CloneFunction extends CommandLineFunction {
|
||||
var originalFunction: ScatterGatherableFunction = _
|
||||
var cloneIndex: Int = _
|
||||
|
|
@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction {
|
|||
var originalValues = Map.empty[ArgumentSource, Any]
|
||||
withScatterPartCount += 1
|
||||
if (withScatterPartCount == 1) {
|
||||
overriddenFields.foreach{
|
||||
case (field, overrideValue) => {
|
||||
originalFunction.functionFields.foreach {
|
||||
case (field) => {
|
||||
originalValues += field -> originalFunction.getFieldValue(field)
|
||||
originalFunction.setFieldValue(field, overrideValue)
|
||||
originalFunction.setFieldValue(field, getFieldValue(field))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -52,9 +56,11 @@ class CloneFunction extends CommandLineFunction {
|
|||
f()
|
||||
} finally {
|
||||
if (withScatterPartCount == 1) {
|
||||
originalValues.foreach{
|
||||
case (name, value) =>
|
||||
originalFunction.setFieldValue(name, value)
|
||||
originalFunction.functionFields.foreach {
|
||||
case (field) => {
|
||||
setFieldValue(field, originalFunction.getFieldValue(field))
|
||||
originalFunction.setFieldValue(field, originalValues(field))
|
||||
}
|
||||
}
|
||||
}
|
||||
withScatterPartCount -= 1
|
||||
|
|
@ -63,6 +69,8 @@ class CloneFunction extends CommandLineFunction {
|
|||
|
||||
override def description = withScatterPart(() => originalFunction.description)
|
||||
override def shortDescription = withScatterPart(() => originalFunction.shortDescription)
|
||||
override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) }
|
||||
|
||||
override protected def functionFieldClass = originalFunction.getClass
|
||||
|
||||
def commandLine = withScatterPart(() => originalFunction.commandLine)
|
||||
|
|
@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction {
|
|||
}
|
||||
|
||||
override def getFieldValue(source: ArgumentSource): AnyRef = {
|
||||
overriddenFields.get(source) match {
|
||||
case Some(value) => value.asInstanceOf[AnyRef]
|
||||
case None => {
|
||||
val value = originalFunction.getFieldValue(source)
|
||||
overriddenFields += source -> value
|
||||
value
|
||||
}
|
||||
CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match {
|
||||
case Some(cloneSource) =>
|
||||
super.getFieldValue(cloneSource)
|
||||
case None =>
|
||||
overriddenFields.get(source) match {
|
||||
case Some(value) =>
|
||||
value.asInstanceOf[AnyRef]
|
||||
case None => {
|
||||
val value = originalFunction.getFieldValue(source)
|
||||
overriddenFields += source -> value
|
||||
value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue