Merge branch 'master' of gsa2:/humgen/gsa-scr1/chartl/dev/unstable
This commit is contained in:
commit
6b9987cf1b
|
|
@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumInsertionQuals);
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
|
||||
public byte averageDeletionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
|
||||
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,26 +41,26 @@ import java.util.Map;
|
|||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
|
|
@ -69,14 +69,14 @@ import java.util.Map;
|
|||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
|
|
@ -84,52 +84,48 @@ import java.util.Map;
|
|||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(byte base) {
|
||||
public int getCount(final byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(BaseIndex base) {
|
||||
public int getCount(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(byte base) {
|
||||
public long getSumQuals(final byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(BaseIndex base) {
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(byte base) {
|
||||
public byte averageQuals(final byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(BaseIndex base) {
|
||||
public byte averageQuals(final BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfMostCommonBase() {
|
||||
return counts.get(baseIndexWithMostCounts());
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfMostCommonBase() {
|
||||
return sumQuals.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfMostCommonBase() {
|
||||
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
|
||||
public byte averageQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (sumQualsOfBase(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -149,7 +145,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(byte base) {
|
||||
public double baseCountProportion(final byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
}
|
||||
|
||||
|
|
@ -160,7 +156,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(BaseIndex baseIndex) {
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
|
|
@ -177,22 +173,28 @@ import java.util.Map;
|
|||
return b.toString();
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex i : counts.keySet())
|
||||
if (hasHigherCount(i, maxI))
|
||||
maxI = i;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide() && hasHigherCount(index, mostCounts))
|
||||
mostCounts = index;
|
||||
return mostCounts;
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
|
||||
|
|
@ -201,6 +203,30 @@ import java.util.Map;
|
|||
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
|
||||
}
|
||||
|
||||
public byte baseWithMostProbability() {
|
||||
return baseIndexWithMostProbability().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
|
|
@ -218,8 +244,8 @@ import java.util.Map;
|
|||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(BaseIndex index) {
|
||||
int total = totalCountWithoutIndels();
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex index) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
|
|
|
|||
|
|
@ -182,7 +182,7 @@ public class HeaderElement {
|
|||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ public class SlidingWindow {
|
|||
|
||||
private final int nContigs;
|
||||
|
||||
private boolean allowPolyploidReduction;
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
|
||||
/**
|
||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||
|
|
@ -117,7 +117,7 @@ public class SlidingWindow {
|
|||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.nContigs = nContigs;
|
||||
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -207,8 +207,9 @@ public class SlidingWindow {
|
|||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < getStartLocation(windowHeader)) {
|
||||
final int windowHeaderStartLoc = getStartLocation(windowHeader);
|
||||
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < windowHeaderStartLoc) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
|
|
@ -291,7 +292,7 @@ public class SlidingWindow {
|
|||
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
|
||||
|
||||
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
|
||||
addToFilteredData(header, start, endOfFilteredData, isNegativeStrand);
|
||||
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
|
||||
|
||||
if (endOfFilteredData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
|
||||
|
|
@ -418,7 +419,9 @@ public class SlidingWindow {
|
|||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
|
||||
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
|
||||
|
|
@ -434,8 +437,15 @@ public class SlidingWindow {
|
|||
if (!headerElement.hasFilteredData())
|
||||
throw new ReviewedStingException("No filtered data in " + index);
|
||||
|
||||
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
|
||||
result.add(finalizeFilteredDataConsensus());
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
}
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -472,15 +482,15 @@ public class SlidingWindow {
|
|||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
BaseIndex base = baseCounts.baseIndexWithMostCounts();
|
||||
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfMostCommonBase();
|
||||
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
|
||||
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
|
||||
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
|
||||
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfBase(base);
|
||||
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> compressVariantRegion(int start, int stop) {
|
||||
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// Try to compress into a polyploid consensus
|
||||
|
|
@ -490,7 +500,8 @@ public class SlidingWindow {
|
|||
boolean foundEvent = false;
|
||||
Object[] header = windowHeader.toArray();
|
||||
|
||||
if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction
|
||||
// foundEvent will remain false if we don't allow polyploid reduction
|
||||
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
|
||||
for (int i = start; i<=stop; i++) {
|
||||
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
|
||||
if (nHaplotypes > nContigs) {
|
||||
|
|
@ -512,9 +523,6 @@ public class SlidingWindow {
|
|||
}
|
||||
}
|
||||
|
||||
int refStart = windowHeader.get(start).getLocation();
|
||||
int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
// Try to compress the variant region
|
||||
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
|
||||
if (canCompress && foundEvent) {
|
||||
|
|
@ -524,6 +532,9 @@ public class SlidingWindow {
|
|||
// Return all reads that overlap the variant region and remove them from the window header entirely
|
||||
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
|
||||
else {
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) {
|
||||
if (read.getSoftStart() <= refStop) {
|
||||
|
|
@ -549,8 +560,8 @@ public class SlidingWindow {
|
|||
* @return all reads contained in the variant region plus any adjacent synthetic reads
|
||||
*/
|
||||
@Requires("start <= stop")
|
||||
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
|
||||
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop);
|
||||
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
|
||||
|
||||
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
|
||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
|
|
@ -570,7 +581,7 @@ public class SlidingWindow {
|
|||
if (stop < 0 && forceClose)
|
||||
stop = windowHeader.size() - 1;
|
||||
if (stop >= 0) {
|
||||
allReads.addAll(closeVariantRegion(start, stop));
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
lastStop = stop;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ public class SyntheticRead {
|
|||
private String contig;
|
||||
private int contigIndex;
|
||||
private String readName;
|
||||
private Integer refStart;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private boolean isNegativeStrand = false;
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ public class SyntheticRead {
|
|||
* @param refStart the alignment start (reference based)
|
||||
* @param readTag the reduce reads tag for the synthetic read
|
||||
*/
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
final int initialCapacity = 10000;
|
||||
bases = new ArrayList<BaseIndex>(initialCapacity);
|
||||
counts = new ArrayList<Byte>(initialCapacity);
|
||||
|
|
@ -80,7 +80,7 @@ public class SyntheticRead {
|
|||
this.isNegativeStrand = isNegativeRead;
|
||||
}
|
||||
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
this.bases = bases;
|
||||
this.counts = counts;
|
||||
this.quals = quals;
|
||||
|
|
@ -115,11 +115,15 @@ public class SyntheticRead {
|
|||
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
||||
public BaseIndex getBase(int readCoordinate) {
|
||||
public BaseIndex getBase(final int readCoordinate) {
|
||||
return bases.get(readCoordinate);
|
||||
}
|
||||
|
||||
/**
|
||||
public int getRefStart() {
|
||||
return refStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
|
||||
*
|
||||
* Invalid reads are :
|
||||
|
|
|
|||
|
|
@ -0,0 +1,228 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.SimpleLayout;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 10/2/12
|
||||
* Time: 10:25 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ExactAFCalculationPerformanceTest {
|
||||
final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class);
|
||||
|
||||
private static abstract class Analysis {
|
||||
final GATKReport report;
|
||||
|
||||
public Analysis(final String name, final List<String> columns) {
|
||||
report = GATKReport.newSimpleReport(name, columns);
|
||||
}
|
||||
|
||||
public abstract void run(final ExactAFCalculationTestBuilder testBuilder,
|
||||
final List<Object> coreColumns);
|
||||
|
||||
public String getName() {
|
||||
return getTable().getTableName();
|
||||
}
|
||||
|
||||
public GATKReportTable getTable() {
|
||||
return report.getTables().iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByACAndPL extends Analysis {
|
||||
public AnalyzeByACAndPL(final List<String> columns) {
|
||||
super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
int otherAC = 0;
|
||||
int nAltSeg = 0;
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
nAltSeg += ACs[i] > 0 ? 1 : 0;
|
||||
if ( i > 0 ) otherAC += ACs[i];
|
||||
}
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
|
||||
if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
|
||||
|
||||
final List<int[]> ACs = new LinkedList<int[]>();
|
||||
|
||||
if ( nAltAlleles == 1 )
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
ACs.add(new int[]{i});
|
||||
} else if ( nAltAlleles == 2 ) {
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) {
|
||||
if ( j < nChrom - i )
|
||||
ACs.add(new int[]{i, j});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException("cannot get here");
|
||||
}
|
||||
|
||||
return ACs;
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeBySingletonPosition extends Analysis {
|
||||
public AnalyzeBySingletonPosition(final List<String> columns) {
|
||||
super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
|
||||
|
||||
for ( int position = 0; position < vc.getNSamples(); position++ ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
|
||||
Collections.rotate(genotypes, position);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByNonInformative extends Analysis {
|
||||
public AnalyzeByNonInformative(final List<String> columns) {
|
||||
super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
|
||||
final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
|
||||
|
||||
for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
genotypes.addAll(vc.getGenotypes().subList(0, nNonInformative + 1));
|
||||
genotypes.addAll(Collections.nCopies(vc.getNSamples() - nNonInformative, nonInformative));
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
logger.addAppender(new ConsoleAppender(new SimpleLayout()));
|
||||
|
||||
final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
|
||||
"exact.model", "prior.type", "runtime", "n.evaluations");
|
||||
|
||||
final PrintStream out = new PrintStream(new FileOutputStream(args[0]));
|
||||
|
||||
final boolean USE_GENERAL = false;
|
||||
final List<ExactAFCalculationTestBuilder.ModelType> modelTypes = USE_GENERAL
|
||||
? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values())
|
||||
: Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact);
|
||||
|
||||
final boolean ONLY_HUMAN_PRIORS = false;
|
||||
final List<ExactAFCalculationTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
|
||||
? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values())
|
||||
: Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human);
|
||||
|
||||
final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100;
|
||||
|
||||
final List<Analysis> analyzes = new ArrayList<Analysis>();
|
||||
analyzes.add(new AnalyzeByACAndPL(coreColumns));
|
||||
analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
|
||||
analyzes.add(new AnalyzeByNonInformative(coreColumns));
|
||||
|
||||
for ( int iteration = 0; iteration < 1; iteration++ ) {
|
||||
for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 )
|
||||
continue; // skip things that will take forever!
|
||||
|
||||
for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) {
|
||||
for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) {
|
||||
final ExactAFCalculationTestBuilder testBuilder
|
||||
= new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType);
|
||||
|
||||
for ( final Analysis analysis : analyzes ) {
|
||||
logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName())));
|
||||
final List<?> values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType);
|
||||
analysis.run(testBuilder, (List<Object>)values);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final GATKReport report = new GATKReport();
|
||||
for ( final Analysis analysis : analyzes )
|
||||
report.addTable(analysis.getTable());
|
||||
report.print(out);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,158 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ExactAFCalculationTestBuilder {
|
||||
final static Allele A = Allele.create("A", true);
|
||||
final static Allele C = Allele.create("C");
|
||||
final static Allele G = Allele.create("G");
|
||||
final static Allele T = Allele.create("T");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
|
||||
final int nSamples;
|
||||
final int numAltAlleles;
|
||||
final ModelType modelType;
|
||||
final PriorType priorType;
|
||||
|
||||
public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles,
|
||||
final ModelType modelType, final PriorType priorType) {
|
||||
this.nSamples = nSamples;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.modelType = modelType;
|
||||
this.priorType = priorType;
|
||||
}
|
||||
|
||||
public enum ModelType {
|
||||
DiploidExact,
|
||||
OptimizedDiploidExact,
|
||||
GeneralExact
|
||||
}
|
||||
|
||||
public enum PriorType {
|
||||
flat,
|
||||
human
|
||||
}
|
||||
|
||||
public int getnSamples() {
|
||||
return nSamples;
|
||||
}
|
||||
|
||||
public ExactAFCalculation makeModel() {
|
||||
switch (modelType) {
|
||||
case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4);
|
||||
case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4);
|
||||
case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
|
||||
default: throw new RuntimeException("Unexpected type " + modelType);
|
||||
}
|
||||
}
|
||||
|
||||
public double[] makePriors() {
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
|
||||
switch ( priorType ) {
|
||||
case flat:
|
||||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
}
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] ACs, final int nonTypePL) {
|
||||
final int nChrom = nSamples * 2;
|
||||
|
||||
final int[] nhet = new int[numAltAlleles];
|
||||
final int[] nhomvar = new int[numAltAlleles];
|
||||
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
final double p = ACs[i] / (1.0 * nChrom);
|
||||
nhomvar[i] = (int)Math.floor(nSamples * p * p);
|
||||
nhet[i] = ACs[i] - 2 * nhomvar[i];
|
||||
|
||||
if ( nhet[i] < 0 )
|
||||
throw new IllegalStateException("Bug!");
|
||||
}
|
||||
|
||||
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
|
||||
if ( calcAC != MathUtils.sum(ACs) )
|
||||
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
|
||||
|
||||
return makeACTest(nhet, nhomvar, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) {
|
||||
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
|
||||
|
||||
for ( int altI = 0; altI < nhet.length; altI++ ) {
|
||||
for ( int i = 0; i < nhet[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
|
||||
for ( int i = 0; i < nhomvar[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
|
||||
}
|
||||
|
||||
final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar));
|
||||
for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0));
|
||||
|
||||
samples = samples.subList(0, nSamples);
|
||||
|
||||
if ( samples.size() > nSamples )
|
||||
throw new IllegalStateException("too many samples");
|
||||
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
|
||||
vcb.genotypes(samples);
|
||||
return vcb.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
|
||||
switch (type) {
|
||||
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
|
||||
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
|
||||
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
|
||||
default: throw new IllegalArgumentException("Unexpected type " + type);
|
||||
}
|
||||
}
|
||||
|
||||
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private int numPLs() {
|
||||
return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
|
||||
}
|
||||
|
||||
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(getAlleles(type, altI));
|
||||
|
||||
final int[] pls = new int[numPLs()];
|
||||
Arrays.fill(pls, nonTypePL);
|
||||
|
||||
int index = 0;
|
||||
switch ( type ) {
|
||||
case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
|
||||
case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
|
||||
case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
|
||||
}
|
||||
pls[index] = 0;
|
||||
gb.PL(pls);
|
||||
|
||||
return gb.make();
|
||||
}
|
||||
}
|
||||
|
|
@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
public class GeneralPloidyExactAFCalculation extends ExactAFCalculation {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
ploidy = UAC.samplePloidy;
|
||||
this.UAC = UAC;
|
||||
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(VariantContext vc) {
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
final List<Allele> alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
|
||||
|
||||
|
||||
GLs = subsetAlleles(vc, alleles, false, ploidy);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
||||
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
combinedPoolLikelihoods.add(set);
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
result.reset();
|
||||
result.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, result);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
|
|
@ -226,6 +230,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations();
|
||||
// compute log10Likelihoods
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);
|
||||
|
|
@ -491,15 +491,15 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
|
||||
// and we repeat until queue is empty
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue = new LinkedList<ExactAFCalculation.ExactACset>();
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
|
||||
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset = new HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset>(likelihoodDim);
|
||||
// add AC=0 to the queue
|
||||
final int[] zeroCounts = new int[nAlleles];
|
||||
zeroCounts[0] = numChromosomes;
|
||||
|
||||
AlleleFrequencyCalculationModel.ExactACset zeroSet =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
|
||||
ExactAFCalculation.ExactACset zeroSet =
|
||||
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
|
@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
|
||||
final ExactAFCalculation.ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
|
|
@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
int plIdx = 0;
|
||||
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
AlleleFrequencyCalculationModel.ExactACset ACset =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
|
||||
ExactAFCalculation.ExactACset ACset =
|
||||
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector()));
|
||||
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
|
||||
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
|
||||
|
|
@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
}
|
||||
|
||||
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
|
||||
private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final double maxLog10L,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
|
||||
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
|
||||
final HashMap<ExactAFCalculation.ExactACcounts,
|
||||
ExactAFCalculation.ExactACset> indexesToACset,
|
||||
final ReadBackedPileup pileup) {
|
||||
// compute likelihood of set
|
||||
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
|
||||
|
|
@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
* @param numObservations Number of observations for each allele
|
||||
* @param pileup Read backed pileup in case it's necessary
|
||||
*/
|
||||
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
|
||||
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
|
|
|
|||
|
|
@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
|
@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
|
|||
|
|
@ -283,7 +283,7 @@ public class GenotypingEngine {
|
|||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
|
|
@ -308,9 +308,20 @@ public class GenotypingEngine {
|
|||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
|
||||
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call != null ) {
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleHashMap = alleleHashMapTrim;
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ import java.util.*;
|
|||
public class LikelihoodCalculationEngine {
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
|
@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine {
|
|||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine {
|
|||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
//final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
|
||||
|
|
@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine {
|
|||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
for( final Allele a : call.getFirst().getAlleles() )
|
||||
likelihoodMap.add(read,a,0.0);
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
|
|||
|
||||
String name = String.format("Test-%s", params.bases);
|
||||
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
|
||||
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
|
||||
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
|
||||
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,348 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
|
||||
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
|
||||
final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors
|
||||
final private static boolean INCLUDE_BIALLELIC = true;
|
||||
final private static boolean INCLUDE_TRIALLELIC = true;
|
||||
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
|
||||
AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
|
||||
BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
|
||||
NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
|
||||
|
||||
AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
|
||||
AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
|
||||
BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
|
||||
AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
|
||||
BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
|
||||
CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
|
||||
NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
private Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
final ExactAFCalculation calc;
|
||||
final int[] expectedACs;
|
||||
final double[] priors;
|
||||
final String priorName;
|
||||
|
||||
private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
|
||||
super(GetGLsTest.class);
|
||||
GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.calc = calculation;
|
||||
this.priors = priors;
|
||||
this.priorName = priorName;
|
||||
|
||||
expectedACs = new int[numAltAlleles+1];
|
||||
for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
|
||||
expectedACs[alleleI] = 0;
|
||||
final Allele allele = getAlleles().get(alleleI);
|
||||
for ( Genotype g : arg ) {
|
||||
expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public AlleleFrequencyCalculationResult execute() {
|
||||
return getCalc().getLog10PNonRef(getVC(), getPriors());
|
||||
}
|
||||
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
public ExactAFCalculation getCalc() {
|
||||
return calc;
|
||||
}
|
||||
|
||||
public VariantContext getVC() {
|
||||
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
|
||||
builder.genotypes(GLs);
|
||||
return builder.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(Allele.create("A", true),
|
||||
Allele.create("C"),
|
||||
Allele.create("G"),
|
||||
Allele.create("T")).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public int getExpectedAltAC(final int alleleI) {
|
||||
return expectedACs[alleleI+1];
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
|
||||
priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "wellFormedGLs")
|
||||
public Object[][] createSimpleGLsData() {
|
||||
final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
|
||||
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
|
||||
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
|
||||
final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
|
||||
final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
|
||||
final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);
|
||||
|
||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||
for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
|
||||
final String priorName = priors == humanPriors ? "human" : "flat";
|
||||
|
||||
// bi-allelic
|
||||
if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 1, genotypes, priors, priorName);
|
||||
|
||||
// tri-allelic
|
||||
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || model != generalCalc || Guillermo_FIXME ) )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
private static class NonInformativeData {
|
||||
final Genotype nonInformative;
|
||||
final List<Genotype> called;
|
||||
final int nAltAlleles;
|
||||
|
||||
private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
|
||||
this.called = called;
|
||||
this.nonInformative = nonInformative;
|
||||
this.nAltAlleles = nAltAlleles;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "GLsWithNonInformative")
|
||||
public Object[][] makeGLsWithNonInformative() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));
|
||||
|
||||
for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
|
||||
for ( final NonInformativeData testData : nonInformativeTests ) {
|
||||
final List<Genotype> samples = new ArrayList<Genotype>();
|
||||
samples.addAll(testData.called);
|
||||
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
|
||||
|
||||
final int nSamples = samples.size();
|
||||
final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
|
||||
final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
|
||||
final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
|
||||
final double[] priors = new double[2*nSamples+1]; // flat priors
|
||||
|
||||
for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
|
||||
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
|
||||
|
||||
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
|
||||
Collections.rotate(samples, 1);
|
||||
final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
|
||||
tests.add(new Object[]{onlyInformative, withNonInformative});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "wellFormedGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs")
|
||||
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
|
||||
final AlleleFrequencyCalculationResult expected = onlyInformative.execute();
|
||||
final AlleleFrequencyCalculationResult actual = withNonInformative.execute();
|
||||
|
||||
testResultSimple(withNonInformative);
|
||||
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10LikelihoodOfAFzero());
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero());
|
||||
Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero());
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP());
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE());
|
||||
Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP());
|
||||
Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE());
|
||||
Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping());
|
||||
}
|
||||
|
||||
private void testResultSimple(final GetGLsTest cfg) {
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
|
||||
Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4);
|
||||
|
||||
final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount();
|
||||
Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations,
|
||||
"Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations);
|
||||
Assert.assertNotNull(result.getAllelesUsedInGenotyping());
|
||||
Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
|
||||
|
||||
for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
|
||||
int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
|
||||
int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI];
|
||||
|
||||
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
|
||||
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
|
||||
}
|
||||
|
||||
// TODO
|
||||
// TODO -- enable when we understand the contract between AC_MAP and pNonRef
|
||||
// TODO
|
||||
// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP());
|
||||
// if ( AC_MAP > 0 ) {
|
||||
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero());
|
||||
// } else {
|
||||
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero());
|
||||
// }
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "Models")
|
||||
public void testLargeGLs(final ExactAFCalculation calc) {
|
||||
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "Models")
|
||||
public void testMismatchedGLs(final ExactAFCalculation calc) {
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000);
|
||||
final Genotype AC = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
|
||||
}
|
||||
|
||||
@DataProvider(name = "Models")
|
||||
public Object[][] makeModels() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{new DiploidExactAFCalculation(2, 4)});
|
||||
tests.add(new Object[]{new OptimizedDiploidExactAFCalculation(2, 4)});
|
||||
tests.add(new Object[]{new GeneralPloidyExactAFCalculation(2, 4, 2)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "Models")
|
||||
public void testBiallelicPriors(final ExactAFCalculation model) {
|
||||
final int REF_PL = 10;
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
|
||||
|
||||
for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL; log10NonRefPrior += 1 ) {
|
||||
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
|
||||
final double[] priors = MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2});
|
||||
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
final int actualAC = result.getAlleleCountsOfMAP()[0];
|
||||
|
||||
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
|
||||
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
|
||||
final boolean expectNonRef = pRefWithPrior <= pHetWithPrior;
|
||||
|
||||
if ( expectNonRef )
|
||||
Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5);
|
||||
else
|
||||
Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5);
|
||||
|
||||
final int expectedAC = expectNonRef ? 1 : 0;
|
||||
Assert.assertEquals(actualAC, expectedAC,
|
||||
"actual AC with priors " + log10NonRefPrior + " not expected "
|
||||
+ expectedAC + " priors " + Utils.join(",", priors));
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = false, dataProvider = "Models")
|
||||
public void testTriallelicPriors(final ExactAFCalculation model) {
|
||||
// TODO
|
||||
// TODO
|
||||
// TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE
|
||||
// TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR. TALK TO ERIC AND CONFIRM
|
||||
// TODO
|
||||
// TODO
|
||||
final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000);
|
||||
final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000);
|
||||
|
||||
for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) {
|
||||
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
|
||||
final double nonRefPrior = (1-refPrior) / 2;
|
||||
final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior});
|
||||
GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior);
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
final int actualAC_AB = result.getAlleleCountsOfMAP()[0];
|
||||
|
||||
final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
|
||||
final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
|
||||
final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0;
|
||||
Assert.assertEquals(actualAC_AB, expectedAC_AB,
|
||||
"actual AC with priors " + log10NonRefPrior + " not expected "
|
||||
+ expectedAC_AB + " priors " + Utils.join(",", priors));
|
||||
|
||||
final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2);
|
||||
final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele;
|
||||
final int actualAC_AC = result.getAlleleCountsOfMAP()[1];
|
||||
final double pRefACWithPrior = AB.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele);
|
||||
final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele);
|
||||
final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0;
|
||||
Assert.assertEquals(actualAC_AC, expectedAC_AC,
|
||||
"actual AC with priors " + log10NonRefPrior + " not expected "
|
||||
+ expectedAC_AC + " priors " + Utils.join(",", priors));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
|
|||
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1");
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "966da0de8466d21d79f1523488dff6bd");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -81,4 +81,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("864abe729828248333aee14818c1d2e1"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,13 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection {
|
|||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
|
||||
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
|
||||
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
|
||||
// multiple contigs
|
||||
filePointer.setIsMonolithic(true);
|
||||
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
|
||||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
|
|
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
|
|
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
|
|
|
|||
|
|
@ -50,10 +50,28 @@ public class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*/
|
||||
private boolean isMonolithic = false;
|
||||
|
||||
/**
|
||||
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
|
||||
*/
|
||||
private Integer contigIndex = null;
|
||||
|
||||
|
||||
public FilePointer( List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
this.isRegionUnmapped = checkUnmappedStatus();
|
||||
validateLocations();
|
||||
|
||||
validateAllLocations();
|
||||
if ( locations.size() > 0 ) {
|
||||
contigIndex = locations.get(0).getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public FilePointer( final GenomeLoc... locations ) {
|
||||
|
|
@ -80,8 +98,9 @@ public class FilePointer {
|
|||
return foundUnmapped;
|
||||
}
|
||||
|
||||
private void validateLocations() {
|
||||
if ( isRegionUnmapped ) {
|
||||
private void validateAllLocations() {
|
||||
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
|
||||
if ( isRegionUnmapped || isMonolithic ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -89,13 +108,22 @@ public class FilePointer {
|
|||
|
||||
for ( GenomeLoc location : locations ) {
|
||||
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
|
||||
throw new ReviewedStingException("File pointers must contain intervals from at most one contig");
|
||||
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
|
||||
}
|
||||
|
||||
previousContigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
private void validateLocation( GenomeLoc location ) {
|
||||
if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
|
||||
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||
}
|
||||
if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
|
||||
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an immutable view of this FilePointer's file spans
|
||||
*
|
||||
|
|
@ -123,6 +151,29 @@ public class FilePointer {
|
|||
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*
|
||||
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
|
||||
*/
|
||||
public boolean isMonolithic() {
|
||||
return isMonolithic;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
|
||||
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
|
||||
* FP may contain intervals from more than one contig.
|
||||
*
|
||||
* @param isMonolithic set this FP's monolithic status to this value
|
||||
*/
|
||||
public void setIsMonolithic( boolean isMonolithic ) {
|
||||
this.isMonolithic = isMonolithic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if(!(other instanceof FilePointer))
|
||||
|
|
@ -151,15 +202,12 @@ public class FilePointer {
|
|||
}
|
||||
|
||||
public void addLocation(final GenomeLoc location) {
|
||||
this.locations.add(location);
|
||||
checkUnmappedStatus();
|
||||
validateLocations();
|
||||
}
|
||||
validateLocation(location);
|
||||
|
||||
public void addLocations( final List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
checkUnmappedStatus();
|
||||
validateLocations();
|
||||
this.locations.add(location);
|
||||
if ( contigIndex == null ) {
|
||||
contigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
|
||||
|
|
|
|||
|
|
@ -215,19 +215,29 @@ public class ReadShard extends Shard {
|
|||
int start = Integer.MAX_VALUE;
|
||||
int stop = Integer.MIN_VALUE;
|
||||
String contig = null;
|
||||
boolean foundMapped = false;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
if ( contig != null && ! read.getReferenceName().equals(contig) )
|
||||
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
|
||||
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
|
||||
contig = read.getReferenceName();
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
|
||||
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
|
||||
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
|
||||
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
|
||||
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
|
||||
// with unmapped mates.
|
||||
if ( ! read.getReadUnmappedFlag() ) {
|
||||
foundMapped = true;
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
}
|
||||
}
|
||||
|
||||
assert contig != null;
|
||||
|
||||
if ( contig.equals("*") ) // all reads are unmapped
|
||||
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
|
||||
return GenomeLoc.UNMAPPED;
|
||||
else
|
||||
return parser.createGenomeLoc(contig, start, stop);
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
|
|||
if ( perReadAlleleLikelihoodMap.size() == 0 )
|
||||
return null;
|
||||
|
||||
for ( Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perReadAlleleLikelihoodMap.entrySet() )
|
||||
depth += sample.getValue().getNumberOfStoredElements();
|
||||
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( alleleCounts.containsKey(p.getBase()) )
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
|
||||
}
|
||||
|
||||
// we need to add counts in the correct order
|
||||
|
|
@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
alleleCounts.put(allele, 0);
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
continue; // read is non-informative
|
||||
if (!vc.getAlleles().contains(a))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a,alleleCounts.get(a)+1);
|
||||
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
}
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
|||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
}
|
||||
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
|
||||
// either SNP with no alignment context, or indels: per-read likelihood map needed
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
|
||||
return pValueForBestTable(table, null);
|
||||
}
|
||||
else
|
||||
|
|
@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
* allele2 # #
|
||||
* @return a 2x2 contingency table
|
||||
*/
|
||||
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
|
||||
final Allele ref, final Allele alt) {
|
||||
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
final Allele ref = vc.getReference();
|
||||
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
|
||||
int[][] table = new int[2][2];
|
||||
|
||||
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
if ( el.getKey().isReducedRead() ) // ignore reduced reads
|
||||
continue;
|
||||
final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true);
|
||||
final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true);
|
||||
|
||||
|
|
@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
int row = matchesRef ? 0 : 1;
|
||||
int column = isFW ? 0 : 1;
|
||||
|
||||
table[row][column]++;
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (PileupElement p : sample.getValue().getBasePileup()) {
|
||||
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
|
||||
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
|
||||
continue;
|
||||
|
||||
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
|
||||
|
|
@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
int row = matchesRef ? 0 : 1;
|
||||
int column = isFW ? 0 : 1;
|
||||
|
||||
table[row][column]++;
|
||||
table[row][column] += p.getRepresentativeCount();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
}
|
||||
}
|
||||
|
||||
private boolean readHasBeenSkipped(GATKSAMRecord read) {
|
||||
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
|
||||
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
|
||||
}
|
||||
|
||||
private boolean isLowQualityBase(GATKSAMRecord read, int offset) {
|
||||
return read.getBaseQualities()[offset] < minimumQToUse;
|
||||
private boolean isLowQualityBase( final PileupElement p ) {
|
||||
return p.getQual() < minimumQToUse;
|
||||
}
|
||||
|
||||
private boolean readNotSeen(GATKSAMRecord read) {
|
||||
private boolean readNotSeen( final GATKSAMRecord read ) {
|
||||
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
|
||||
}
|
||||
|
||||
|
|
@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
final int offset = p.getOffset();
|
||||
|
||||
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
|
||||
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
|
||||
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
|
||||
continue;
|
||||
|
||||
if (readNotSeen(read)) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculation implements Cloneable {
|
||||
private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class);
|
||||
|
||||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT("ExactAFCalculation");
|
||||
|
||||
final String implementationName;
|
||||
|
||||
private Model(String implementationName) {
|
||||
this.implementationName = implementationName;
|
||||
}
|
||||
}
|
||||
|
||||
protected int nSamples;
|
||||
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
protected int MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
private SimpleTimer callTimer = new SimpleTimer();
|
||||
private PrintStream callReport = null;
|
||||
|
||||
protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
|
||||
this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter);
|
||||
}
|
||||
|
||||
protected AlleleFrequencyCalculation(final int nSamples,
|
||||
final int maxAltAlleles,
|
||||
final int maxAltAllelesForIndels,
|
||||
final File exactCallsLog,
|
||||
final Logger logger,
|
||||
final PrintStream verboseWriter) {
|
||||
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
|
||||
|
||||
this.nSamples = nSamples;
|
||||
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = maxAltAllelesForIndels;
|
||||
this.logger = logger == null ? defaultLogger : logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
if ( exactCallsLog != null )
|
||||
initializeOutputFile(exactCallsLog);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult)
|
||||
*
|
||||
* Allocates a new results object. Useful for testing but slow in practice.
|
||||
*/
|
||||
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
|
||||
*
|
||||
* @param vc the VariantContext holding the alleles and sample information
|
||||
* @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
|
||||
* @param result a pre-allocated (for efficiency) object to hold the result of the calculation
|
||||
* @return result (for programming convenience)
|
||||
*/
|
||||
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
|
||||
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
|
||||
if ( result == null ) throw new IllegalArgumentException("Results object cannot be null");
|
||||
|
||||
// reset the result, so we can store our new result there
|
||||
result.reset();
|
||||
|
||||
final VariantContext vcWorking = reduceScope(vc);
|
||||
|
||||
callTimer.start();
|
||||
computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result);
|
||||
final long nanoTime = callTimer.getElapsedTimeNano();
|
||||
|
||||
if ( callReport != null )
|
||||
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());
|
||||
|
||||
result.setAllelesUsedInGenotyping(vcWorking.getAlleles());
|
||||
return result;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Abstract methods that should be implemented by concrete implementations
|
||||
// to actually calculate the AF
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
|
||||
*
|
||||
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
|
||||
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
|
||||
* may decide to only genotype the best 2 of them.
|
||||
*
|
||||
* @param vc the initial VC provided by the caller to this AFcalculation
|
||||
* @return a potentially simpler VC that's more tractable to genotype
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
@Ensures("result != null")
|
||||
protected abstract VariantContext reduceScope(final VariantContext vc);
|
||||
|
||||
/**
|
||||
* Actually carry out the log10PNonRef calculation on vc, storing results in results
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store results
|
||||
*/
|
||||
// TODO -- add consistent requires among args
|
||||
protected abstract void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Print information about the call to the calls log
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
private void initializeOutputFile(final File outputFile) {
|
||||
try {
|
||||
if (outputFile != null) {
|
||||
callReport = new PrintStream( new FileOutputStream(outputFile) );
|
||||
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void printCallInfo(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final long runtimeNano,
|
||||
final double log10PosteriorOfAFzero) {
|
||||
printCallElement(vc, "type", "ignore", vc.getType());
|
||||
|
||||
int allelei = 0;
|
||||
for ( final Allele a : vc.getAlleles() )
|
||||
printCallElement(vc, "allele", allelei++, a.getDisplayString());
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
|
||||
|
||||
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
|
||||
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
|
||||
|
||||
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
|
||||
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
|
||||
|
||||
callReport.flush();
|
||||
}
|
||||
|
||||
private void printCallElement(final VariantContext vc,
|
||||
final Object variable,
|
||||
final Object key,
|
||||
final Object value) {
|
||||
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
|
||||
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,9 +25,12 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -35,9 +38,10 @@ import java.util.Arrays;
|
|||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*
|
||||
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
|
||||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
|
||||
private double log10MLE;
|
||||
private double log10MAP;
|
||||
|
|
@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult {
|
|||
private double log10LikelihoodOfAFzero;
|
||||
private double log10PosteriorOfAFzero;
|
||||
|
||||
int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
|
||||
* Create a results object capability of storing results for calls with up to maxAltAlleles
|
||||
*
|
||||
* @param maxAltAlleles an integer >= 1
|
||||
*/
|
||||
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
|
||||
|
||||
alleleCountsOfMLE = new int[maxAltAlleles];
|
||||
alleleCountsOfMAP = new int[maxAltAlleles];
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 value of the probability mass at the MLE
|
||||
*
|
||||
* @return a log10 prob
|
||||
*/
|
||||
@Ensures("goodLog10Value(result)")
|
||||
public double getLog10MLE() {
|
||||
return log10MLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 value of the probability mass at the max. a posterior (MAP)
|
||||
*
|
||||
* @return a log10 prob
|
||||
*/
|
||||
@Ensures("goodLog10Value(result)")
|
||||
public double getLog10MAP() {
|
||||
return log10MAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*
|
||||
* @return a vector with allele counts, not all of which may be meaningful
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MAP
|
||||
*
|
||||
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
|
||||
*
|
||||
* @return a non-null vector of ints
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
|
||||
*
|
||||
* @return the number of evaluations required to produce the answer for this AF calculation
|
||||
*/
|
||||
public int getnEvaluations() {
|
||||
return nEvaluations;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
|
||||
if ( log10PosteriorMatrixSum == null ) {
|
||||
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
|
||||
|
|
@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult {
|
|||
return log10PosteriorMatrixSum;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
/**
|
||||
* Get the list of alleles actually used in genotyping.
|
||||
*
|
||||
* Due to computational / implementation constraints this may be smaller than
|
||||
* the actual list of alleles requested
|
||||
*
|
||||
* @return a non-empty list of alleles used during genotyping
|
||||
*/
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
public List<Allele> getAllelesUsedInGenotyping() {
|
||||
if ( allelesUsedInGenotyping == null )
|
||||
throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set");
|
||||
|
||||
return allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the normalized -- across all AFs -- of AC == 0, NOT LOG10
|
||||
* @return
|
||||
*/
|
||||
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
|
||||
// TODO -- we should own these values in a more meaningful way and return good values in the case
|
||||
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
|
||||
// @Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
public double getNormalizedPosteriorOfAFzero() {
|
||||
return getNormalizedPosteriors()[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the normalized -- across all AFs -- of AC > 0, NOT LOG10
|
||||
* @return
|
||||
*/
|
||||
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
|
||||
// TODO -- we should own these values in a more meaningful way and return good values in the case
|
||||
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
|
||||
//@Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
public double getNormalizedPosteriorOfAFGTZero() {
|
||||
return getNormalizedPosteriors()[1];
|
||||
}
|
||||
|
||||
private double[] getNormalizedPosteriors() {
|
||||
final double[] posteriors = new double[]{ getLog10PosteriorOfAFzero(), getLog10PosteriorsMatrixSumWithoutAFzero() };
|
||||
return MathUtils.normalizeFromLog10(posteriors);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Protected mutational methods only for use within the calculation models themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
|
||||
*
|
||||
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
|
||||
*/
|
||||
protected void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED;
|
||||
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
|
||||
alleleCountsOfMLE[i] = 0;
|
||||
alleleCountsOfMAP[i] = 0;
|
||||
}
|
||||
currentPosteriorsCacheIndex = 0;
|
||||
log10PosteriorMatrixSum = null;
|
||||
allelesUsedInGenotyping = null;
|
||||
}
|
||||
|
||||
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
/**
|
||||
* Tell this result we used one more evaluation cycle
|
||||
*/
|
||||
protected void incNEvaluations() {
|
||||
nEvaluations++;
|
||||
}
|
||||
|
||||
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
|
|
@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToPosteriorsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MAP ) {
|
||||
|
|
@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
|
|
@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
|
||||
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
|
||||
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
private static boolean goodLog10Value(final double result) {
|
||||
return result <= 0.0 || Double.isInfinite(result) || Double.isNaN(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
|
||||
public class DiploidExactAFCalculation extends ExactAFCalculation {
|
||||
// private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
|
||||
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dynamically found in UnifiedGenotyperEngine
|
||||
*
|
||||
* @param UAC
|
||||
* @param N
|
||||
* @param logger
|
||||
* @param verboseWriter
|
||||
*/
|
||||
public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
|
||||
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
@Override
|
||||
protected VariantContext reduceScope(final VariantContext vc) {
|
||||
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
|
||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
|
|
@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations(); // keep track of the number of evaluations
|
||||
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
|
@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate a genotype given the priors and a pile
|
||||
* of bases and quality scores
|
||||
* Uses the Exact calculation of Heng Li
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||
|
||||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT
|
||||
abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
|
||||
protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
|
||||
super(UAC, nSamples, logger, verboseWriter);
|
||||
}
|
||||
|
||||
protected int N;
|
||||
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
|
||||
this.N = N;
|
||||
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
|
||||
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
protected ExactAFCalculation(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, exactCallsLog, logger, verboseWriter);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store likelihoods results
|
||||
* @return the alleles used for genotyping
|
||||
*/
|
||||
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// protected classes used to store exact model matrix columns
|
||||
|
|
@ -0,0 +1,496 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation {
|
||||
// private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
|
||||
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dynamically found in UnifiedGenotyperEngine
|
||||
*
|
||||
* @param UAC
|
||||
* @param N
|
||||
* @param logger
|
||||
* @param verboseWriter
|
||||
*/
|
||||
public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(final VariantContext vc) {
|
||||
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != 0 )
|
||||
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Multi-allelic implementation.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
|
||||
|
||||
// add AC=0 to the queue
|
||||
int[] zeroCounts = new int[numAlternateAlleles];
|
||||
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations(); // keep track of the number of evaluations
|
||||
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
|
||||
maxLikelihoodSeen.update(log10LofKs, set.ACcounts);
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class DependentSet {
|
||||
public final int[] ACcounts;
|
||||
public final int PLindex;
|
||||
|
||||
public DependentSet(final int[] ACcounts, final int PLindex) {
|
||||
this.ACcounts = ACcounts;
|
||||
this.PLindex = PLindex;
|
||||
}
|
||||
}
|
||||
|
||||
private static double calculateAlleleCountConformation(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final MaxLikelihoodSeen maxLikelihoodSeen,
|
||||
final int numChr,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
|
||||
|
||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
final int ACwiggle = numChr - set.getACsum();
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
final int numAltAlleles = set.ACcounts.getCounts().length;
|
||||
|
||||
// add conformations for the k+1 case
|
||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// to get to this conformation, a sample would need to be AB (remember that ref=0)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
|
||||
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
if ( ACwiggle > 1 ) {
|
||||
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
|
||||
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
|
||||
|
||||
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
|
||||
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele_i]++;
|
||||
ACcountsClone[allele_j]++;
|
||||
|
||||
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
|
||||
if ( allele_i == allele_j )
|
||||
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
else
|
||||
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
|
||||
}
|
||||
}
|
||||
|
||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||
for ( DependentSet dependent : differentAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
for ( DependentSet dependent : sameAlleles )
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also pushes its value to the given callingSetIndex.
|
||||
private static void updateACset(final int[] newSetCounts,
|
||||
final int numChr,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||
indexesToACset.put(index, set);
|
||||
ACqueue.add(set);
|
||||
}
|
||||
|
||||
// push data from the dependency to the new set
|
||||
//if ( DEBUG )
|
||||
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
|
||||
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
private static void computeLofK(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
||||
// special case for k = 0 over all k
|
||||
if ( totalK == 0 ) {
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
|
||||
|
||||
final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
// if we got here, then k > 0 for at least one k.
|
||||
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
|
||||
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK < 2*j-1 ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
|
||||
}
|
||||
|
||||
double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||
|
||||
// update the MLE if necessary
|
||||
result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for ( final int ACcount : set.ACcounts.getCounts() ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
|
||||
}
|
||||
|
||||
private static void pushData(final ExactACset targetSet,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
final ArrayList<double[]> genotypeLikelihoods) {
|
||||
final int totalK = targetSet.getACsum();
|
||||
|
||||
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
|
||||
|
||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double conformationValue =
|
||||
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
|
||||
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
// AA: (2j - totalK) * (2j - totalK - 1)
|
||||
// AB: 2k_b * (2j - totalK)
|
||||
// AC: 2k_c * (2j - totalK)
|
||||
// BB: k_b * (k_b - 1)
|
||||
// BC: 2 * k_b * k_c
|
||||
// CC: k_c * (k_c - 1)
|
||||
|
||||
// find the 2 alleles that are represented by this PL index
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
|
||||
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
|
||||
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
|
||||
|
||||
// the AX het case
|
||||
if ( alleles.alleleIndex1 == 0 )
|
||||
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
final int k_i = ACcounts[alleles.alleleIndex1-1];
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[alleles.alleleIndex2-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
return coeff;
|
||||
}
|
||||
|
||||
public GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy) {
|
||||
return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
|
||||
* for the exact model calculation
|
||||
*/
|
||||
/*
|
||||
private final static class ExactACCache {
|
||||
double[] kMinus2, kMinus1, kMinus0;
|
||||
|
||||
private final static double[] create(int n) {
|
||||
return new double[n];
|
||||
}
|
||||
|
||||
public ExactACCache(int n) {
|
||||
kMinus2 = create(n);
|
||||
kMinus1 = create(n);
|
||||
kMinus0 = create(n);
|
||||
}
|
||||
|
||||
final public void rotate() {
|
||||
double[] tmp = kMinus2;
|
||||
kMinus2 = kMinus1;
|
||||
kMinus1 = kMinus0;
|
||||
kMinus0 = tmp;
|
||||
}
|
||||
|
||||
final public double[] getkMinus2() {
|
||||
return kMinus2;
|
||||
}
|
||||
|
||||
final public double[] getkMinus1() {
|
||||
return kMinus1;
|
||||
}
|
||||
|
||||
final public double[] getkMinus0() {
|
||||
return kMinus0;
|
||||
}
|
||||
}
|
||||
|
||||
public int linearExact(GenotypesContext GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[][] log10AlleleFrequencyLikelihoods,
|
||||
double[][] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
boolean done = false;
|
||||
int lastK = -1;
|
||||
|
||||
for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
final double[] kMinus0 = logY.getkMinus0();
|
||||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
|
||||
log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
log10Max = approximateLog10SumLog10(aa, ab);
|
||||
}
|
||||
|
||||
// finally, update the L(j,k) value
|
||||
kMinus0[j] = log10Max - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
// update the posteriors vector
|
||||
final double log10LofK = kMinus0[numSamples];
|
||||
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
|
||||
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
done = true;
|
||||
}
|
||||
|
||||
logY.rotate();
|
||||
}
|
||||
|
||||
return lastK;
|
||||
}
|
||||
|
||||
final static double approximateLog10SumLog10(double a, double b, double c) {
|
||||
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT;
|
||||
|
||||
/**
|
||||
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
|
||||
|
|
@ -75,10 +75,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
|
||||
public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
|
||||
|
||||
// indel-related arguments
|
||||
/**
|
||||
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
|
||||
|
|
@ -186,7 +182,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
@ -212,7 +207,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
|
|
@ -224,6 +219,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
uac.exactCallsLog = exactCallsLog;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
|
@ -239,8 +235,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
|
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
|
||||
}
|
||||
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL)
|
||||
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine {
|
|||
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
private ThreadLocal<AlleleFrequencyCalculation> afcm = new ThreadLocal<AlleleFrequencyCalculation>();
|
||||
|
||||
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
|
||||
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
|
||||
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private final double[] log10AlleleFrequencyPriorsSNPs;
|
||||
|
|
@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine {
|
|||
if ( afcm.get() == null ) {
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
|
||||
posteriorsArray.set(new double[2]);
|
||||
}
|
||||
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
|
||||
|
||||
|
|
@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine {
|
|||
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
|
||||
}
|
||||
|
||||
AFresult.reset();
|
||||
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||
boolean bestGuessIsRef = true;
|
||||
|
|
@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine {
|
|||
myAlleles.add(vc.getReference());
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
final Allele alternateAllele = vc.getAlternateAllele(i);
|
||||
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
|
||||
final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele);
|
||||
// the genotyping model may have stripped it out
|
||||
if ( indexOfAllele == -1 )
|
||||
continue;
|
||||
|
|
@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// calculate p(f>0):
|
||||
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
|
||||
final double PofF = 1.0 - normalizedPosteriors[0];
|
||||
final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero();
|
||||
final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero();
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
|
|
@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine {
|
|||
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
|
||||
}
|
||||
|
||||
// start constructing the resulting VC
|
||||
|
|
@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
final HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
|
@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
|
|
@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
|
|
@ -513,10 +508,10 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
|
||||
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext )
|
||||
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
|
||||
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
|
||||
|
||||
if ( annotationEngine != null && !limitedContext ) {
|
||||
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
|
||||
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
|
||||
final ReadBackedPileup pileup = rawContext.getBasePileup();
|
||||
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
|
||||
|
|
@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine {
|
|||
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||
}
|
||||
|
||||
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
|
||||
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
|
||||
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
return MathUtils.normalizeFromLog10(normalizedPosteriors);
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
|
||||
}
|
||||
|
||||
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
|
@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine {
|
|||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
|
||||
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
|
||||
List<Class<? extends AlleleFrequencyCalculation>> afClasses = new PluginManager<AlleleFrequencyCalculation>(AlleleFrequencyCalculation.class).getPlugins();
|
||||
|
||||
// user-specified name
|
||||
String afModelName = UAC.AFmodel.name();
|
||||
String afModelName = UAC.AFmodel.implementationName;
|
||||
|
||||
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
|
||||
afModelName = GPSTRING + afModelName;
|
||||
else
|
||||
afModelName = "Diploid" + afModelName;
|
||||
|
||||
for (int i = 0; i < afClasses.size(); i++) {
|
||||
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
|
||||
Class<? extends AlleleFrequencyCalculation> afClass = afClasses.get(i);
|
||||
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
|
||||
if (afModelName.equalsIgnoreCase(key)) {
|
||||
try {
|
||||
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
|
||||
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
|
||||
|
||||
return (AlleleFrequencyCalculationModel)c.newInstance(args);
|
||||
return (AlleleFrequencyCalculation)c.newInstance(args);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
|
@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector {
|
|||
flatPriors = new double[1+2*samples.size()];
|
||||
}
|
||||
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
|
||||
DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result);
|
||||
// do we want to let this qual go up or down?
|
||||
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
|
|||
boolean failed = false;
|
||||
byte[] recordRef = vc.getReference().getBases();
|
||||
for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) {
|
||||
if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) {
|
||||
if ( recordRef[i] != ref[i] ) {
|
||||
failed = true;
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||
|
||||
/**
|
||||
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
|
||||
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
|
||||
*/
|
||||
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
|
||||
protected int numRandom = 0;
|
||||
|
||||
/**
|
||||
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
|
||||
*/
|
||||
|
|
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
|
||||
|
||||
|
||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
||||
private static class RandomVariantStructure {
|
||||
private VariantContext vc;
|
||||
|
||||
RandomVariantStructure(VariantContext vcP) {
|
||||
vc = vcP;
|
||||
}
|
||||
|
||||
public void set (VariantContext vcP) {
|
||||
vc = vcP;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public enum NumberAlleleRestriction {
|
||||
ALL,
|
||||
BIALLELIC,
|
||||
|
|
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
|
||||
/* variables used by the SELECT RANDOM modules */
|
||||
private boolean SELECT_RANDOM_NUMBER = false;
|
||||
private boolean SELECT_RANDOM_FRACTION = false;
|
||||
private int variantNumber = 0;
|
||||
private int nVariantsAdded = 0;
|
||||
private int positionToAdd = 0;
|
||||
private RandomVariantStructure [] variantArray;
|
||||
|
||||
//Random number generator for the genotypes to remove
|
||||
private Random randomGenotypes = new Random();
|
||||
|
|
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
|
||||
}
|
||||
|
||||
SELECT_RANDOM_NUMBER = numRandom > 0;
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
logger.info("Selecting " + numRandom + " variants at random from the variant track");
|
||||
variantArray = new RandomVariantStructure[numRandom];
|
||||
}
|
||||
|
||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||
|
||||
|
|
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
break;
|
||||
}
|
||||
}
|
||||
if ( !failedJexlMatch ) {
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
randomlyAddVariant(++variantNumber, sub);
|
||||
}
|
||||
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
|
||||
if ( ! justRead )
|
||||
vcfWriter.add(sub);
|
||||
}
|
||||
if ( !failedJexlMatch &&
|
||||
!justRead &&
|
||||
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
|
||||
vcfWriter.add(sub);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info(result + " records processed.");
|
||||
|
||||
if (SELECT_RANDOM_NUMBER) {
|
||||
int positionToPrint = positionToAdd;
|
||||
for (int i=0; i<numRandom; i++) {
|
||||
vcfWriter.add(variantArray[positionToPrint].vc);
|
||||
positionToPrint = nextCircularPosition(positionToPrint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -746,9 +701,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
|
||||
GenotypesContext newGC = sub.getGenotypes();
|
||||
|
||||
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
|
||||
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
|
||||
if ( vc.getAlleles().size() != sub.getAlleles().size() )
|
||||
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
|
||||
newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes());
|
||||
|
||||
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
|
||||
if ( vc.getNSamples() != sub.getNSamples() ) {
|
||||
|
|
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
if ( sawDP )
|
||||
builder.attribute("DP", depth);
|
||||
}
|
||||
|
||||
private void randomlyAddVariant(int rank, VariantContext vc) {
|
||||
if (nVariantsAdded < numRandom)
|
||||
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
|
||||
|
||||
else {
|
||||
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
double t = (1.0/(rank-numRandom+1));
|
||||
if ( v < t) {
|
||||
variantArray[positionToAdd].set(vc);
|
||||
nVariantsAdded++;
|
||||
positionToAdd = nextCircularPosition(positionToAdd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int nextCircularPosition(int cur) {
|
||||
if ((cur + 1) == variantArray.length)
|
||||
return 0;
|
||||
return cur + 1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return records;
|
||||
}
|
||||
|
||||
private static void addFieldValue(Object val, List<List<String>> result) {
|
||||
private static void addFieldValue(final Object val, final List<List<String>> result) {
|
||||
final int numResultRecords = result.size();
|
||||
|
||||
// if we're trying to create a single output record, add it
|
||||
if ( numResultRecords == 1 ) {
|
||||
result.get(0).add(val.toString());
|
||||
result.get(0).add(prettyPrintObject(val));
|
||||
}
|
||||
// if this field is a list of the proper size, add the appropriate entry to each record
|
||||
else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
|
||||
|
|
@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private static String prettyPrintObject(final Object val) {
|
||||
if ( val instanceof List )
|
||||
return prettyPrintObject(((List)val).toArray());
|
||||
|
||||
if ( !val.getClass().isArray() )
|
||||
return val.toString();
|
||||
|
||||
final int length = Array.getLength(val);
|
||||
if ( length == 0 )
|
||||
return "";
|
||||
|
||||
final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0)));
|
||||
for ( int i = 1; i < length; i++ ) {
|
||||
sb.append(",");
|
||||
sb.append(prettyPrintObject(Array.get(val, i)));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
return extractFields(vc, fields, null, null, allowMissingData, false);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -236,6 +236,33 @@ public class Utils {
|
|||
}
|
||||
}
|
||||
|
||||
public static <T> List<T> append(final List<T> left, T ... elts) {
|
||||
final List<T> l = new LinkedList<T>(left);
|
||||
l.addAll(Arrays.asList(elts));
|
||||
return l;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the values in joined by separator, such as A,B,C
|
||||
*
|
||||
* @param separator
|
||||
* @param doubles
|
||||
* @return
|
||||
*/
|
||||
public static String join(String separator, double[] doubles) {
|
||||
if ( doubles == null || doubles.length == 0)
|
||||
return "";
|
||||
else {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(doubles[0]);
|
||||
for (int i = 1; i < doubles.length; ++i) {
|
||||
ret.append(separator);
|
||||
ret.append(doubles[i]);
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
|
||||
* elti objects (note there's no actual space between sep and the elti elements). Returns
|
||||
|
|
|
|||
|
|
@ -157,11 +157,8 @@ public class VariantContextUtils {
|
|||
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
|
||||
}
|
||||
|
||||
public static Genotype removePLs(Genotype g) {
|
||||
if ( g.hasLikelihoods() )
|
||||
return new GenotypeBuilder(g).noPL().make();
|
||||
else
|
||||
return g;
|
||||
public static Genotype removePLsAndAD(final Genotype g) {
|
||||
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
|
||||
}
|
||||
|
||||
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
|
||||
|
|
@ -573,7 +570,7 @@ public class VariantContextUtils {
|
|||
}
|
||||
|
||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
|
||||
for ( final VariantContext vc : VCs ) {
|
||||
if (vc.alleles.size() == 1)
|
||||
continue;
|
||||
|
|
@ -581,7 +578,7 @@ public class VariantContextUtils {
|
|||
if ( ! genotypes.isEmpty() )
|
||||
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
|
||||
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
|
||||
genotypes = stripPLs(genotypes);
|
||||
genotypes = stripPLsAndAD(genotypes);
|
||||
// this will remove stale AC,AF attributed from vc
|
||||
calculateChromosomeCounts(vc, attributes, true);
|
||||
break;
|
||||
|
|
@ -672,11 +669,11 @@ public class VariantContextUtils {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
|
||||
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
|
||||
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
|
||||
|
||||
for ( final Genotype g : genotypes ) {
|
||||
newGs.add(g.hasLikelihoods() ? removePLs(g) : g);
|
||||
newGs.add(removePLsAndAD(g));
|
||||
}
|
||||
|
||||
return newGs;
|
||||
|
|
@ -1343,10 +1340,7 @@ public class VariantContextUtils {
|
|||
|
||||
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
|
||||
|
||||
// TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
|
||||
|
||||
// see whether we need to trim common reference base from all alleles
|
||||
|
||||
final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false);
|
||||
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
|
||||
return inputVC;
|
||||
|
|
|
|||
|
|
@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter {
|
|||
else if ( val instanceof List ) {
|
||||
result = formatVCFField(((List)val).toArray());
|
||||
} else if ( val.getClass().isArray() ) {
|
||||
int length = Array.getLength(val);
|
||||
final int length = Array.getLength(val);
|
||||
if ( length == 0 )
|
||||
return formatVCFField(null);
|
||||
StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0)));
|
||||
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
|
||||
for ( int i = 1; i < length; i++) {
|
||||
sb.append(",");
|
||||
sb.append(formatVCFField(Array.get(val, i)));
|
||||
|
|
|
|||
|
|
@ -1,127 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static final int numSamples = 3;
|
||||
static double[] priors = new double[2*numSamples+1]; // flat priors
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
AA1 = new double[]{0.0, -20.0, -20.0};
|
||||
AB1 = new double[]{-20.0, 0.0, -20.0};
|
||||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
|
||||
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls) {
|
||||
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic case
|
||||
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
|
||||
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
|
||||
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
|
||||
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
|
||||
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
|
||||
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
|
||||
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
|
||||
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
|
||||
|
||||
// tri-allelic case
|
||||
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
|
||||
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
|
||||
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
|
||||
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeGLs() {
|
||||
|
||||
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMismatchedGLs() {
|
||||
|
||||
final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
|
||||
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
|
||||
}
|
||||
}
|
||||
|
|
@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -438,4 +438,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29"));
|
||||
executeTest("test calling on reads with Ns in CIGAR", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("84486c88a0fd1ae996a6402490db8492"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d"));
|
||||
executeTest("test hg18 to hg19, unsorted", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLiftoverFilteringOfIndels() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("0909a953291a5e701194668c9b8833ab"));
|
||||
executeTest("test liftover filtering of indels", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
|
||||
1,
|
||||
Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823")
|
||||
Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea")
|
||||
);
|
||||
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testMultiAllelicOneRecord() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
|
||||
Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
|
||||
Arrays.asList("0ff49c08690f61a38614606a090f23ea"));
|
||||
executeTest("testMultiAllelicOneRecord", spec);
|
||||
}
|
||||
|
||||
|
|
@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
|
|||
executeTest("testGenotypeFieldsWithInline", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testListFields() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" --variant " + privateTestDir + "vcfexample.withMLE.vcf" +
|
||||
" -T VariantsToTable" +
|
||||
" -GF PL" +
|
||||
" -o %s",
|
||||
1,
|
||||
Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e"));
|
||||
executeTest("testGenotypeFields", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMoltenOutput() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript {
|
|||
var bamFile: File = _
|
||||
|
||||
def script() {
|
||||
val ug = new UnifiedGenotyper with RetryMemoryLimit
|
||||
// First run with 1m
|
||||
ug.memoryLimit = .001
|
||||
// On retry run with 1g
|
||||
ug.retryMemoryFunction = (d => d * 1000)
|
||||
ug.reference_sequence = referenceFile
|
||||
ug.input_file = Seq(bamFile)
|
||||
add(ug)
|
||||
for (scatterCount <- 1 to 2) {
|
||||
val ug = new UnifiedGenotyper with RetryMemoryLimit
|
||||
// First run with 1m
|
||||
ug.memoryLimit = .001
|
||||
// On retry run with 1g
|
||||
ug.retryMemoryFunction = (d => d * 1000)
|
||||
ug.reference_sequence = referenceFile
|
||||
ug.input_file = Seq(bamFile)
|
||||
ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount))
|
||||
ug.scatterCount = scatterCount
|
||||
add(ug)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
|
|||
private def createQueueHeader() : Seq[String] = {
|
||||
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
|
||||
"Copyright (c) 2012 The Broad Institute",
|
||||
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
"For support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
}
|
||||
|
||||
private def getQueueVersion : String = {
|
||||
|
|
|
|||
|
|
@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk
|
|||
|
||||
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
|
||||
import org.broadinstitute.sting.queue.function.QFunction
|
||||
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor
|
||||
|
||||
/**
|
||||
* Merges BAM files using net.sf.picard.sam.MergeSamFiles.
|
||||
*/
|
||||
class BamGatherFunction extends GatherFunction with PicardBamFunction {
|
||||
class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit {
|
||||
this.javaMainClass = "net.sf.picard.sam.MergeSamFiles"
|
||||
this.assumeSorted = Some(true)
|
||||
protected def inputBams = gatherParts
|
||||
protected def outputBam = originalOutput
|
||||
|
||||
override def freezeFieldValues {
|
||||
override def freezeFieldValues() {
|
||||
val originalGATK = originalFunction.asInstanceOf[CommandLineGATK]
|
||||
|
||||
// Whatever the original function can handle, merging *should* do less.
|
||||
|
|
|
|||
|
|
@ -25,13 +25,13 @@
|
|||
package org.broadinstitute.sting.queue.extensions.gatk
|
||||
|
||||
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
|
||||
import org.broadinstitute.sting.queue.function.QFunction
|
||||
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
|
||||
|
||||
/**
|
||||
* Merges a vcf text file.
|
||||
*/
|
||||
class VcfGatherFunction extends CombineVariants with GatherFunction {
|
||||
class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit {
|
||||
this.assumeIdenticalSamples = true
|
||||
this.suppressCommandLineHeader = true
|
||||
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun
|
|||
override def freezeFieldValues() {
|
||||
super.freezeFieldValues()
|
||||
if (outputIndex == null && output != null)
|
||||
outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
|
||||
outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
package org.broadinstitute.sting.queue.function
|
||||
|
||||
import org.broadinstitute.sting.queue.util._
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
|
||||
/**
|
||||
* A command line that will be run in a pipeline.
|
||||
|
|
@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging {
|
|||
def commandLine: String
|
||||
|
||||
/** Upper memory limit */
|
||||
@Argument(doc="Memory limit", required=false)
|
||||
var memoryLimit: Option[Double] = None
|
||||
|
||||
/** Resident memory limit */
|
||||
@Argument(doc="Resident memory limit", required=false)
|
||||
var residentLimit: Option[Double] = None
|
||||
|
||||
/** Resident memory request */
|
||||
@Argument(doc="Resident memory request", required=false)
|
||||
var residentRequest: Option[Double] = None
|
||||
|
||||
/** the number of SMP cores this job wants */
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
|
|||
/**
|
||||
* Memory limit for the java executable, or if None will use the default memoryLimit.
|
||||
*/
|
||||
@Argument(doc="Java memory limit", required=false)
|
||||
var javaMemoryLimit: Option[Double] = None
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport {
|
|||
var jobErrorFile: File = _
|
||||
|
||||
/** Errors (if any) from the last failed run of jobErrorFiles. */
|
||||
@Argument(doc="Job error lines", required=false)
|
||||
var jobErrorLines: Seq[String] = Nil
|
||||
|
||||
/**
|
||||
* The number of times this function has previously been run.
|
||||
*/
|
||||
@Argument(doc="Job retries", required=false)
|
||||
var retries = 0
|
||||
|
||||
/** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */
|
||||
|
|
@ -541,4 +543,11 @@ object QFunction {
|
|||
classFields
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Seq of fields for a QFunction class.
|
||||
* @param clazz Class to retrieve fields for.
|
||||
* @return the fields of the class.
|
||||
*/
|
||||
def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,17 +24,26 @@
|
|||
|
||||
package org.broadinstitute.sting.queue.function
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
|
||||
object RetryMemoryLimit {
|
||||
private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ )
|
||||
private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
|
||||
}
|
||||
|
||||
/** A mixin that on retry increases the memory limit when certain text is found. */
|
||||
trait RetryMemoryLimit extends CommandLineFunction {
|
||||
|
||||
/** How to increase the memory. By default doubles the memory. */
|
||||
var retryMemoryFunction: (Double => Double) = (2 * _)
|
||||
var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction
|
||||
|
||||
/** Once the threshold is passed, no more memory will be added to memory limit. */
|
||||
@Argument(doc="threshold to stop doubling the memory", required=false)
|
||||
var memoryLimitThreshold: Option[Double] = None
|
||||
|
||||
/** Various strings to look for to determine we ran out of memory. */
|
||||
var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
|
||||
@Argument(doc="text to look for in the errors", required = false)
|
||||
var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText
|
||||
|
||||
override def freezeFieldValues() {
|
||||
super.freezeFieldValues()
|
||||
|
|
@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction {
|
|||
this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold
|
||||
}
|
||||
|
||||
|
||||
override def copySettingsTo(function: QFunction) {
|
||||
super.copySettingsTo(function)
|
||||
function match {
|
||||
case retryMemoryLimit: RetryMemoryLimit =>
|
||||
if (retryMemoryLimit.memoryLimitThreshold.isEmpty)
|
||||
retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold
|
||||
if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction)
|
||||
retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction
|
||||
if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText)
|
||||
retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText
|
||||
case _ => /* ignore */
|
||||
}
|
||||
}
|
||||
|
||||
override def setupRetry() {
|
||||
super.setupRetry()
|
||||
if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) {
|
||||
|
|
|
|||
|
|
@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction}
|
|||
/**
|
||||
* Shadow clones another command line function.
|
||||
*/
|
||||
object CloneFunction {
|
||||
private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction])
|
||||
}
|
||||
|
||||
class CloneFunction extends CommandLineFunction {
|
||||
var originalFunction: ScatterGatherableFunction = _
|
||||
var cloneIndex: Int = _
|
||||
|
|
@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction {
|
|||
var originalValues = Map.empty[ArgumentSource, Any]
|
||||
withScatterPartCount += 1
|
||||
if (withScatterPartCount == 1) {
|
||||
overriddenFields.foreach{
|
||||
case (field, overrideValue) => {
|
||||
originalFunction.functionFields.foreach {
|
||||
case (field) => {
|
||||
originalValues += field -> originalFunction.getFieldValue(field)
|
||||
originalFunction.setFieldValue(field, overrideValue)
|
||||
originalFunction.setFieldValue(field, getFieldValue(field))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -52,9 +56,11 @@ class CloneFunction extends CommandLineFunction {
|
|||
f()
|
||||
} finally {
|
||||
if (withScatterPartCount == 1) {
|
||||
originalValues.foreach{
|
||||
case (name, value) =>
|
||||
originalFunction.setFieldValue(name, value)
|
||||
originalFunction.functionFields.foreach {
|
||||
case (field) => {
|
||||
setFieldValue(field, originalFunction.getFieldValue(field))
|
||||
originalFunction.setFieldValue(field, originalValues(field))
|
||||
}
|
||||
}
|
||||
}
|
||||
withScatterPartCount -= 1
|
||||
|
|
@ -63,6 +69,8 @@ class CloneFunction extends CommandLineFunction {
|
|||
|
||||
override def description = withScatterPart(() => originalFunction.description)
|
||||
override def shortDescription = withScatterPart(() => originalFunction.shortDescription)
|
||||
override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) }
|
||||
|
||||
override protected def functionFieldClass = originalFunction.getClass
|
||||
|
||||
def commandLine = withScatterPart(() => originalFunction.commandLine)
|
||||
|
|
@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction {
|
|||
}
|
||||
|
||||
override def getFieldValue(source: ArgumentSource): AnyRef = {
|
||||
overriddenFields.get(source) match {
|
||||
case Some(value) => value.asInstanceOf[AnyRef]
|
||||
case None => {
|
||||
val value = originalFunction.getFieldValue(source)
|
||||
overriddenFields += source -> value
|
||||
value
|
||||
}
|
||||
CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match {
|
||||
case Some(cloneSource) =>
|
||||
super.getFieldValue(cloneSource)
|
||||
case None =>
|
||||
overriddenFields.get(source) match {
|
||||
case Some(value) =>
|
||||
value.asInstanceOf[AnyRef]
|
||||
case None => {
|
||||
val value = originalFunction.getFieldValue(source)
|
||||
overriddenFields += source -> value
|
||||
value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue