Merge branch 'master' of gsa2:/humgen/gsa-scr1/chartl/dev/unstable

This commit is contained in:
Christopher Hartl 2012-10-12 00:48:42 -04:00
commit 6b9987cf1b
57 changed files with 2234 additions and 528 deletions

View File

@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.incr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.decr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public byte averageInsertionQualsOfMostCommonBase() {
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
public byte averageInsertionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfBase(base, sumInsertionQuals);
}
public byte averageDeletionQualsOfMostCommonBase() {
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
public byte averageDeletionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfBase(base, sumDeletionQuals);
}
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
return (byte) (sumQuals.get(base) / getCount(base));
}
}

View File

@ -41,26 +41,26 @@ import java.util.Map;
@Requires("other != null")
public void add(BaseCounts other) {
for (BaseIndex i : BaseIndex.values())
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) + other.counts.get(i));
}
@Requires("other != null")
public void sub(BaseCounts other) {
for (BaseIndex i : BaseIndex.values())
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) - other.counts.get(i));
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) + 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base, byte qual) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) + 1);
sumQuals.put(i, sumQuals.get(i) + qual);
@ -69,14 +69,14 @@ import java.util.Map;
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) - 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base, byte qual) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) - 1);
sumQuals.put(i, sumQuals.get(i) - qual);
@ -84,52 +84,48 @@ import java.util.Map;
}
@Ensures("result >= 0")
public int getCount(byte base) {
public int getCount(final byte base) {
return getCount(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public int getCount(BaseIndex base) {
public int getCount(final BaseIndex base) {
return counts.get(base);
}
@Ensures("result >= 0")
public long getSumQuals(byte base) {
public long getSumQuals(final byte base) {
return getSumQuals(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public long getSumQuals(BaseIndex base) {
public long getSumQuals(final BaseIndex base) {
return sumQuals.get(base);
}
@Ensures("result >= 0")
public byte averageQuals(byte base) {
public byte averageQuals(final byte base) {
return (byte) (getSumQuals(base) / getCount(base));
}
@Ensures("result >= 0")
public byte averageQuals(BaseIndex base) {
public byte averageQuals(final BaseIndex base) {
return (byte) (getSumQuals(base) / getCount(base));
}
public byte baseWithMostCounts() {
return baseIndexWithMostCounts().getByte();
@Ensures("result >= 0")
public int countOfBase(final BaseIndex base) {
return counts.get(base);
}
@Ensures("result >= 0")
public int countOfMostCommonBase() {
return counts.get(baseIndexWithMostCounts());
public long sumQualsOfBase(final BaseIndex base) {
return sumQuals.get(base);
}
@Ensures("result >= 0")
public long sumQualsOfMostCommonBase() {
return sumQuals.get(baseIndexWithMostCounts());
}
@Ensures("result >= 0")
public byte averageQualsOfMostCommonBase() {
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
public byte averageQualsOfBase(final BaseIndex base) {
return (byte) (sumQualsOfBase(base) / countOfBase(base));
}
@ -149,7 +145,7 @@ import java.util.Map;
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(byte base) {
public double baseCountProportion(final byte base) {
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
}
@ -160,7 +156,7 @@ import java.util.Map;
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(BaseIndex baseIndex) {
public double baseCountProportion(final BaseIndex baseIndex) {
int total = totalCount();
if (total == 0)
return 0.0;
@ -177,22 +173,28 @@ import java.util.Map;
return b.toString();
}
public byte baseWithMostCounts() {
return baseIndexWithMostCounts().getByte();
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostCounts() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex i : counts.keySet())
if (hasHigherCount(i, maxI))
maxI = i;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
}
return maxI;
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex index : counts.keySet())
if (index.isNucleotide() && hasHigherCount(index, mostCounts))
mostCounts = index;
return mostCounts;
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
}
return maxI;
}
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
@ -201,6 +203,30 @@ import java.util.Map;
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
}
public byte baseWithMostProbability() {
return baseIndexWithMostProbability().getByte();
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbability() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
}
@Ensures("result >=0")
public int totalCountWithoutIndels() {
int sum = 0;
@ -218,8 +244,8 @@ import java.util.Map;
*/
@Requires("index.isNucleotide()")
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportionWithoutIndels(BaseIndex index) {
int total = totalCountWithoutIndels();
public double baseCountProportionWithoutIndels(final BaseIndex index) {
final int total = totalCountWithoutIndels();
if (total == 0)
return 0.0;
return (double) counts.get(index) / totalCountWithoutIndels();

View File

@ -182,7 +182,7 @@ public class HeaderElement {
* @return whether or not the HeaderElement is variant due to excess insertions
*/
private boolean isVariantFromMismatches(double minVariantProportion) {
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
}

View File

@ -55,7 +55,7 @@ public class SlidingWindow {
private final int nContigs;
private boolean allowPolyploidReduction;
private boolean allowPolyploidReductionInGeneral;
/**
* The types of synthetic reads to use in the finalizeAndAdd method
@ -117,7 +117,7 @@ public class SlidingWindow {
this.hasIndelQualities = hasIndelQualities;
this.nContigs = nContigs;
this.allowPolyploidReduction = allowPolyploidReduction;
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
}
/**
@ -207,8 +207,9 @@ public class SlidingWindow {
finalizedReads = closeVariantRegions(regions, false);
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getSoftEnd() < getStartLocation(windowHeader)) {
final int windowHeaderStartLoc = getStartLocation(windowHeader);
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getSoftEnd() < windowHeaderStartLoc) {
readsToRemove.add(read);
}
}
@ -291,7 +292,7 @@ public class SlidingWindow {
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
addToFilteredData(header, start, endOfFilteredData, isNegativeStrand);
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
if (endOfFilteredData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
@ -418,7 +419,9 @@ public class SlidingWindow {
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
*/
private void addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
if (filteredDataConsensus == null)
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
@ -434,8 +437,15 @@ public class SlidingWindow {
if (!headerElement.hasFilteredData())
throw new ReviewedStingException("No filtered data in " + index);
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
result.add(finalizeFilteredDataConsensus());
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
}
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
}
return result;
}
/**
@ -472,15 +482,15 @@ public class SlidingWindow {
* @param rms the rms mapping quality in the header element
*/
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
BaseIndex base = baseCounts.baseIndexWithMostCounts();
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfMostCommonBase();
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfBase(base);
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
syntheticRead.add(base, count, qual, insQual, delQual, rms);
}
private List<GATKSAMRecord> compressVariantRegion(int start, int stop) {
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
// Try to compress into a polyploid consensus
@ -490,7 +500,8 @@ public class SlidingWindow {
boolean foundEvent = false;
Object[] header = windowHeader.toArray();
if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction
// foundEvent will remain false if we don't allow polyploid reduction
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
for (int i = start; i<=stop; i++) {
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
if (nHaplotypes > nContigs) {
@ -512,9 +523,6 @@ public class SlidingWindow {
}
}
int refStart = windowHeader.get(start).getLocation();
int refStop = windowHeader.get(stop).getLocation();
// Try to compress the variant region
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
if (canCompress && foundEvent) {
@ -524,6 +532,9 @@ public class SlidingWindow {
// Return all reads that overlap the variant region and remove them from the window header entirely
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
else {
final int refStart = windowHeader.get(start).getLocation();
final int refStop = windowHeader.get(stop).getLocation();
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) {
if (read.getSoftStart() <= refStop) {
@ -549,8 +560,8 @@ public class SlidingWindow {
* @return all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop);
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
@ -570,7 +581,7 @@ public class SlidingWindow {
if (stop < 0 && forceClose)
stop = windowHeader.size() - 1;
if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop));
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
lastStop = stop;
}
}

View File

@ -44,7 +44,7 @@ public class SyntheticRead {
private String contig;
private int contigIndex;
private String readName;
private Integer refStart;
private int refStart;
private boolean hasIndelQualities = false;
private boolean isNegativeStrand = false;
@ -60,7 +60,7 @@ public class SyntheticRead {
* @param refStart the alignment start (reference based)
* @param readTag the reduce reads tag for the synthetic read
*/
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
final int initialCapacity = 10000;
bases = new ArrayList<BaseIndex>(initialCapacity);
counts = new ArrayList<Byte>(initialCapacity);
@ -80,7 +80,7 @@ public class SyntheticRead {
this.isNegativeStrand = isNegativeRead;
}
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
this.bases = bases;
this.counts = counts;
this.quals = quals;
@ -115,11 +115,15 @@ public class SyntheticRead {
this.mappingQuality += mappingQuality;
}
public BaseIndex getBase(int readCoordinate) {
public BaseIndex getBase(final int readCoordinate) {
return bases.get(readCoordinate);
}
/**
public int getRefStart() {
return refStart;
}
/**
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
*
* Invalid reads are :

View File

@ -0,0 +1,228 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 10/2/12
* Time: 10:25 AM
* To change this template use File | Settings | File Templates.
*/
public class ExactAFCalculationPerformanceTest {
final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class);
private static abstract class Analysis {
final GATKReport report;
public Analysis(final String name, final List<String> columns) {
report = GATKReport.newSimpleReport(name, columns);
}
public abstract void run(final ExactAFCalculationTestBuilder testBuilder,
final List<Object> coreColumns);
public String getName() {
return getTable().getTableName();
}
public GATKReportTable getTable() {
return report.getTables().iterator().next();
}
}
private static class AnalyzeByACAndPL extends Analysis {
public AnalyzeByACAndPL(final List<String> columns) {
super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
}
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
final SimpleTimer timer = new SimpleTimer();
for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) {
final ExactAFCalculation calc = testBuilder.makeModel();
final double[] priors = testBuilder.makePriors();
for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL);
timer.start();
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors);
final long runtime = timer.getElapsedTimeNano();
int otherAC = 0;
int nAltSeg = 0;
for ( int i = 0; i < ACs.length; i++ ) {
nAltSeg += ACs[i] > 0 ? 1 : 0;
if ( i > 0 ) otherAC += ACs[i];
}
final List<Object> columns = new LinkedList<Object>(coreValues);
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
report.addRowList(columns);
}
}
}
private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
final List<int[]> ACs = new LinkedList<int[]>();
if ( nAltAlleles == 1 )
for ( int i = 0; i < nChrom; i++ ) {
ACs.add(new int[]{i});
} else if ( nAltAlleles == 2 ) {
for ( int i = 0; i < nChrom; i++ ) {
for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) {
if ( j < nChrom - i )
ACs.add(new int[]{i, j});
}
}
} else {
throw new IllegalStateException("cannot get here");
}
return ACs;
}
}
private static class AnalyzeBySingletonPosition extends Analysis {
public AnalyzeBySingletonPosition(final List<String> columns) {
super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
}
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
final SimpleTimer timer = new SimpleTimer();
for ( final int nonTypePL : Arrays.asList(100) ) {
final ExactAFCalculation calc = testBuilder.makeModel();
final double[] priors = testBuilder.makePriors();
final int[] ac = new int[testBuilder.numAltAlleles];
ac[0] = 1;
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
for ( int position = 0; position < vc.getNSamples(); position++ ) {
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
Collections.rotate(genotypes, position);
vcb.genotypes(genotypes);
timer.start();
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
final long runtime = timer.getElapsedTimeNano();
final List<Object> columns = new LinkedList<Object>(coreValues);
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position));
report.addRowList(columns);
}
}
}
}
private static class AnalyzeByNonInformative extends Analysis {
public AnalyzeByNonInformative(final List<String> columns) {
super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
}
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
final SimpleTimer timer = new SimpleTimer();
for ( final int nonTypePL : Arrays.asList(100) ) {
final ExactAFCalculation calc = testBuilder.makeModel();
final double[] priors = testBuilder.makePriors();
final int[] ac = new int[testBuilder.numAltAlleles];
ac[0] = 1;
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) {
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
final List<Genotype> genotypes = new ArrayList<Genotype>();
genotypes.addAll(vc.getGenotypes().subList(0, nNonInformative + 1));
genotypes.addAll(Collections.nCopies(vc.getNSamples() - nNonInformative, nonInformative));
vcb.genotypes(genotypes);
timer.start();
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
final long runtime = timer.getElapsedTimeNano();
final List<Object> columns = new LinkedList<Object>(coreValues);
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative));
report.addRowList(columns);
}
}
}
}
public static void main(final String[] args) throws Exception {
logger.addAppender(new ConsoleAppender(new SimpleLayout()));
final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
"exact.model", "prior.type", "runtime", "n.evaluations");
final PrintStream out = new PrintStream(new FileOutputStream(args[0]));
final boolean USE_GENERAL = false;
final List<ExactAFCalculationTestBuilder.ModelType> modelTypes = USE_GENERAL
? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values())
: Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact);
final boolean ONLY_HUMAN_PRIORS = false;
final List<ExactAFCalculationTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values())
: Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human);
final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100;
final List<Analysis> analyzes = new ArrayList<Analysis>();
analyzes.add(new AnalyzeByACAndPL(coreColumns));
analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
analyzes.add(new AnalyzeByNonInformative(coreColumns));
for ( int iteration = 0; iteration < 1; iteration++ ) {
for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 )
continue; // skip things that will take forever!
for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) {
for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) {
final ExactAFCalculationTestBuilder testBuilder
= new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType);
for ( final Analysis analysis : analyzes ) {
logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName())));
final List<?> values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType);
analysis.run(testBuilder, (List<Object>)values);
}
}
}
}
}
}
final GATKReport report = new GATKReport();
for ( final Analysis analysis : analyzes )
report.addTable(analysis.getTable());
report.print(out);
out.close();
}
}

View File

@ -0,0 +1,158 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class ExactAFCalculationTestBuilder {
final static Allele A = Allele.create("A", true);
final static Allele C = Allele.create("C");
final static Allele G = Allele.create("G");
final static Allele T = Allele.create("T");
static int sampleNameCounter = 0;
final int nSamples;
final int numAltAlleles;
final ModelType modelType;
final PriorType priorType;
public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles,
final ModelType modelType, final PriorType priorType) {
this.nSamples = nSamples;
this.numAltAlleles = numAltAlleles;
this.modelType = modelType;
this.priorType = priorType;
}
public enum ModelType {
DiploidExact,
OptimizedDiploidExact,
GeneralExact
}
public enum PriorType {
flat,
human
}
public int getnSamples() {
return nSamples;
}
public ExactAFCalculation makeModel() {
switch (modelType) {
case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4);
case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4);
case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
default: throw new RuntimeException("Unexpected type " + modelType);
}
}
public double[] makePriors() {
final int nPriorValues = 2*nSamples+1;
switch ( priorType ) {
case flat:
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
case human:
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);
}
}
public VariantContext makeACTest(final int[] ACs, final int nonTypePL) {
final int nChrom = nSamples * 2;
final int[] nhet = new int[numAltAlleles];
final int[] nhomvar = new int[numAltAlleles];
for ( int i = 0; i < ACs.length; i++ ) {
final double p = ACs[i] / (1.0 * nChrom);
nhomvar[i] = (int)Math.floor(nSamples * p * p);
nhet[i] = ACs[i] - 2 * nhomvar[i];
if ( nhet[i] < 0 )
throw new IllegalStateException("Bug!");
}
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
if ( calcAC != MathUtils.sum(ACs) )
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
return makeACTest(nhet, nhomvar, nonTypePL);
}
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) {
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
for ( int altI = 0; altI < nhet.length; altI++ ) {
for ( int i = 0; i < nhet[altI]; i++ )
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
for ( int i = 0; i < nhomvar[altI]; i++ )
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
}
final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar));
for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0));
samples = samples.subList(0, nSamples);
if ( samples.size() > nSamples )
throw new IllegalStateException("too many samples");
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
vcb.genotypes(samples);
return vcb.make();
}
public List<Allele> getAlleles() {
return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1);
}
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
switch (type) {
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
default: throw new IllegalArgumentException("Unexpected type " + type);
}
}
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
gb.alleles(expectedGT);
gb.PL(pls);
return gb.make();
}
private int numPLs() {
return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
}
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
gb.alleles(getAlleles(type, altI));
final int[] pls = new int[numPLs()];
Arrays.fill(pls, nonTypePL);
int index = 0;
switch ( type ) {
case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
}
pls[index] = 0;
gb.PL(pls);
return gb.make();
}
}

View File

@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class GeneralPloidyExactAFCalculation extends ExactAFCalculation {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
private final int ploidy;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
ploidy = UAC.samplePloidy;
this.UAC = UAC;
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
this.ploidy = ploidy;
}
@Override
protected VariantContext reduceScope(VariantContext vc) {
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
final List<Allele> alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
GLs = subsetAlleles(vc, alleles, false, ploidy);
VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
return builder.make();
} else {
return vc;
}
}
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
return alleles;
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result);
}
@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
combinedPoolLikelihoods.add(set);
for (int p=1; p<genotypeLikelihoods.size(); p++) {
result.reset();
result.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, result);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
@ -226,6 +230,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
while ( !ACqueue.isEmpty() ) {
result.incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);

View File

@ -491,15 +491,15 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
// and we repeat until queue is empty
// queue of AC conformations to process
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
final LinkedList<ExactAFCalculation.ExactACset> ACqueue = new LinkedList<ExactAFCalculation.ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset = new HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset>(likelihoodDim);
// add AC=0 to the queue
final int[] zeroCounts = new int[nAlleles];
zeroCounts[0] = numChromosomes;
AlleleFrequencyCalculationModel.ExactACset zeroSet =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
ExactAFCalculation.ExactACset zeroSet =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
final ExactAFCalculation.ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
// adjust max likelihood seen if needed
@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
int plIdx = 0;
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
while (iterator.hasNext()) {
AlleleFrequencyCalculationModel.ExactACset ACset =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
ExactAFCalculation.ExactACset ACset =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector()));
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
}
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final double maxLog10L,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts,
ExactAFCalculation.ExactACset> indexesToACset,
final ReadBackedPileup pileup) {
// compute likelihood of set
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* @param numObservations Number of observations for each allele
* @param pileup Read backed pileup in case it's necessary
*/
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// Static methods
public static void updateACset(final int[] newSetCounts,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset) {
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index);
indexesToACset.put(index, newSet);
ACqueue.add(newSet);
if (VERBOSE)

View File

@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -283,7 +283,7 @@ public class GenotypingEngine {
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
@ -308,9 +308,20 @@ public class GenotypingEngine {
}
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) {
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
// also, need to update the allele -> haplotype mapping
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
}
call = vcCallTrim;
alleleHashMap = alleleHashMapTrim;
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
}
}

View File

@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = UAC.clone();
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());

View File

@ -40,7 +40,6 @@ import java.util.*;
public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
private final byte constantGCP;
private final boolean DEBUG;
private final PairHMM pairHMM;
@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine {
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
}
}
}
@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine {
return bestHaplotypes;
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
//final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getFirst().getAlleles() )
likelihoodMap.add(read,a,0.0);
likelihoodMap.add(read, a, 0.0);
}
}

View File

@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
String name = String.format("Test-%s", params.bases);
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
}
}

View File

@ -0,0 +1,348 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class ExactAFCalculationModelUnitTest extends BaseTest {
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
static int sampleNameCounter = 0;
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors
final private static boolean INCLUDE_BIALLELIC = true;
final private static boolean INCLUDE_TRIALLELIC = true;
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
@BeforeSuite
public void before() {
AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
}
private Genotype makePL(final List<Allele> expectedGT, int ... pls) {
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
gb.alleles(expectedGT);
gb.PL(pls);
return gb.make();
}
private class GetGLsTest extends TestDataProvider {
GenotypesContext GLs;
int numAltAlleles;
final ExactAFCalculation calc;
final int[] expectedACs;
final double[] priors;
final String priorName;
private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
super(GetGLsTest.class);
GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
this.numAltAlleles = numAltAlleles;
this.calc = calculation;
this.priors = priors;
this.priorName = priorName;
expectedACs = new int[numAltAlleles+1];
for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
expectedACs[alleleI] = 0;
final Allele allele = getAlleles().get(alleleI);
for ( Genotype g : arg ) {
expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
}
}
}
public AlleleFrequencyCalculationResult execute() {
return getCalc().getLog10PNonRef(getVC(), getPriors());
}
public double[] getPriors() {
return priors;
}
public ExactAFCalculation getCalc() {
return calc;
}
public VariantContext getVC() {
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
builder.genotypes(GLs);
return builder.make();
}
public List<Allele> getAlleles() {
return Arrays.asList(Allele.create("A", true),
Allele.create("C"),
Allele.create("G"),
Allele.create("T")).subList(0, numAltAlleles+1);
}
public int getExpectedAltAC(final int alleleI) {
return expectedACs[alleleI+1];
}
public String toString() {
return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
}
}
@DataProvider(name = "wellFormedGLs")
public Object[][] createSimpleGLsData() {
final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
final String priorName = priors == humanPriors ? "human" : "flat";
// bi-allelic
if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
new GetGLsTest(model, 1, genotypes, priors, priorName);
// tri-allelic
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || model != generalCalc || Guillermo_FIXME ) )
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
new GetGLsTest(model, 2, genotypes, priors, priorName);
}
}
}
return GetGLsTest.getTests(GetGLsTest.class);
}
private static class NonInformativeData {
final Genotype nonInformative;
final List<Genotype> called;
final int nAltAlleles;
private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
this.called = called;
this.nonInformative = nonInformative;
this.nAltAlleles = nAltAlleles;
}
}
@DataProvider(name = "GLsWithNonInformative")
public Object[][] makeGLsWithNonInformative() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));
for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
for ( final NonInformativeData testData : nonInformativeTests ) {
final List<Genotype> samples = new ArrayList<Genotype>();
samples.addAll(testData.called);
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
final int nSamples = samples.size();
final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
final double[] priors = new double[2*nSamples+1]; // flat priors
for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
Collections.rotate(samples, 1);
final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
tests.add(new Object[]{onlyInformative, withNonInformative});
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "wellFormedGLs")
public void testGLs(GetGLsTest cfg) {
testResultSimple(cfg);
}
@Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs")
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
final AlleleFrequencyCalculationResult expected = onlyInformative.execute();
final AlleleFrequencyCalculationResult actual = withNonInformative.execute();
testResultSimple(withNonInformative);
Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10LikelihoodOfAFzero());
Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero());
Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero());
Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP());
Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE());
Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP());
Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE());
Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping());
}
private void testResultSimple(final GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = cfg.execute();
Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4);
final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount();
Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations,
"Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations);
Assert.assertNotNull(result.getAllelesUsedInGenotyping());
Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI];
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
}
// TODO
// TODO -- enable when we understand the contract between AC_MAP and pNonRef
// TODO
// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP());
// if ( AC_MAP > 0 ) {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// } else {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// }
}
@Test(enabled = true, dataProvider = "Models")
public void testLargeGLs(final ExactAFCalculation calc) {
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
final AlleleFrequencyCalculationResult result = cfg.execute();
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
Assert.assertEquals(calculatedAlleleCount, 6);
}
@Test(enabled = true, dataProvider = "Models")
public void testMismatchedGLs(final ExactAFCalculation calc) {
final Genotype AB = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000);
final Genotype AC = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100);
GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
final AlleleFrequencyCalculationResult result = cfg.execute();
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
}
@DataProvider(name = "Models")
public Object[][] makeModels() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new DiploidExactAFCalculation(2, 4)});
tests.add(new Object[]{new OptimizedDiploidExactAFCalculation(2, 4)});
tests.add(new Object[]{new GeneralPloidyExactAFCalculation(2, 4, 2)});
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "Models")
public void testBiallelicPriors(final ExactAFCalculation model) {
final int REF_PL = 10;
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL; log10NonRefPrior += 1 ) {
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
final double[] priors = MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2});
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
final AlleleFrequencyCalculationResult result = cfg.execute();
final int actualAC = result.getAlleleCountsOfMAP()[0];
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
final boolean expectNonRef = pRefWithPrior <= pHetWithPrior;
if ( expectNonRef )
Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5);
else
Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5);
final int expectedAC = expectNonRef ? 1 : 0;
Assert.assertEquals(actualAC, expectedAC,
"actual AC with priors " + log10NonRefPrior + " not expected "
+ expectedAC + " priors " + Utils.join(",", priors));
}
}
@Test(enabled = false, dataProvider = "Models")
public void testTriallelicPriors(final ExactAFCalculation model) {
// TODO
// TODO
// TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE
// TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR. TALK TO ERIC AND CONFIRM
// TODO
// TODO
final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000);
final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000);
for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) {
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
final double nonRefPrior = (1-refPrior) / 2;
final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior});
GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior);
final AlleleFrequencyCalculationResult result = cfg.execute();
final int actualAC_AB = result.getAlleleCountsOfMAP()[0];
final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0;
Assert.assertEquals(actualAC_AB, expectedAC_AB,
"actual AC with priors " + log10NonRefPrior + " not expected "
+ expectedAC_AB + " priors " + Utils.join(",", priors));
final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2);
final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele;
final int actualAC_AC = result.getAlleleCountsOfMAP()[1];
final double pRefACWithPrior = AB.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele);
final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele);
final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0;
Assert.assertEquals(actualAC_AC, expectedAC_AC,
"actual AC with priors " + log10NonRefPrior + " not expected "
+ expectedAC_AC + " priors " + Utils.join(",", priors));
}
}
}

View File

@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));

View File

@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1");
HCTestComplexVariants(CEUTRIO_BAM, "", "966da0de8466d21d79f1523488dff6bd");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -81,4 +81,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
executeTest("HCTestStructuralIndels: ", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("864abe729828248333aee14818c1d2e1"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
}

View File

@ -1,13 +1,12 @@
package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection {
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 3;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
}

View File

@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
*/
private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer();
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
// multiple contigs
filePointer.setIsMonolithic(true);
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
*/
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
/**
* How many FilePointers have we pulled from the filePointers iterator?
*/
private int totalFilePointersConsumed = 0;
/**
* Have we encountered a monolithic FilePointer?
*/
private boolean encounteredMonolithicFilePointer = false;
{
createNextContigFilePointer();
advance();
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
logger.info("Loading BAM index data for next contig");
while ( filePointers.hasNext() ) {
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
// it is the ONLY FilePointer we ever encounter
if ( encounteredMonolithicFilePointer ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
}
if ( filePointers.peek().isMonolithic() ) {
if ( totalFilePointersConsumed > 0 ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
}
encounteredMonolithicFilePointer = true;
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
}
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
if ( nextContigFilePointers.isEmpty() ||
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
nextContigFilePointers.add(filePointers.next());
totalFilePointersConsumed++;
}
else {
break; // next FilePointer is on a different contig or has different mapped/unmapped status,

View File

@ -50,10 +50,28 @@ public class FilePointer {
*/
protected final boolean isRegionUnmapped;
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*/
private boolean isMonolithic = false;
/**
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
*/
private Integer contigIndex = null;
public FilePointer( List<GenomeLoc> locations ) {
this.locations.addAll(locations);
this.isRegionUnmapped = checkUnmappedStatus();
validateLocations();
validateAllLocations();
if ( locations.size() > 0 ) {
contigIndex = locations.get(0).getContigIndex();
}
}
public FilePointer( final GenomeLoc... locations ) {
@ -80,8 +98,9 @@ public class FilePointer {
return foundUnmapped;
}
private void validateLocations() {
if ( isRegionUnmapped ) {
private void validateAllLocations() {
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
if ( isRegionUnmapped || isMonolithic ) {
return;
}
@ -89,13 +108,22 @@ public class FilePointer {
for ( GenomeLoc location : locations ) {
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("File pointers must contain intervals from at most one contig");
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
previousContigIndex = location.getContigIndex();
}
}
private void validateLocation( GenomeLoc location ) {
if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
}
if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
}
/**
* Returns an immutable view of this FilePointer's file spans
*
@ -123,6 +151,29 @@ public class FilePointer {
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
}
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
*/
public boolean isMonolithic() {
return isMonolithic;
}
/**
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
* FP may contain intervals from more than one contig.
*
* @param isMonolithic set this FP's monolithic status to this value
*/
public void setIsMonolithic( boolean isMonolithic ) {
this.isMonolithic = isMonolithic;
}
@Override
public boolean equals(final Object other) {
if(!(other instanceof FilePointer))
@ -151,15 +202,12 @@ public class FilePointer {
}
public void addLocation(final GenomeLoc location) {
this.locations.add(location);
checkUnmappedStatus();
validateLocations();
}
validateLocation(location);
public void addLocations( final List<GenomeLoc> locations ) {
this.locations.addAll(locations);
checkUnmappedStatus();
validateLocations();
this.locations.add(location);
if ( contigIndex == null ) {
contigIndex = location.getContigIndex();
}
}
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {

View File

@ -215,19 +215,29 @@ public class ReadShard extends Shard {
int start = Integer.MAX_VALUE;
int stop = Integer.MIN_VALUE;
String contig = null;
boolean foundMapped = false;
for ( final SAMRecord read : reads ) {
if ( contig != null && ! read.getReferenceName().equals(contig) )
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
contig = read.getReferenceName();
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
// with unmapped mates.
if ( ! read.getReadUnmappedFlag() ) {
foundMapped = true;
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
}
}
assert contig != null;
if ( contig.equals("*") ) // all reads are unmapped
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
return GenomeLoc.UNMAPPED;
else
return parser.createGenomeLoc(contig, start, stop);

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
if ( perReadAlleleLikelihoodMap.size() == 0 )
return null;
for ( Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perReadAlleleLikelihoodMap.entrySet() )
depth += sample.getValue().getNumberOfStoredElements();
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = el.getKey();
depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
}
}
}
else
return null;

View File

@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
for ( PileupElement p : pileup ) {
if ( alleleCounts.containsKey(p.getBase()) )
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
}
// we need to add counts in the correct order
@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
alleleCounts.put(allele, 0);
}
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = el.getKey();
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
if (a.isNoCall())
continue; // read is non-informative
if (!vc.getAlleles().contains(a))
continue; // sanity check - shouldn't be needed
alleleCounts.put(a,alleleCounts.get(a)+1);
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
}
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference());

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
}
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
// either SNP with no alignment context, or indels: per-read likelihood map needed
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
return pValueForBestTable(table, null);
}
else
@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
* allele2 # #
* @return a 2x2 contingency table
*/
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
final Allele ref, final Allele alt) {
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
final Allele ref = vc.getReference();
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
int[][] table = new int[2][2];
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
if ( el.getKey().isReducedRead() ) // ignore reduced reads
continue;
final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true);
final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true);
@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column]++;
final GATKSAMRecord read = el.getKey();
table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
}
}
@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
for (PileupElement p : sample.getValue().getBasePileup()) {
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
continue;
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column]++;
table[row][column] += p.getRepresentativeCount();
}
}
}

View File

@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
}
}
private boolean readHasBeenSkipped(GATKSAMRecord read) {
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
}
private boolean isLowQualityBase(GATKSAMRecord read, int offset) {
return read.getBaseQualities()[offset] < minimumQToUse;
private boolean isLowQualityBase( final PileupElement p ) {
return p.getQual() < minimumQToUse;
}
private boolean readNotSeen(GATKSAMRecord read) {
private boolean readNotSeen( final GATKSAMRecord read ) {
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
}
@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
final int offset = p.getOffset();
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
continue;
if (readNotSeen(read)) {

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
*/
public abstract class AlleleFrequencyCalculation implements Cloneable {
private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class);
public enum Model {
/** The default model with the best performance in all cases */
EXACT("ExactAFCalculation");
final String implementationName;
private Model(String implementationName) {
this.implementationName = implementationName;
}
}
protected int nSamples;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected int MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
private SimpleTimer callTimer = new SimpleTimer();
private PrintStream callReport = null;
protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter);
}
protected AlleleFrequencyCalculation(final int nSamples,
final int maxAltAlleles,
final int maxAltAllelesForIndels,
final File exactCallsLog,
final Logger logger,
final PrintStream verboseWriter) {
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
this.nSamples = nSamples;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles;
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = maxAltAllelesForIndels;
this.logger = logger == null ? defaultLogger : logger;
this.verboseWriter = verboseWriter;
if ( exactCallsLog != null )
initializeOutputFile(exactCallsLog);
}
/**
* @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult)
*
* Allocates a new results object. Useful for testing but slow in practice.
*/
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
}
/**
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
*
* @param vc the VariantContext holding the alleles and sample information
* @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
* @param result a pre-allocated (for efficiency) object to hold the result of the calculation
* @return result (for programming convenience)
*/
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
if ( result == null ) throw new IllegalArgumentException("Results object cannot be null");
// reset the result, so we can store our new result there
result.reset();
final VariantContext vcWorking = reduceScope(vc);
callTimer.start();
computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result);
final long nanoTime = callTimer.getElapsedTimeNano();
if ( callReport != null )
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());
result.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return result;
}
// ---------------------------------------------------------------------------
//
// Abstract methods that should be implemented by concrete implementations
// to actually calculate the AF
//
// ---------------------------------------------------------------------------
/**
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
*
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
* may decide to only genotype the best 2 of them.
*
* @param vc the initial VC provided by the caller to this AFcalculation
* @return a potentially simpler VC that's more tractable to genotype
*/
@Requires("vc != null")
@Ensures("result != null")
protected abstract VariantContext reduceScope(final VariantContext vc);
/**
* Actually carry out the log10PNonRef calculation on vc, storing results in results
*
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store results
*/
// TODO -- add consistent requires among args
protected abstract void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
*
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// ---------------------------------------------------------------------------
//
// Print information about the call to the calls log
//
// ---------------------------------------------------------------------------
private void initializeOutputFile(final File outputFile) {
try {
if (outputFile != null) {
callReport = new PrintStream( new FileOutputStream(outputFile) );
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
}
private void printCallInfo(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final double log10PosteriorOfAFzero) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for ( final Allele a : vc.getAlleles() )
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for ( final Genotype g : vc.getGenotypes() )
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
callReport.flush();
}
private void printCallElement(final VariantContext vc,
final Object variable,
final Object key,
final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
}

View File

@ -25,9 +25,12 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -35,9 +38,10 @@ import java.util.Arrays;
* Date: Dec 14, 2011
*
* Useful helper class to communicate the results of the allele frequency calculation
*
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
*/
public class AlleleFrequencyCalculationResult {
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult {
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capability of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
* Get the log10 value of the probability mass at the MLE
*
* @return a log10 prob
*/
@Ensures("goodLog10Value(result)")
public double getLog10MLE() {
return log10MLE;
}
/**
* Get the log10 value of the probability mass at the max. a posterior (MAP)
*
* @return a log10 prob
*/
@Ensures("goodLog10Value(result)")
public double getLog10MAP() {
return log10MAP;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*
* @return a vector with allele counts, not all of which may be meaningful
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MAP
*
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
*
* @return a non-null vector of ints
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
*
* @return the number of evaluations required to produce the answer for this AF calculation
*/
public int getnEvaluations() {
return nEvaluations;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
if ( log10PosteriorMatrixSum == null ) {
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult {
return log10PosteriorMatrixSum;
}
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero;
}
public void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
/**
* Get the list of alleles actually used in genotyping.
*
* Due to computational / implementation constraints this may be smaller than
* the actual list of alleles requested
*
* @return a non-empty list of alleles used during genotyping
*/
@Ensures({"result != null", "! result.isEmpty()"})
public List<Allele> getAllelesUsedInGenotyping() {
if ( allelesUsedInGenotyping == null )
throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set");
return allelesUsedInGenotyping;
}
/**
* Get the normalized -- across all AFs -- of AC == 0, NOT LOG10
* @return
*/
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
// TODO -- we should own these values in a more meaningful way and return good values in the case
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
// @Ensures({"result >= 0.0", "result <= 1.0"})
public double getNormalizedPosteriorOfAFzero() {
return getNormalizedPosteriors()[0];
}
/**
* Get the normalized -- across all AFs -- of AC > 0, NOT LOG10
* @return
*/
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
// TODO -- we should own these values in a more meaningful way and return good values in the case
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
//@Ensures({"result >= 0.0", "result <= 1.0"})
public double getNormalizedPosteriorOfAFGTZero() {
return getNormalizedPosteriors()[1];
}
private double[] getNormalizedPosteriors() {
final double[] posteriors = new double[]{ getLog10PosteriorOfAFzero(), getLog10PosteriorsMatrixSumWithoutAFzero() };
return MathUtils.normalizeFromLog10(posteriors);
}
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
*
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
*/
protected void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED;
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
currentPosteriorsCacheIndex = 0;
log10PosteriorMatrixSum = null;
allelesUsedInGenotyping = null;
}
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
/**
* Tell this result we used one more evaluation cycle
*/
protected void incNEvaluations() {
nEvaluations++;
}
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult {
}
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToPosteriorsCache(log10LofK);
if ( log10LofK > log10MAP ) {
@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult {
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult {
}
}
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
private static boolean goodLog10Value(final double result) {
return result <= 0.0 || Double.isInfinite(result) || Double.isNaN(result);
}
}

View File

@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class DiploidExactAFCalculation extends ExactAFCalculation {
// private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
}
/**
* Dynamically found in UnifiedGenotyperEngine
*
* @param UAC
* @param N
* @param logger
* @param verboseWriter
*/
public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
}
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
@Override
protected VariantContext reduceScope(final VariantContext vc) {
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
VariantContextBuilder builder = new VariantContextBuilder(vc);
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
return builder.make();
} else {
return vc;
}
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
return alleles;
}
private static final int PL_INDEX_OF_HOM_REF = 0;
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
while ( !ACqueue.isEmpty() ) {
result.incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);

View File

@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* The model representing how we calculate a genotype given the priors and a pile
* of bases and quality scores
* Uses the Exact calculation of Heng Li
*/
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
/** The default model with the best performance in all cases */
EXACT
abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
super(UAC, nSamples, logger, verboseWriter);
}
protected int N;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
this.N = N;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.logger = logger;
this.verboseWriter = verboseWriter;
protected ExactAFCalculation(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, exactCallsLog, logger, verboseWriter);
}
/**
@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
return genotypeLikelihoods;
}
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store likelihoods results
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// -------------------------------------------------------------------------------------
//
// protected classes used to store exact model matrix columns

View File

@ -0,0 +1,496 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation {
// private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null);
}
/**
* Dynamically found in UnifiedGenotyperEngine
*
* @param UAC
* @param N
* @param logger
* @param verboseWriter
*/
public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
}
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
}
@Override
protected VariantContext reduceScope(final VariantContext vc) {
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
VariantContextBuilder builder = new VariantContextBuilder(vc);
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
return builder.make();
} else {
return vc;
}
}
private static final int PL_INDEX_OF_HOM_REF = 0;
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
for ( int i = 0; i < numOriginalAltAlleles; i++ )
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
if ( alleles.alleleIndex1 != 0 )
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
// don't double-count it
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
}
}
// sort them by probability mass and choose the best ones
Collections.sort(Arrays.asList(likelihoodSums));
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
for ( int i = 0; i < numAllelesToChoose; i++ )
bestAlleles.add(likelihoodSums[i].allele);
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
for ( Allele allele : vc.getAlternateAlleles() ) {
if ( bestAlleles.contains(allele) )
orderedBestAlleles.add(allele);
}
return orderedBestAlleles;
}
// -------------------------------------------------------------------------------------
//
// Multi-allelic implementation.
//
// -------------------------------------------------------------------------------------
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
// queue of AC conformations to process
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
// add AC=0 to the queue
int[] zeroCounts = new int[numAlternateAlleles];
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
while ( !ACqueue.isEmpty() ) {
result.incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
// adjust max likelihood seen if needed
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
maxLikelihoodSeen.update(log10LofKs, set.ACcounts);
// clean up memory
indexesToACset.remove(set.ACcounts);
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
}
private static final class DependentSet {
public final int[] ACcounts;
public final int PLindex;
public DependentSet(final int[] ACcounts, final int PLindex) {
this.ACcounts = ACcounts;
this.PLindex = PLindex;
}
}
private static double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final MaxLikelihoodSeen maxLikelihoodSeen,
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// can we abort early because the log10Likelihoods are so small?
if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
//if ( DEBUG )
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK;
}
// iterate over higher frequencies if possible
final int ACwiggle = numChr - set.getACsum();
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
final int numAltAlleles = set.ACcounts.getCounts().length;
// add conformations for the k+1 case
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++;
// to get to this conformation, a sample would need to be AB (remember that ref=0)
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
if ( ACwiggle > 1 ) {
final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
// to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
if ( allele_i == allele_j )
sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
else
differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
}
}
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
for ( DependentSet dependent : sameAlleles )
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
return log10LofK;
}
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also pushes its value to the given callingSetIndex.
private static void updateACset(final int[] newSetCounts,
final int numChr,
final ExactACset dependentSet,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final ArrayList<double[]> genotypeLikelihoods) {
final ExactACcounts index = new ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
ExactACset set = new ExactACset(numChr/2 +1, index);
indexesToACset.put(index, set);
ACqueue.add(set);
}
// push data from the dependency to the new set
//if ( DEBUG )
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
}
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
set.log10Likelihoods[0] = 0.0; // the zero case
final int totalK = set.getACsum();
// special case for k = 0 over all k
if ( totalK == 0 ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
// if we got here, then k > 0 for at least one k.
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
}
double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// update the MLE if necessary
result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);
// apply the priors over each alternate allele
for ( final int ACcount : set.ACcounts.getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
}
private static void pushData(final ExactACset targetSet,
final ExactACset dependentSet,
final int PLsetIndex,
final ArrayList<double[]> genotypeLikelihoods) {
final int totalK = targetSet.getACsum();
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
if ( totalK <= 2*j ) { // skip impossible conformations
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue =
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
}
}
}
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1)
// AB: 2k_b * (2j - totalK)
// AC: 2k_c * (2j - totalK)
// BB: k_b * (k_b - 1)
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
// find the 2 alleles that are represented by this PL index
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
// *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. ***
// the AX het case
if ( alleles.alleleIndex1 == 0 )
return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];
final int k_i = ACcounts[alleles.alleleIndex1-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[alleles.alleleIndex2-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}
return coeff;
}
public GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
}
// -------------------------------------------------------------------------------------
//
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
//
// -------------------------------------------------------------------------------------
/**
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
* for the exact model calculation
*/
/*
private final static class ExactACCache {
double[] kMinus2, kMinus1, kMinus0;
private final static double[] create(int n) {
return new double[n];
}
public ExactACCache(int n) {
kMinus2 = create(n);
kMinus1 = create(n);
kMinus0 = create(n);
}
final public void rotate() {
double[] tmp = kMinus2;
kMinus2 = kMinus1;
kMinus1 = kMinus0;
kMinus0 = tmp;
}
final public double[] getkMinus2() {
return kMinus2;
}
final public double[] getkMinus1() {
return kMinus1;
}
final public double[] getkMinus0() {
return kMinus0;
}
}
public int linearExact(GenotypesContext GLs,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
double maxLog10L = Double.NEGATIVE_INFINITY;
boolean done = false;
int lastK = -1;
for (int k=0; k <= numChr && ! done; k++ ) {
final double[] kMinus0 = logY.getkMinus0();
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods.get(j);
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
if (k < 2*j-1)
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
if (k < 2*j)
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
double log10Max;
if (k > 1) {
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
log10Max = approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
log10Max = approximateLog10SumLog10(aa, ab);
}
// finally, update the L(j,k) value
kMinus0[j] = log10Max - logDenominator;
}
}
// update the posteriors vector
final double log10LofK = kMinus0[numSamples];
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
// can we abort early?
lastK = k;
maxLog10L = Math.max(maxLog10L, log10LofK);
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
done = true;
}
logY.rotate();
}
return lastK;
}
final static double approximateLog10SumLog10(double a, double b, double c) {
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
}
*/
}

View File

@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@ -75,10 +75,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Hidden
@Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
// indel-related arguments
/**
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
@ -186,7 +182,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
@ -212,7 +207,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.alleles = alleles;
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
uac.GLmodel = GLmodel;
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
uac.referenceSampleRod = referenceSampleRod;
@ -224,6 +219,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.minReferenceDepth = minReferenceDepth;
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
uac.exactCallsLog = exactCallsLog;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
@ -239,8 +235,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.GenotypingMode = SCAC.GenotypingMode;
this.heterozygosity = SCAC.heterozygosity;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.exactCallsLog = SCAC.exactCallsLog;
}
}

View File

@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
}
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL)
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
}

View File

@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
private ThreadLocal<AlleleFrequencyCalculation> afcm = new ThreadLocal<AlleleFrequencyCalculation>();
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine {
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[2]);
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine {
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
final Allele alternateAllele = vc.getAlternateAllele(i);
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele);
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;
@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine {
}
// calculate p(f>0):
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0];
final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero();
final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero();
double phredScaledConfidence;
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
}
// start constructing the resulting VC
@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine {
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine {
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
@ -513,10 +508,10 @@ public class UnifiedGenotyperEngine {
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext )
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext ) {
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine {
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
}
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine {
return glcm;
}
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
List<Class<? extends AlleleFrequencyCalculation>> afClasses = new PluginManager<AlleleFrequencyCalculation>(AlleleFrequencyCalculation.class).getPlugins();
// user-specified name
String afModelName = UAC.AFmodel.name();
String afModelName = UAC.AFmodel.implementationName;
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
afModelName = GPSTRING + afModelName;
else
afModelName = "Diploid" + afModelName;
for (int i = 0; i < afClasses.size(); i++) {
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
Class<? extends AlleleFrequencyCalculation> afClass = afClasses.get(i);
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
if (afModelName.equalsIgnoreCase(key)) {
try {
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
return (AlleleFrequencyCalculationModel)c.newInstance(args);
return (AlleleFrequencyCalculation)c.newInstance(args);
}
catch (Exception e) {
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
}
}
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {

View File

@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector {
flatPriors = new double[1+2*samples.size()];
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true;

View File

@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
boolean failed = false;
byte[] recordRef = vc.getReference().getBases();
for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) {
if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) {
if ( recordRef[i] != ref[i] ) {
failed = true;
break;
}

View File

@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
protected int numRandom = 0;
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
private VariantContext vc;
RandomVariantStructure(VariantContext vcP) {
vc = vcP;
}
public void set (VariantContext vcP) {
vc = vcP;
}
}
public enum NumberAlleleRestriction {
ALL,
BIALLELIC,
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
/* variables used by the SELECT RANDOM modules */
private boolean SELECT_RANDOM_NUMBER = false;
private boolean SELECT_RANDOM_FRACTION = false;
private int variantNumber = 0;
private int nVariantsAdded = 0;
private int positionToAdd = 0;
private RandomVariantStructure [] variantArray;
//Random number generator for the genotypes to remove
private Random randomGenotypes = new Random();
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
}
SELECT_RANDOM_NUMBER = numRandom > 0;
if (SELECT_RANDOM_NUMBER) {
logger.info("Selecting " + numRandom + " variants at random from the variant track");
variantArray = new RandomVariantStructure[numRandom];
}
SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
break;
}
}
if ( !failedJexlMatch ) {
if (SELECT_RANDOM_NUMBER) {
randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
if ( ! justRead )
vcfWriter.add(sub);
}
if ( !failedJexlMatch &&
!justRead &&
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
vcfWriter.add(sub);
}
}
}
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public void onTraversalDone(Integer result) {
logger.info(result + " records processed.");
if (SELECT_RANDOM_NUMBER) {
int positionToPrint = positionToAdd;
for (int i=0; i<numRandom; i++) {
vcfWriter.add(variantArray[positionToPrint].vc);
positionToPrint = nextCircularPosition(positionToPrint);
}
}
}
@ -746,9 +701,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
GenotypesContext newGC = sub.getGenotypes();
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
if ( vc.getAlleles().size() != sub.getAlleles().size() )
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes());
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
if ( vc.getNSamples() != sub.getNSamples() ) {
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( sawDP )
builder.attribute("DP", depth);
}
private void randomlyAddVariant(int rank, VariantContext vc) {
if (nVariantsAdded < numRandom)
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
else {
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
double t = (1.0/(rank-numRandom+1));
if ( v < t) {
variantArray[positionToAdd].set(vc);
nVariantsAdded++;
positionToAdd = nextCircularPosition(positionToAdd);
}
}
}
private int nextCircularPosition(int cur) {
if ((cur + 1) == variantArray.length)
return 0;
return cur + 1;
}
}

View File

@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.PrintStream;
import java.lang.reflect.Array;
import java.util.*;
/**
@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return records;
}
private static void addFieldValue(Object val, List<List<String>> result) {
private static void addFieldValue(final Object val, final List<List<String>> result) {
final int numResultRecords = result.size();
// if we're trying to create a single output record, add it
if ( numResultRecords == 1 ) {
result.get(0).add(val.toString());
result.get(0).add(prettyPrintObject(val));
}
// if this field is a list of the proper size, add the appropriate entry to each record
else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
}
private static String prettyPrintObject(final Object val) {
if ( val instanceof List )
return prettyPrintObject(((List)val).toArray());
if ( !val.getClass().isArray() )
return val.toString();
final int length = Array.getLength(val);
if ( length == 0 )
return "";
final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0)));
for ( int i = 1; i < length; i++ ) {
sb.append(",");
sb.append(prettyPrintObject(Array.get(val, i)));
}
return sb.toString();
}
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, null, null, allowMissingData, false);
}

View File

@ -236,6 +236,33 @@ public class Utils {
}
}
public static <T> List<T> append(final List<T> left, T ... elts) {
final List<T> l = new LinkedList<T>(left);
l.addAll(Arrays.asList(elts));
return l;
}
/**
* Returns a string of the values in joined by separator, such as A,B,C
*
* @param separator
* @param doubles
* @return
*/
public static String join(String separator, double[] doubles) {
if ( doubles == null || doubles.length == 0)
return "";
else {
StringBuilder ret = new StringBuilder();
ret.append(doubles[0]);
for (int i = 1; i < doubles.length; ++i) {
ret.append(separator);
ret.append(doubles[i]);
}
return ret.toString();
}
}
/**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -157,11 +157,8 @@ public class VariantContextUtils {
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
}
public static Genotype removePLs(Genotype g) {
if ( g.hasLikelihoods() )
return new GenotypeBuilder(g).noPL().make();
else
return g;
public static Genotype removePLsAndAD(final Genotype g) {
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
}
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
@ -573,7 +570,7 @@ public class VariantContextUtils {
}
// if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
for ( final VariantContext vc : VCs ) {
if (vc.alleles.size() == 1)
continue;
@ -581,7 +578,7 @@ public class VariantContextUtils {
if ( ! genotypes.isEmpty() )
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
genotypes = stripPLs(genotypes);
genotypes = stripPLsAndAD(genotypes);
// this will remove stale AC,AF attributed from vc
calculateChromosomeCounts(vc, attributes, true);
break;
@ -672,11 +669,11 @@ public class VariantContextUtils {
return true;
}
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
for ( final Genotype g : genotypes ) {
newGs.add(g.hasLikelihoods() ? removePLs(g) : g);
newGs.add(removePLsAndAD(g));
}
return newGs;
@ -1343,10 +1340,7 @@ public class VariantContextUtils {
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
// TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
// see whether we need to trim common reference base from all alleles
final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false);
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
return inputVC;

View File

@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter {
else if ( val instanceof List ) {
result = formatVCFField(((List)val).toArray());
} else if ( val.getClass().isArray() ) {
int length = Array.getLength(val);
final int length = Array.getLength(val);
if ( length == 0 )
return formatVCFField(null);
StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0)));
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
for ( int i = 1; i < length; i++) {
sb.append(",");
sb.append(formatVCFField(Array.get(val, i)));

View File

@ -1,127 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3;
static double[] priors = new double[2*numSamples+1]; // flat priors
@BeforeSuite
public void before() {
AA1 = new double[]{0.0, -20.0, -20.0};
AB1 = new double[]{-20.0, 0.0, -20.0};
BB1 = new double[]{-20.0, -20.0, 0.0};
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
}
private class GetGLsTest extends TestDataProvider {
GenotypesContext GLs;
int numAltAlleles;
String name;
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
super(GetGLsTest.class, name);
GLs = GenotypesContext.create(arg);
this.name = name;
this.numAltAlleles = numAltAlleles;
}
public String toString() {
return String.format("%s input=%s", super.toString(), GLs);
}
}
private static Genotype createGenotype(String name, double[] gls) {
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
}
@DataProvider(name = "getGLs")
public Object[][] createGLsData() {
// bi-allelic case
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
// tri-allelic case
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
return GetGLsTest.getTests(GetGLsTest.class);
}
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}
@Test
public void testLargeGLs() {
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
Assert.assertEquals(calculatedAlleleCount, 6);
}
@Test
public void testMismatchedGLs() {
final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
}
}

View File

@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095");
}
private void testOutputParameters(final String args, final String md5) {
@ -438,4 +438,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29"));
executeTest("test calling on reads with Ns in CIGAR", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("84486c88a0fd1ae996a6402490db8492"));
executeTest("test calling on a ReducedRead BAM", spec);
}
}

View File

@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d"));
executeTest("test hg18 to hg19, unsorted", spec);
}
@Test
public void testLiftoverFilteringOfIndels() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header",
1,
Arrays.asList("0909a953291a5e701194668c9b8833ab"));
executeTest("test liftover filtering of indels", spec);
}
}

View File

@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823")
Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}

View File

@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testMultiAllelicOneRecord() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
Arrays.asList("0ff49c08690f61a38614606a090f23ea"));
executeTest("testMultiAllelicOneRecord", spec);
}
@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
executeTest("testGenotypeFieldsWithInline", spec);
}
@Test(enabled = true)
public void testListFields() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + privateTestDir + "vcfexample.withMLE.vcf" +
" -T VariantsToTable" +
" -GF PL" +
" -o %s",
1,
Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e"));
executeTest("testGenotypeFields", spec);
}
@Test(enabled = true)
public void testMoltenOutput() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

View File

@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript {
var bamFile: File = _
def script() {
val ug = new UnifiedGenotyper with RetryMemoryLimit
// First run with 1m
ug.memoryLimit = .001
// On retry run with 1g
ug.retryMemoryFunction = (d => d * 1000)
ug.reference_sequence = referenceFile
ug.input_file = Seq(bamFile)
add(ug)
for (scatterCount <- 1 to 2) {
val ug = new UnifiedGenotyper with RetryMemoryLimit
// First run with 1m
ug.memoryLimit = .001
// On retry run with 1g
ug.retryMemoryFunction = (d => d * 1000)
ug.reference_sequence = referenceFile
ug.input_file = Seq(bamFile)
ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount))
ug.scatterCount = scatterCount
add(ug)
}
}
}

View File

@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
private def createQueueHeader() : Seq[String] = {
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
"Copyright (c) 2012 The Broad Institute",
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
"For support and documentation go to http://www.broadinstitute.org/gatk")
}
private def getQueueVersion : String = {

View File

@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.QFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor
/**
* Merges BAM files using net.sf.picard.sam.MergeSamFiles.
*/
class BamGatherFunction extends GatherFunction with PicardBamFunction {
class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit {
this.javaMainClass = "net.sf.picard.sam.MergeSamFiles"
this.assumeSorted = Some(true)
protected def inputBams = gatherParts
protected def outputBam = originalOutput
override def freezeFieldValues {
override def freezeFieldValues() {
val originalGATK = originalFunction.asInstanceOf[CommandLineGATK]
// Whatever the original function can handle, merging *should* do less.

View File

@ -25,13 +25,13 @@
package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.function.QFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
/**
* Merges a vcf text file.
*/
class VcfGatherFunction extends CombineVariants with GatherFunction {
class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit {
this.assumeIdenticalSamples = true
this.suppressCommandLineHeader = true

View File

@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun
override def freezeFieldValues() {
super.freezeFieldValues()
if (outputIndex == null && output != null)
outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.queue.function
import org.broadinstitute.sting.queue.util._
import org.broadinstitute.sting.commandline.Argument
/**
* A command line that will be run in a pipeline.
@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging {
def commandLine: String
/** Upper memory limit */
@Argument(doc="Memory limit", required=false)
var memoryLimit: Option[Double] = None
/** Resident memory limit */
@Argument(doc="Resident memory limit", required=false)
var residentLimit: Option[Double] = None
/** Resident memory request */
@Argument(doc="Resident memory request", required=false)
var residentRequest: Option[Double] = None
/** the number of SMP cores this job wants */

View File

@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
/**
* Memory limit for the java executable, or if None will use the default memoryLimit.
*/
@Argument(doc="Java memory limit", required=false)
var javaMemoryLimit: Option[Double] = None
/**

View File

@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport {
var jobErrorFile: File = _
/** Errors (if any) from the last failed run of jobErrorFiles. */
@Argument(doc="Job error lines", required=false)
var jobErrorLines: Seq[String] = Nil
/**
* The number of times this function has previously been run.
*/
@Argument(doc="Job retries", required=false)
var retries = 0
/** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */
@ -541,4 +543,11 @@ object QFunction {
classFields
}
}
/**
* Returns the Seq of fields for a QFunction class.
* @param clazz Class to retrieve fields for.
* @return the fields of the class.
*/
def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields
}

View File

@ -24,17 +24,26 @@
package org.broadinstitute.sting.queue.function
import org.broadinstitute.sting.commandline.Argument
object RetryMemoryLimit {
private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ )
private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
}
/** A mixin that on retry increases the memory limit when certain text is found. */
trait RetryMemoryLimit extends CommandLineFunction {
/** How to increase the memory. By default doubles the memory. */
var retryMemoryFunction: (Double => Double) = (2 * _)
var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction
/** Once the threshold is passed, no more memory will be added to memory limit. */
@Argument(doc="threshold to stop doubling the memory", required=false)
var memoryLimitThreshold: Option[Double] = None
/** Various strings to look for to determine we ran out of memory. */
var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
@Argument(doc="text to look for in the errors", required = false)
var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText
override def freezeFieldValues() {
super.freezeFieldValues()
@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction {
this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold
}
override def copySettingsTo(function: QFunction) {
super.copySettingsTo(function)
function match {
case retryMemoryLimit: RetryMemoryLimit =>
if (retryMemoryLimit.memoryLimitThreshold.isEmpty)
retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold
if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction)
retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction
if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText)
retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText
case _ => /* ignore */
}
}
override def setupRetry() {
super.setupRetry()
if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) {

View File

@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction}
/**
* Shadow clones another command line function.
*/
object CloneFunction {
private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction])
}
class CloneFunction extends CommandLineFunction {
var originalFunction: ScatterGatherableFunction = _
var cloneIndex: Int = _
@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction {
var originalValues = Map.empty[ArgumentSource, Any]
withScatterPartCount += 1
if (withScatterPartCount == 1) {
overriddenFields.foreach{
case (field, overrideValue) => {
originalFunction.functionFields.foreach {
case (field) => {
originalValues += field -> originalFunction.getFieldValue(field)
originalFunction.setFieldValue(field, overrideValue)
originalFunction.setFieldValue(field, getFieldValue(field))
}
}
}
@ -52,9 +56,11 @@ class CloneFunction extends CommandLineFunction {
f()
} finally {
if (withScatterPartCount == 1) {
originalValues.foreach{
case (name, value) =>
originalFunction.setFieldValue(name, value)
originalFunction.functionFields.foreach {
case (field) => {
setFieldValue(field, originalFunction.getFieldValue(field))
originalFunction.setFieldValue(field, originalValues(field))
}
}
}
withScatterPartCount -= 1
@ -63,6 +69,8 @@ class CloneFunction extends CommandLineFunction {
override def description = withScatterPart(() => originalFunction.description)
override def shortDescription = withScatterPart(() => originalFunction.shortDescription)
override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) }
override protected def functionFieldClass = originalFunction.getClass
def commandLine = withScatterPart(() => originalFunction.commandLine)
@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction {
}
override def getFieldValue(source: ArgumentSource): AnyRef = {
overriddenFields.get(source) match {
case Some(value) => value.asInstanceOf[AnyRef]
case None => {
val value = originalFunction.getFieldValue(source)
overriddenFields += source -> value
value
}
CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match {
case Some(cloneSource) =>
super.getFieldValue(cloneSource)
case None =>
overriddenFields.get(source) match {
case Some(value) =>
value.asInstanceOf[AnyRef]
case None => {
val value = originalFunction.getFieldValue(source)
overriddenFields += source -> value
value
}
}
}
}