diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 98a96fbfb..d5afc5722 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void incr(byte base, byte baseQual, byte insQual, byte delQual) { + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { super.incr(base, baseQual); BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // do not allow Ns @@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void decr(byte base, byte baseQual, byte insQual, byte delQual) { + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { super.decr(base, baseQual); BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // do not allow Ns @@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public byte averageInsertionQualsOfMostCommonBase() { - return getGenericAverageQualOfMostCommonBase(sumInsertionQuals); + public byte averageInsertionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumInsertionQuals); } - public byte averageDeletionQualsOfMostCommonBase() { - return getGenericAverageQualOfMostCommonBase(sumDeletionQuals); + public byte averageDeletionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumDeletionQuals); } - private byte getGenericAverageQualOfMostCommonBase(Map sumQuals) { - BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts()); + private byte getGenericAverageQualOfBase(final BaseIndex base, final Map sumQuals) { return (byte) (sumQuals.get(base) / getCount(base)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 53c36c3f9..94f3c2b6b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -41,26 +41,26 @@ import java.util.Map; @Requires("other != null") public void add(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) + for (final BaseIndex i : BaseIndex.values()) counts.put(i, counts.get(i) + other.counts.get(i)); } @Requires("other != null") public void sub(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) + for (final BaseIndex i : BaseIndex.values()) counts.put(i, counts.get(i) - other.counts.get(i)); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) // no Ns counts.put(i, counts.get(i) + 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(byte base, byte qual) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) + 1); sumQuals.put(i, sumQuals.get(i) + qual); @@ -69,14 +69,14 @@ import java.util.Map; @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) // no Ns counts.put(i, counts.get(i) - 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(byte base, byte qual) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) - 1); sumQuals.put(i, sumQuals.get(i) - qual); @@ -84,52 +84,48 @@ import java.util.Map; } @Ensures("result >= 0") - public int getCount(byte base) { + public int getCount(final byte base) { return getCount(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") - public int getCount(BaseIndex base) { + public int getCount(final BaseIndex base) { return counts.get(base); } @Ensures("result >= 0") - public long getSumQuals(byte base) { + public long getSumQuals(final byte base) { return getSumQuals(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") - public long getSumQuals(BaseIndex base) { + public long getSumQuals(final BaseIndex base) { return sumQuals.get(base); } @Ensures("result >= 0") - public byte averageQuals(byte base) { + public byte averageQuals(final byte base) { return (byte) (getSumQuals(base) / getCount(base)); } @Ensures("result >= 0") - public byte averageQuals(BaseIndex base) { + public byte averageQuals(final BaseIndex base) { return (byte) (getSumQuals(base) / getCount(base)); } - public byte baseWithMostCounts() { - return baseIndexWithMostCounts().getByte(); + @Ensures("result >= 0") + public int countOfBase(final BaseIndex base) { + return counts.get(base); } @Ensures("result >= 0") - public int countOfMostCommonBase() { - return counts.get(baseIndexWithMostCounts()); + public long sumQualsOfBase(final BaseIndex base) { + return sumQuals.get(base); } @Ensures("result >= 0") - public long sumQualsOfMostCommonBase() { - return sumQuals.get(baseIndexWithMostCounts()); - } - - @Ensures("result >= 0") - public byte averageQualsOfMostCommonBase() { - return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase()); + public byte averageQualsOfBase(final BaseIndex base) { + return (byte) (sumQualsOfBase(base) / countOfBase(base)); } @@ -149,7 +145,7 @@ import java.util.Map; * @return the proportion of this base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(byte base) { + public double baseCountProportion(final byte base) { return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount(); } @@ -160,7 +156,7 @@ import java.util.Map; * @return the proportion of this base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(BaseIndex baseIndex) { + public double baseCountProportion(final BaseIndex baseIndex) { int total = totalCount(); if (total == 0) return 0.0; @@ -177,22 +173,28 @@ import java.util.Map; return b.toString(); } + public byte baseWithMostCounts() { + return baseIndexWithMostCounts().getByte(); + } + @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex i : counts.keySet()) - if (hasHigherCount(i, maxI)) - maxI = i; + for (Map.Entry entry : counts.entrySet()) { + if (entry.getValue() > counts.get(maxI)) + maxI = entry.getKey(); + } return maxI; } @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { - BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex index : counts.keySet()) - if (index.isNucleotide() && hasHigherCount(index, mostCounts)) - mostCounts = index; - return mostCounts; + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : counts.entrySet()) { + if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI)) + maxI = entry.getKey(); + } + return maxI; } private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) { @@ -201,6 +203,30 @@ import java.util.Map; return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) ); } + public byte baseWithMostProbability() { + return baseIndexWithMostProbability().getByte(); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbability() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : sumQuals.entrySet()) { + if (entry.getValue() > sumQuals.get(maxI)) + maxI = entry.getKey(); + } + return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts()); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : sumQuals.entrySet()) { + if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI)) + maxI = entry.getKey(); + } + return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + } + @Ensures("result >=0") public int totalCountWithoutIndels() { int sum = 0; @@ -218,8 +244,8 @@ import java.util.Map; */ @Requires("index.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(BaseIndex index) { - int total = totalCountWithoutIndels(); + public double baseCountProportionWithoutIndels(final BaseIndex index) { + final int total = totalCountWithoutIndels(); if (total == 0) return 0.0; return (double) counts.get(index) / totalCountWithoutIndels(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 3fc438b19..0c1854ad1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -182,7 +182,7 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromMismatches(double minVariantProportion) { - BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels(); + BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6c588898c..6fdf85317 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -55,7 +55,7 @@ public class SlidingWindow { private final int nContigs; - private boolean allowPolyploidReduction; + private boolean allowPolyploidReductionInGeneral; /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -117,7 +117,7 @@ public class SlidingWindow { this.hasIndelQualities = hasIndelQualities; this.nContigs = nContigs; - this.allowPolyploidReduction = allowPolyploidReduction; + this.allowPolyploidReductionInGeneral = allowPolyploidReduction; } /** @@ -207,8 +207,9 @@ public class SlidingWindow { finalizedReads = closeVariantRegions(regions, false); List readsToRemove = new LinkedList(); - for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) - if (read.getSoftEnd() < getStartLocation(windowHeader)) { + final int windowHeaderStartLoc = getStartLocation(windowHeader); + for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) + if (read.getSoftEnd() < windowHeaderStartLoc) { readsToRemove.add(read); } } @@ -291,7 +292,7 @@ public class SlidingWindow { reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); int endOfFilteredData = findNextNonFilteredDataElement(header, start, end); - addToFilteredData(header, start, endOfFilteredData, isNegativeStrand); + reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand)); if (endOfFilteredData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); @@ -418,7 +419,9 @@ public class SlidingWindow { * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus */ - private void addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + private List addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + List result = new ArrayList(0); + if (filteredDataConsensus == null) filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); @@ -434,8 +437,15 @@ public class SlidingWindow { if (!headerElement.hasFilteredData()) throw new ReviewedStingException("No filtered data in " + index); + if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) { + result.add(finalizeFilteredDataConsensus()); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); + } + genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); } + + return result; } /** @@ -472,15 +482,15 @@ public class SlidingWindow { * @param rms the rms mapping quality in the header element */ private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { - BaseIndex base = baseCounts.baseIndexWithMostCounts(); - byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE); - byte qual = baseCounts.averageQualsOfMostCommonBase(); - byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase(); - byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase(); + final BaseIndex base = baseCounts.baseIndexWithMostProbability(); + byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); + byte qual = baseCounts.averageQualsOfBase(base); + byte insQual = baseCounts.averageInsertionQualsOfBase(base); + byte delQual = baseCounts.averageDeletionQualsOfBase(base); syntheticRead.add(base, count, qual, insQual, delQual, rms); } - private List compressVariantRegion(int start, int stop) { + private List compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { List allReads = new LinkedList(); // Try to compress into a polyploid consensus @@ -490,7 +500,8 @@ public class SlidingWindow { boolean foundEvent = false; Object[] header = windowHeader.toArray(); - if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction + // foundEvent will remain false if we don't allow polyploid reduction + if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) { for (int i = start; i<=stop; i++) { nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); if (nHaplotypes > nContigs) { @@ -512,9 +523,6 @@ public class SlidingWindow { } } - int refStart = windowHeader.get(start).getLocation(); - int refStop = windowHeader.get(stop).getLocation(); - // Try to compress the variant region // the "foundEvent" protects us from trying to compress variant regions that are created by insertions if (canCompress && foundEvent) { @@ -524,6 +532,9 @@ public class SlidingWindow { // Return all reads that overlap the variant region and remove them from the window header entirely // also remove all reads preceding the variant region (since they will be output as consensus right after compression else { + final int refStart = windowHeader.get(start).getLocation(); + final int refStop = windowHeader.get(stop).getLocation(); + LinkedList toRemove = new LinkedList(); for (GATKSAMRecord read : readsInWindow) { if (read.getSoftStart() <= refStop) { @@ -549,8 +560,8 @@ public class SlidingWindow { * @return all reads contained in the variant region plus any adjacent synthetic reads */ @Requires("start <= stop") - protected List closeVariantRegion(int start, int stop) { - List allReads = compressVariantRegion(start, stop); + protected List closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + List allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); @@ -570,7 +581,7 @@ public class SlidingWindow { if (stop < 0 && forceClose) stop = windowHeader.size() - 1; if (stop >= 0) { - allReads.addAll(closeVariantRegion(start, stop)); + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); lastStop = stop; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index ab65020c3..ccf81dd67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -44,7 +44,7 @@ public class SyntheticRead { private String contig; private int contigIndex; private String readName; - private Integer refStart; + private int refStart; private boolean hasIndelQualities = false; private boolean isNegativeStrand = false; @@ -60,7 +60,7 @@ public class SyntheticRead { * @param refStart the alignment start (reference based) * @param readTag the reduce reads tag for the synthetic read */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { final int initialCapacity = 10000; bases = new ArrayList(initialCapacity); counts = new ArrayList(initialCapacity); @@ -80,7 +80,7 @@ public class SyntheticRead { this.isNegativeStrand = isNegativeRead; } - public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { this.bases = bases; this.counts = counts; this.quals = quals; @@ -115,11 +115,15 @@ public class SyntheticRead { this.mappingQuality += mappingQuality; } - public BaseIndex getBase(int readCoordinate) { + public BaseIndex getBase(final int readCoordinate) { return bases.get(readCoordinate); } - /** + public int getRefStart() { + return refStart; + } + + /** * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid. * * Invalid reads are : diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java new file mode 100644 index 000000000..b4d041061 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java @@ -0,0 +1,228 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 10/2/12 + * Time: 10:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ExactAFCalculationPerformanceTest { + final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class); + + private static abstract class Analysis { + final GATKReport report; + + public Analysis(final String name, final List columns) { + report = GATKReport.newSimpleReport(name, columns); + } + + public abstract void run(final ExactAFCalculationTestBuilder testBuilder, + final List coreColumns); + + public String getName() { + return getTable().getTableName(); + } + + public GATKReportTable getTable() { + return report.getTables().iterator().next(); + } + } + + private static class AnalyzeByACAndPL extends Analysis { + public AnalyzeByACAndPL(final List columns) { + super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) { + final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors); + final long runtime = timer.getElapsedTimeNano(); + + int otherAC = 0; + int nAltSeg = 0; + for ( int i = 0; i < ACs.length; i++ ) { + nAltSeg += ACs[i] > 0 ? 1 : 0; + if ( i > 0 ) otherAC += ACs[i]; + } + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC)); + report.addRowList(columns); + } + } + } + + private List makeACs(final int nAltAlleles, final int nChrom) { + if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3"); + + final List ACs = new LinkedList(); + + if ( nAltAlleles == 1 ) + for ( int i = 0; i < nChrom; i++ ) { + ACs.add(new int[]{i}); + } else if ( nAltAlleles == 2 ) { + for ( int i = 0; i < nChrom; i++ ) { + for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) { + if ( j < nChrom - i ) + ACs.add(new int[]{i, j}); + } + } + } else { + throw new IllegalStateException("cannot get here"); + } + + return ACs; + } + } + + private static class AnalyzeBySingletonPosition extends Analysis { + public AnalyzeBySingletonPosition(final List columns) { + super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); + + for ( int position = 0; position < vc.getNSamples(); position++ ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final List genotypes = new ArrayList(vc.getGenotypes()); + Collections.rotate(genotypes, position); + vcb.genotypes(genotypes); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors); + final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position)); + report.addRowList(columns); + } + } + } + } + + private static class AnalyzeByNonInformative extends Analysis { + public AnalyzeByNonInformative(final List columns) { + super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); + final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); + + for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + + final List genotypes = new ArrayList(); + genotypes.addAll(vc.getGenotypes().subList(0, nNonInformative + 1)); + genotypes.addAll(Collections.nCopies(vc.getNSamples() - nNonInformative, nonInformative)); + vcb.genotypes(genotypes); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors); + final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative)); + report.addRowList(columns); + } + } + } + } + + public static void main(final String[] args) throws Exception { + logger.addAppender(new ConsoleAppender(new SimpleLayout())); + + final List coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples", + "exact.model", "prior.type", "runtime", "n.evaluations"); + + final PrintStream out = new PrintStream(new FileOutputStream(args[0])); + + final boolean USE_GENERAL = false; + final List modelTypes = USE_GENERAL + ? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values()) + : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact); + + final boolean ONLY_HUMAN_PRIORS = false; + final List priorTypes = ONLY_HUMAN_PRIORS + ? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values()) + : Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human); + + final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100; + + final List analyzes = new ArrayList(); + analyzes.add(new AnalyzeByACAndPL(coreColumns)); + analyzes.add(new AnalyzeBySingletonPosition(coreColumns)); + analyzes.add(new AnalyzeByNonInformative(coreColumns)); + + for ( int iteration = 0; iteration < 1; iteration++ ) { + for ( final int nAltAlleles : Arrays.asList(1, 2) ) { + for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) { + if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 ) + continue; // skip things that will take forever! + + for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) { + for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) { + final ExactAFCalculationTestBuilder testBuilder + = new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType); + + for ( final Analysis analysis : analyzes ) { + logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName()))); + final List values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType); + analysis.run(testBuilder, (List)values); + } + } + } + } + } + } + + final GATKReport report = new GATKReport(); + for ( final Analysis analysis : analyzes ) + report.addTable(analysis.getTable()); + report.print(out); + out.close(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java new file mode 100644 index 000000000..f472a1140 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java @@ -0,0 +1,158 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class ExactAFCalculationTestBuilder { + final static Allele A = Allele.create("A", true); + final static Allele C = Allele.create("C"); + final static Allele G = Allele.create("G"); + final static Allele T = Allele.create("T"); + + static int sampleNameCounter = 0; + + final int nSamples; + final int numAltAlleles; + final ModelType modelType; + final PriorType priorType; + + public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles, + final ModelType modelType, final PriorType priorType) { + this.nSamples = nSamples; + this.numAltAlleles = numAltAlleles; + this.modelType = modelType; + this.priorType = priorType; + } + + public enum ModelType { + DiploidExact, + OptimizedDiploidExact, + GeneralExact + } + + public enum PriorType { + flat, + human + } + + public int getnSamples() { + return nSamples; + } + + public ExactAFCalculation makeModel() { + switch (modelType) { + case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4); + case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4); + case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + default: throw new RuntimeException("Unexpected type " + modelType); + } + } + + public double[] makePriors() { + final int nPriorValues = 2*nSamples+1; + + switch ( priorType ) { + case flat: + return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + case human: + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + return humanPriors; + default: + throw new RuntimeException("Unexpected type " + priorType); + } + } + + public VariantContext makeACTest(final int[] ACs, final int nonTypePL) { + final int nChrom = nSamples * 2; + + final int[] nhet = new int[numAltAlleles]; + final int[] nhomvar = new int[numAltAlleles]; + + for ( int i = 0; i < ACs.length; i++ ) { + final double p = ACs[i] / (1.0 * nChrom); + nhomvar[i] = (int)Math.floor(nSamples * p * p); + nhet[i] = ACs[i] - 2 * nhomvar[i]; + + if ( nhet[i] < 0 ) + throw new IllegalStateException("Bug!"); + } + + final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar); + if ( calcAC != MathUtils.sum(ACs) ) + throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs)); + + return makeACTest(nhet, nhomvar, nonTypePL); + } + + public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) { + List samples = new ArrayList(nSamples); + + for ( int altI = 0; altI < nhet.length; altI++ ) { + for ( int i = 0; i < nhet[altI]; i++ ) + samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1)); + for ( int i = 0; i < nhomvar[altI]; i++ ) + samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1)); + } + + final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)); + for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0)); + + samples = samples.subList(0, nSamples); + + if ( samples.size() > nSamples ) + throw new IllegalStateException("too many samples"); + + VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles()); + vcb.genotypes(samples); + return vcb.make(); + } + + public List getAlleles() { + return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1); + } + + public List getAlleles(final GenotypeType type, final int altI) { + switch (type) { + case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0)); + case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI)); + case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI)); + default: throw new IllegalArgumentException("Unexpected type " + type); + } + } + + public Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private int numPLs() { + return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2); + } + + public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(getAlleles(type, altI)); + + final int[] pls = new int[numPLs()]; + Arrays.fill(pls, nonTypePL); + + int index = 0; + switch ( type ) { + case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break; + case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break; + case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break; + } + pls[index] = 0; + gb.PL(pls); + + return gb.make(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java index 5662d82d6..da3ed2a02 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java @@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; -public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel { +public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them - final protected UnifiedArgumentCollection UAC; private final int ploidy; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); ploidy = UAC.samplePloidy; - this.UAC = UAC; - } - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); + public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null); + this.ploidy = ploidy; + } + @Override + protected VariantContext reduceScope(VariantContext vc) { // don't try to genotype too many alternate alleles if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + final List alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy)); - - GLs = subsetAlleles(vc, alleles, false, ploidy); + VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); + return builder.make(); + } else { + return vc; } + } - combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result); - - return alleles; + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result); } @@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula combinedPoolLikelihoods.add(set); for (int p=1; p ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(likelihoodDim); + final HashMap indexesToACset = new HashMap(likelihoodDim); // add AC=0 to the queue final int[] zeroCounts = new int[nAlleles]; zeroCounts[0] = numChromosomes; - AlleleFrequencyCalculationModel.ExactACset zeroSet = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts)); + ExactAFCalculation.ExactACset zeroSet = + new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts)); ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); @@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods - final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove(); + final ExactAFCalculation.ExactACset ACset = ACqueue.remove(); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); // adjust max likelihood seen if needed @@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods { int plIdx = 0; SumIterator iterator = new SumIterator(nAlleles, numChromosomes); while (iterator.hasNext()) { - AlleleFrequencyCalculationModel.ExactACset ACset = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector())); + ExactAFCalculation.ExactACset ACset = + new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector())); // for observed base X, add Q(jX,k) to likelihood vector for all k in error model //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); @@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods { } - private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set, + private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set, final ErrorModel errorModel, final List alleleList, final List numObservations, final double maxLog10L, - final LinkedList ACqueue, - final HashMap indexesToACset, + final LinkedList ACqueue, + final HashMap indexesToACset, final ReadBackedPileup pileup) { // compute likelihood of set getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); @@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * @param numObservations Number of observations for each allele * @param pileup Read backed pileup in case it's necessary */ - public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, @@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // Static methods public static void updateACset(final int[] newSetCounts, - final LinkedList ACqueue, - final HashMap indexesToACset) { + final LinkedList ACqueue, + final HashMap indexesToACset) { - final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts); + final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { - AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index); + ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index); indexesToACset.put(index, newSet); ACqueue.add(newSet); if (VERBOSE) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index ac212cfb5..d038934ba 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index 944372907..fc9910cc0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 192befe67..8738def50 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -283,7 +283,7 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - final HashMap> alleleHashMap = new HashMap>(); + HashMap> alleleHashMap = new HashMap>(); int aCount = 0; for( final Allele a : mergedVC.getAlleles() ) { alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper @@ -308,9 +308,20 @@ public class GenotypingEngine { } genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - + VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); if( call != null ) { + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! + final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); + // also, need to update the allele -> haplotype mapping + final HashMap> alleleHashMapTrim = new HashMap>(); + for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + } + + call = vcCallTrim; + alleleHashMap = alleleHashMapTrim; + } + returnCalls.add( new Pair>>(call, alleleHashMap) ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index f4d8a88e0..71e4f5f8a 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested + UnifiedArgumentCollection simpleUAC = UAC.clone(); + simpleUAC.exactCallsLog = null; + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index db289ecab..072f81db9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -40,7 +40,6 @@ import java.util.*; public class LikelihoodCalculationEngine { private static final double LOG_ONE_HALF = -Math.log10(2.0); - private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1; private final byte constantGCP; private final boolean DEBUG; private final PairHMM pairHMM; @@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine { haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); } } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } } } @@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } - public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, + final HashMap> perSampleReadList, + final HashMap> perSampleFilteredReadList, + final Pair>> call) { final Map returnMap = new HashMap(); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - //final Map> alleleReadMap = new HashMap>(); final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); final ArrayList readsForThisSample = sample.getValue(); @@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { for( final Allele a : call.getFirst().getAlleles() ) - likelihoodMap.add(read,a,0.0); + likelihoodMap.add(read, a, 0.0); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java index a8707641a..3e5cbf0e8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest { String name = String.format("Test-%s", params.bases); Assert.assertEquals(counts.totalCount(), params.bases.length(), name); - Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); + Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name); Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java new file mode 100644 index 000000000..602009654 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -0,0 +1,348 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class ExactAFCalculationModelUnitTest extends BaseTest { + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + static int sampleNameCounter = 0; + static Genotype AA1, AB1, BB1, NON_INFORMATIVE1; + static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2; + final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors + final private static boolean INCLUDE_BIALLELIC = true; + final private static boolean INCLUDE_TRIALLELIC = true; + final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug + + @BeforeSuite + public void before() { + AA1 = makePL(Arrays.asList(A, A), 0, 20, 20); + AB1 = makePL(Arrays.asList(A, C), 20, 0, 20); + BB1 = makePL(Arrays.asList(C, C), 20, 20, 0); + NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); + + AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20); + AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20); + BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20); + AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20); + BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20); + CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0); + NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0); + } + + private Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + final ExactAFCalculation calc; + final int[] expectedACs; + final double[] priors; + final String priorName; + + private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List arg, final double[] priors, final String priorName) { + super(GetGLsTest.class); + GLs = GenotypesContext.create(new ArrayList(arg)); + this.numAltAlleles = numAltAlleles; + this.calc = calculation; + this.priors = priors; + this.priorName = priorName; + + expectedACs = new int[numAltAlleles+1]; + for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) { + expectedACs[alleleI] = 0; + final Allele allele = getAlleles().get(alleleI); + for ( Genotype g : arg ) { + expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele); + } + } + } + + public AlleleFrequencyCalculationResult execute() { + return getCalc().getLog10PNonRef(getVC(), getPriors()); + } + + public double[] getPriors() { + return priors; + } + + public ExactAFCalculation getCalc() { + return calc; + } + + public VariantContext getVC() { + VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles()); + builder.genotypes(GLs); + return builder.make(); + } + + public List getAlleles() { + return Arrays.asList(Allele.create("A", true), + Allele.create("C"), + Allele.create("G"), + Allele.create("T")).subList(0, numAltAlleles+1); + } + + public int getExpectedAltAC(final int alleleI) { + return expectedACs[alleleI+1]; + } + + public String toString() { + return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(), + priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs); + } + } + + @DataProvider(name = "wellFormedGLs") + public Object[][] createSimpleGLsData() { + final List biAllelicSamples = Arrays.asList(AA1, AB1, BB1); + final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); + + for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { + final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + + final int nPriorValues = 2*nSamples+1; + final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + + for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { + for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) { + final String priorName = priors == humanPriors ? "human" : "flat"; + + // bi-allelic + if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() ) + for ( List genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 1, genotypes, priors, priorName); + + // tri-allelic + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || model != generalCalc || Guillermo_FIXME ) ) + for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 2, genotypes, priors, priorName); + } + } + } + + return GetGLsTest.getTests(GetGLsTest.class); + } + + private static class NonInformativeData { + final Genotype nonInformative; + final List called; + final int nAltAlleles; + + private NonInformativeData(List called, Genotype nonInformative, int nAltAlleles) { + this.called = called; + this.nonInformative = nonInformative; + this.nAltAlleles = nAltAlleles; + } + } + + @DataProvider(name = "GLsWithNonInformative") + public Object[][] makeGLsWithNonInformative() { + List tests = new ArrayList(); + + final List nonInformativeTests = new LinkedList(); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2)); + + for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) { + for ( final NonInformativeData testData : nonInformativeTests ) { + final List samples = new ArrayList(); + samples.addAll(testData.called); + samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); + + final int nSamples = samples.size(); + final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + final double[] priors = new double[2*nSamples+1]; // flat priors + + for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) { + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); + + for ( int rotation = 0; rotation < nSamples; rotation++ ) { + Collections.rotate(samples, 1); + final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat"); + tests.add(new Object[]{onlyInformative, withNonInformative}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "wellFormedGLs") + public void testGLs(GetGLsTest cfg) { + testResultSimple(cfg); + } + + @Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs") + public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) { + final AlleleFrequencyCalculationResult expected = onlyInformative.execute(); + final AlleleFrequencyCalculationResult actual = withNonInformative.execute(); + + testResultSimple(withNonInformative); + + Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10LikelihoodOfAFzero()); + Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero()); + Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero()); + Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP()); + Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE()); + Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP()); + Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE()); + Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping()); + } + + private void testResultSimple(final GetGLsTest cfg) { + final AlleleFrequencyCalculationResult result = cfg.execute(); + + Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4); + + final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount(); + Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations, + "Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations); + Assert.assertNotNull(result.getAllelesUsedInGenotyping()); + Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list"); + + for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) { + int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI); + int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI]; + + final Allele allele = cfg.getAlleles().get(altAlleleI+1); + Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele); + } + + // TODO + // TODO -- enable when we understand the contract between AC_MAP and pNonRef + // TODO +// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP()); +// if ( AC_MAP > 0 ) { +// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero()); +// } else { +// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero()); +// } + } + + @Test(enabled = true, dataProvider = "Models") + public void testLargeGLs(final ExactAFCalculation calc) { + final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0); + GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat"); + + final AlleleFrequencyCalculationResult result = cfg.execute(); + + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } + + @Test(enabled = true, dataProvider = "Models") + public void testMismatchedGLs(final ExactAFCalculation calc) { + final Genotype AB = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000); + final Genotype AC = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100); + GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat"); + + final AlleleFrequencyCalculationResult result = cfg.execute(); + + Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1); + Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1); + } + + @DataProvider(name = "Models") + public Object[][] makeModels() { + List tests = new ArrayList(); + + tests.add(new Object[]{new DiploidExactAFCalculation(2, 4)}); + tests.add(new Object[]{new OptimizedDiploidExactAFCalculation(2, 4)}); + tests.add(new Object[]{new GeneralPloidyExactAFCalculation(2, 4, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "Models") + public void testBiallelicPriors(final ExactAFCalculation model) { + final int REF_PL = 10; + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000); + + for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL; log10NonRefPrior += 1 ) { + final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); + final double[] priors = MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}); + GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); + final AlleleFrequencyCalculationResult result = cfg.execute(); + final int actualAC = result.getAlleleCountsOfMAP()[0]; + + final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1]; + final boolean expectNonRef = pRefWithPrior <= pHetWithPrior; + + if ( expectNonRef ) + Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5); + else + Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5); + + final int expectedAC = expectNonRef ? 1 : 0; + Assert.assertEquals(actualAC, expectedAC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC + " priors " + Utils.join(",", priors)); + } + } + + @Test(enabled = false, dataProvider = "Models") + public void testTriallelicPriors(final ExactAFCalculation model) { + // TODO + // TODO + // TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE + // TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR. TALK TO ERIC AND CONFIRM + // TODO + // TODO + final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000); + final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000); + + for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) { + final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); + final double nonRefPrior = (1-refPrior) / 2; + final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior}); + GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior); + final AlleleFrequencyCalculationResult result = cfg.execute(); + final int actualAC_AB = result.getAlleleCountsOfMAP()[0]; + + final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1]; + final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0; + Assert.assertEquals(actualAC_AB, expectedAC_AB, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC_AB + " priors " + Utils.join(",", priors)); + + final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2); + final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele; + final int actualAC_AC = result.getAlleleCountsOfMAP()[1]; + final double pRefACWithPrior = AB.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele); + final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele); + final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0; + Assert.assertEquals(actualAC_AC, expectedAC_AC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC_AC + " priors " + Utils.join(",", priors)); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java index 983f562d2..a646e6f09 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java @@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); + GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index b4ac2b86d..e94c9705c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1"); + HCTestComplexVariants(CEUTRIO_BAM, "", "966da0de8466d21d79f1523488dff6bd"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -81,4 +81,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HCTestStructuralIndels: ", spec); } + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("864abe729828248333aee14818c1d2e1")); + executeTest("HC calling on a ReducedRead BAM", spec); + } + + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index f30fc0316..085a60191 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -1,13 +1,12 @@ package org.broadinstitute.sting.gatk.arguments; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; + /** * Created with IntelliJ IDEA. * User: rpoplin @@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection { @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 3; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. + */ + @Advanced + @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) + public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + + @Hidden + @Argument(shortName = "logExactCalls", doc="x", required=false) + public File exactCallsLog = null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index d0e310d3f..8ee7e0439 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator { */ private FilePointer generatePointerOverEntireFileset() { FilePointer filePointer = new FilePointer(); + + // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is + // the only FilePointer we will create. This allows us to have this FilePointer represent regions from + // multiple contigs + filePointer.setIsMonolithic(true); + Map currentPosition; // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java index 6c064cf86..0440c7eae 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { */ private PeekableIterator currentContigReadsIterator = null; + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + + { createNextContigFilePointer(); advance(); @@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { logger.info("Loading BAM index data for next contig"); while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge if ( nextContigFilePointers.isEmpty() || @@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; } else { break; // next FilePointer is on a different contig or has different mapped/unmapped status, diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index 50f4e0273..197015641 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -50,10 +50,28 @@ public class FilePointer { */ protected final boolean isRegionUnmapped; + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + */ + private boolean isMonolithic = false; + + /** + * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers + */ + private Integer contigIndex = null; + + public FilePointer( List locations ) { this.locations.addAll(locations); this.isRegionUnmapped = checkUnmappedStatus(); - validateLocations(); + + validateAllLocations(); + if ( locations.size() > 0 ) { + contigIndex = locations.get(0).getContigIndex(); + } } public FilePointer( final GenomeLoc... locations ) { @@ -80,8 +98,9 @@ public class FilePointer { return foundUnmapped; } - private void validateLocations() { - if ( isRegionUnmapped ) { + private void validateAllLocations() { + // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction + if ( isRegionUnmapped || isMonolithic ) { return; } @@ -89,13 +108,22 @@ public class FilePointer { for ( GenomeLoc location : locations ) { if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { - throw new ReviewedStingException("File pointers must contain intervals from at most one contig"); + throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig"); } previousContigIndex = location.getContigIndex(); } } + private void validateLocation( GenomeLoc location ) { + if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + } + if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { + throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + } + /** * Returns an immutable view of this FilePointer's file spans * @@ -123,6 +151,29 @@ public class FilePointer { return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + * + * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false + */ + public boolean isMonolithic() { + return isMonolithic; + } + + /** + * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all + * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic + * FP may contain intervals from more than one contig. + * + * @param isMonolithic set this FP's monolithic status to this value + */ + public void setIsMonolithic( boolean isMonolithic ) { + this.isMonolithic = isMonolithic; + } + @Override public boolean equals(final Object other) { if(!(other instanceof FilePointer)) @@ -151,15 +202,12 @@ public class FilePointer { } public void addLocation(final GenomeLoc location) { - this.locations.add(location); - checkUnmappedStatus(); - validateLocations(); - } + validateLocation(location); - public void addLocations( final List locations ) { - this.locations.addAll(locations); - checkUnmappedStatus(); - validateLocations(); + this.locations.add(location); + if ( contigIndex == null ) { + contigIndex = location.getContigIndex(); + } } public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 662c7526b..27e666f6f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -215,19 +215,29 @@ public class ReadShard extends Shard { int start = Integer.MAX_VALUE; int stop = Integer.MIN_VALUE; String contig = null; + boolean foundMapped = false; for ( final SAMRecord read : reads ) { if ( contig != null && ! read.getReferenceName().equals(contig) ) throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. " + "First contig is " + contig + " next read was " + read.getReferenceName() ); contig = read.getReferenceName(); - if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); - if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + + // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates + // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, + // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment + // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* + // with unmapped mates. + if ( ! read.getReadUnmappedFlag() ) { + foundMapped = true; + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } } assert contig != null; - if ( contig.equals("*") ) // all reads are unmapped + if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped return GenomeLoc.UNMAPPED; else return parser.createGenomeLoc(contig, start, stop); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index ec3f1e5c7..1cc88fc24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno if ( perReadAlleleLikelihoodMap.size() == 0 ) return null; - for ( Map.Entry sample : perReadAlleleLikelihoodMap.entrySet() ) - depth += sample.getValue().getNumberOfStoredElements(); + for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + } + } } else return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index ee9b51b56..d7790969e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for ( PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order @@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa alleleCounts.put(allele, 0); } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed - alleleCounts.put(a,alleleCounts.get(a)+1); + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index e95af71c2..ec0393cdc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed - final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); return pValueForBestTable(table, null); } else @@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, - final Allele ref, final Allele alt) { + private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + final Allele ref = vc.getReference(); + final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); int[][] table = new int[2][2]; for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - if ( el.getKey().isReducedRead() ) // ignore reduced reads - continue; final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); @@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; - table[row][column]++; + final GATKSAMRecord read = el.getKey(); + table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } @@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads + if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) @@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; - table[row][column]++; + table[row][column] += p.getRepresentativeCount(); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index ee6a619fd..2c7a12a36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } } - private boolean readHasBeenSkipped(GATKSAMRecord read) { + private boolean readHasBeenSkipped( final GATKSAMRecord read ) { return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE); } - private boolean isLowQualityBase(GATKSAMRecord read, int offset) { - return read.getBaseQualities()[offset] < minimumQToUse; + private boolean isLowQualityBase( final PileupElement p ) { + return p.getQual() < minimumQToUse; } - private boolean readNotSeen(GATKSAMRecord read) { + private boolean readNotSeen( final GATKSAMRecord read ) { return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE); } @@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed final int offset = p.getOffset(); // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases) - if (readHasBeenSkipped(read) || isLowQualityBase(read, offset)) + if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) ) continue; if (readNotSeen(read)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java new file mode 100755 index 000000000..fc578a5bd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; + + +/** + * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods + */ +public abstract class AlleleFrequencyCalculation implements Cloneable { + private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class); + + public enum Model { + /** The default model with the best performance in all cases */ + EXACT("ExactAFCalculation"); + + final String implementationName; + + private Model(String implementationName) { + this.implementationName = implementationName; + } + } + + protected int nSamples; + protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; + protected int MAX_ALTERNATE_ALLELES_FOR_INDELS; + + protected Logger logger; + protected PrintStream verboseWriter; + + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + + private SimpleTimer callTimer = new SimpleTimer(); + private PrintStream callReport = null; + + protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { + this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter); + } + + protected AlleleFrequencyCalculation(final int nSamples, + final int maxAltAlleles, + final int maxAltAllelesForIndels, + final File exactCallsLog, + final Logger logger, + final PrintStream verboseWriter) { + if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); + + this.nSamples = nSamples; + this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = maxAltAllelesForIndels; + this.logger = logger == null ? defaultLogger : logger; + this.verboseWriter = verboseWriter; + if ( exactCallsLog != null ) + initializeOutputFile(exactCallsLog); + } + + /** + * @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult) + * + * Allocates a new results object. Useful for testing but slow in practice. + */ + public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); + } + + /** + * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc + * + * @param vc the VariantContext holding the alleles and sample information + * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i) + * @param result a pre-allocated (for efficiency) object to hold the result of the calculation + * @return result (for programming convenience) + */ + public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); + if ( result == null ) throw new IllegalArgumentException("Results object cannot be null"); + + // reset the result, so we can store our new result there + result.reset(); + + final VariantContext vcWorking = reduceScope(vc); + + callTimer.start(); + computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result); + final long nanoTime = callTimer.getElapsedTimeNano(); + + if ( callReport != null ) + printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero()); + + result.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return result; + } + + // --------------------------------------------------------------------------- + // + // Abstract methods that should be implemented by concrete implementations + // to actually calculate the AF + // + // --------------------------------------------------------------------------- + + /** + * Look at VC and perhaps return a new one of reduced complexity, if that's necessary + * + * Used before the call to computeLog10PNonRef to simply the calculation job at hand, + * if vc exceeds bounds. For example, if VC has 100 alt alleles this function + * may decide to only genotype the best 2 of them. + * + * @param vc the initial VC provided by the caller to this AFcalculation + * @return a potentially simpler VC that's more tractable to genotype + */ + @Requires("vc != null") + @Ensures("result != null") + protected abstract VariantContext reduceScope(final VariantContext vc); + + /** + * Actually carry out the log10PNonRef calculation on vc, storing results in results + * + * @param vc variant context with alleles and genotype likelihoods + * @param log10AlleleFrequencyPriors priors + * @param result (pre-allocated) object to store results + */ + // TODO -- add consistent requires among args + protected abstract void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result); + + /** + * Must be overridden by concrete subclasses + * + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + protected abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); + + // --------------------------------------------------------------------------- + // + // Print information about the call to the calls log + // + // --------------------------------------------------------------------------- + + private void initializeOutputFile(final File outputFile) { + try { + if (outputFile != null) { + callReport = new PrintStream( new FileOutputStream(outputFile) ); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + private void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final double log10PosteriorOfAFzero) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for ( final Allele a : vc.getAlleles() ) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for ( final Genotype g : vc.getGenotypes() ) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ ) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero); + + callReport.flush(); + } + + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index c93e780bf..aabca9bcb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import com.google.java.contract.Ensures; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -35,9 +38,10 @@ import java.util.Arrays; * Date: Dec 14, 2011 * * Useful helper class to communicate the results of the allele frequency calculation + * + * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? */ public class AlleleFrequencyCalculationResult { - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles private double log10MLE; private double log10MAP; @@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult { private double log10LikelihoodOfAFzero; private double log10PosteriorOfAFzero; + int nEvaluations = 0; + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + alleleCountsOfMLE = new int[maxAltAlleles]; alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); } + /** + * Get the log10 value of the probability mass at the MLE + * + * @return a log10 prob + */ + @Ensures("goodLog10Value(result)") public double getLog10MLE() { return log10MLE; } + /** + * Get the log10 value of the probability mass at the max. a posterior (MAP) + * + * @return a log10 prob + */ + @Ensures("goodLog10Value(result)") public double getLog10MAP() { return log10MAP; } + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + * + * @return a vector with allele counts, not all of which may be meaningful + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MAP + * + * @see #getAlleleCountsOfMLE() for the encoding of results in this vector + * + * @return a non-null vector of ints + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } + + /** + * Returns the number of cycles used to evaluate the pNonRef for this AF calculation + * + * @return the number of evaluations required to produce the answer for this AF calculation + */ + public int getnEvaluations() { + return nEvaluations; + } + + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10PosteriorsMatrixSumWithoutAFzero() { if ( log10PosteriorMatrixSum == null ) { log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); @@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult { return log10PosteriorMatrixSum; } - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10LikelihoodOfAFzero() { return log10LikelihoodOfAFzero; } + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } - public void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + /** + * Get the list of alleles actually used in genotyping. + * + * Due to computational / implementation constraints this may be smaller than + * the actual list of alleles requested + * + * @return a non-empty list of alleles used during genotyping + */ + @Ensures({"result != null", "! result.isEmpty()"}) + public List getAllelesUsedInGenotyping() { + if ( allelesUsedInGenotyping == null ) + throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set"); + + return allelesUsedInGenotyping; + } + + /** + * Get the normalized -- across all AFs -- of AC == 0, NOT LOG10 + * @return + */ + // TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc. + // TODO -- we should own these values in a more meaningful way and return good values in the case + // TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful +// @Ensures({"result >= 0.0", "result <= 1.0"}) + public double getNormalizedPosteriorOfAFzero() { + return getNormalizedPosteriors()[0]; + } + + /** + * Get the normalized -- across all AFs -- of AC > 0, NOT LOG10 + * @return + */ + // TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc. + // TODO -- we should own these values in a more meaningful way and return good values in the case + // TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful + //@Ensures({"result >= 0.0", "result <= 1.0"}) + public double getNormalizedPosteriorOfAFGTZero() { + return getNormalizedPosteriors()[1]; + } + + private double[] getNormalizedPosteriors() { + final double[] posteriors = new double[]{ getLog10PosteriorOfAFzero(), getLog10PosteriorsMatrixSumWithoutAFzero() }; + return MathUtils.normalizeFromLog10(posteriors); + } + + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED; for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { alleleCountsOfMLE[i] = 0; alleleCountsOfMAP[i] = 0; } currentPosteriorsCacheIndex = 0; log10PosteriorMatrixSum = null; + allelesUsedInGenotyping = null; } - public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; + } + + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { if ( log10LofK > log10MLE ) { log10MLE = log10LofK; for ( int i = 0; i < alleleCountsForK.length; i++ ) @@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult { } } - public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToPosteriorsCache(log10LofK); if ( log10LofK > log10MAP ) { @@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult { } } - public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { log10MLE = log10LikelihoodOfAFzero; @@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult { } } - public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero; Arrays.fill(alleleCountsOfMAP, 0); } } + + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; + } + + private static boolean goodLog10Value(final double result) { + return result <= 0.0 || Double.isInfinite(result) || Double.isNaN(result); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java similarity index 93% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java index ba7f0f622..40a30b710 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java @@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; -public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - +public class DiploidExactAFCalculation extends ExactAFCalculation { // private final static boolean DEBUG = false; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { + super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null); + } + + /** + * Dynamically found in UnifiedGenotyperEngine + * + * @param UAC + * @param N + * @param logger + * @param verboseWriter + */ + public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result); + } - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); - - final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; + @Override + protected VariantContext reduceScope(final VariantContext vc) { + final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; // don't try to genotype too many alternate alleles if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); - GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; } - - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); - - return alleles; } - private static final int PL_INDEX_OF_HOM_REF = 0; private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); @@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // keep processing while we have AC conformations that need to be calculated MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen(); while ( !ACqueue.isEmpty() ) { + result.incNEvaluations(); // keep track of the number of evaluations + // compute log10Likelihoods final ExactACset set = ACqueue.remove(); final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java similarity index 71% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java index 569cd7072..b70309ed5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java @@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.List; /** - * The model representing how we calculate a genotype given the priors and a pile - * of bases and quality scores + * Uses the Exact calculation of Heng Li */ -public abstract class AlleleFrequencyCalculationModel implements Cloneable { - - public enum Model { - /** The default model with the best performance in all cases */ - EXACT +abstract class ExactAFCalculation extends AlleleFrequencyCalculation { + protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { + super(UAC, nSamples, logger, verboseWriter); } - protected int N; - protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - - protected Logger logger; - protected PrintStream verboseWriter; - - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { - this.N = N; - this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; - this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.logger = logger; - this.verboseWriter = verboseWriter; + protected ExactAFCalculation(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, exactCallsLog, logger, verboseWriter); } /** @@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { return genotypeLikelihoods; } - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param log10AlleleFrequencyPriors priors - * @param result (pre-allocated) object to store likelihoods results - * @return the alleles used for genotyping - */ - protected abstract List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result); - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes - * @param ploidy - * @return GenotypesContext object - */ - protected abstract GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy); - - // ------------------------------------------------------------------------------------- // // protected classes used to store exact model matrix columns diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java new file mode 100755 index 000000000..71f0a675d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.PrintStream; +import java.util.*; + +public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation { + // private final static boolean DEBUG = false; + + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + + public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { + super(nSamples, maxAltAlleles, maxAltAlleles, null, null, null); + } + + /** + * Dynamically found in UnifiedGenotyperEngine + * + * @param UAC + * @param N + * @param logger + * @param verboseWriter + */ + public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + super(UAC, N, logger, verboseWriter); + } + + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result); + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? MAX_ALTERNATE_ALLELES_FOR_INDELS : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { + logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes()); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + if ( alleles.alleleIndex1 != 0 ) + likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + + + // ------------------------------------------------------------------------------------- + // + // Multi-allelic implementation. + // + // ------------------------------------------------------------------------------------- + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen(); + while ( !ACqueue.isEmpty() ) { + result.incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); + + // adjust max likelihood seen if needed + if ( log10LofKs > maxLikelihoodSeen.maxLog10L ) + maxLikelihoodSeen.update(log10LofKs, set.ACcounts); + + // clean up memory + indexesToACset.remove(set.ACcounts); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private static double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final MaxLikelihoodSeen maxLikelihoodSeen, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // can we abort early because the log10Likelihoods are so small? + if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.ACcounts.getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private static void updateACset(final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private static void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + set.log10Likelihoods[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) + set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + } + + double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // update the MLE if necessary + result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + + // apply the priors over each alternate allele + for ( final int ACcount : set.ACcounts.getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); + } + + private static void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; + targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); + } + } + } + + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + } + + // ------------------------------------------------------------------------------------- + // + // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. + // + // ------------------------------------------------------------------------------------- + + /** + * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors + * for the exact model calculation + */ +/* + private final static class ExactACCache { + double[] kMinus2, kMinus1, kMinus0; + + private final static double[] create(int n) { + return new double[n]; + } + + public ExactACCache(int n) { + kMinus2 = create(n); + kMinus1 = create(n); + kMinus0 = create(n); + } + + final public void rotate() { + double[] tmp = kMinus2; + kMinus2 = kMinus1; + kMinus1 = kMinus0; + kMinus0 = tmp; + } + + final public double[] getkMinus2() { + return kMinus2; + } + + final public double[] getkMinus1() { + return kMinus1; + } + + final public double[] getkMinus0() { + return kMinus0; + } + } + + public int linearExact(GenotypesContext GLs, + double[] log10AlleleFrequencyPriors, + double[][] log10AlleleFrequencyLikelihoods, + double[][] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + final ExactACCache logY = new ExactACCache(numSamples+1); + logY.getkMinus0()[0] = 0.0; // the zero case + + double maxLog10L = Double.NEGATIVE_INFINITY; + boolean done = false; + int lastK = -1; + + for (int k=0; k <= numChr && ! done; k++ ) { + final double[] kMinus0 = logY.getkMinus0(); + + if ( k == 0 ) { // special case for k = 0 + for ( int j=1; j <= numSamples; j++ ) { + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; + } + } else { // k > 0 + final double[] kMinus1 = logY.getkMinus1(); + final double[] kMinus2 = logY.getkMinus2(); + + for ( int j=1; j <= numSamples; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + double aa = Double.NEGATIVE_INFINITY; + double ab = Double.NEGATIVE_INFINITY; + if (k < 2*j-1) + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; + + if (k < 2*j) + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; + + double log10Max; + if (k > 1) { + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; + log10Max = approximateLog10SumLog10(aa, ab, bb); + } else { + // we know we aren't considering the BB case, so we can use an optimized log10 function + log10Max = approximateLog10SumLog10(aa, ab); + } + + // finally, update the L(j,k) value + kMinus0[j] = log10Max - logDenominator; + } + } + + // update the posteriors vector + final double log10LofK = kMinus0[numSamples]; + log10AlleleFrequencyLikelihoods[0][k] = log10LofK; + log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; + + // can we abort early? + lastK = k; + maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + done = true; + } + + logY.rotate(); + } + + return lastK; + } + + final static double approximateLog10SumLog10(double a, double b, double c) { + return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); + } +*/ + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 30c0f3e18..842ec876a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -75,10 +75,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - @Hidden - @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false) - public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false; - // indel-related arguments /** * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. @@ -186,7 +182,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); @@ -212,7 +207,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; uac.alleles = alleles; uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; + uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; uac.GLmodel = GLmodel; uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; uac.referenceSampleRod = referenceSampleRod; @@ -224,6 +219,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.minReferenceDepth = minReferenceDepth; uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; + uac.exactCallsLog = exactCallsLog; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; @@ -239,8 +235,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.GenotypingMode = SCAC.GenotypingMode; this.heterozygosity = SCAC.heterozygosity; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.exactCallsLog = SCAC.exactCallsLog; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 0d1997252..30a1439e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2"); } - if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL) + if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL) throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 469d63b8a..aeb8b9dd5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine { private ThreadLocal> glcm = new ThreadLocal>(); // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); + private ThreadLocal afcm = new ThreadLocal(); // the allele frequency likelihoods and posteriors (allocated once as an optimization) private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); - private ThreadLocal posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[] log10AlleleFrequencyPriorsSNPs; @@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine { if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); - posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - AFresult.reset(); - List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; @@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine { myAlleles.add(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { final Allele alternateAllele = vc.getAlternateAllele(i); - final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); + final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele); // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; @@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); - final double PofF = 1.0 - normalizedPosteriors[0]; + final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero(); + final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero(); double phredScaledConfidence; if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0); if ( Double.isInfinite(phredScaledConfidence) ) phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0); if ( Double.isInfinite(phredScaledConfidence) ) { final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); @@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine { if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF); + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC @@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine { // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); @@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine { // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); @@ -513,10 +508,10 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). - if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) + if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); - if ( annotationEngine != null && !limitedContext ) { + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); @@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine { vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - } - - public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - return MathUtils.normalizeFromLog10(normalizedPosteriors); + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine { return glcm; } - private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { + private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { - List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); + List> afClasses = new PluginManager(AlleleFrequencyCalculation.class).getPlugins(); // user-specified name - String afModelName = UAC.AFmodel.name(); + String afModelName = UAC.AFmodel.implementationName; if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) afModelName = GPSTRING + afModelName; + else + afModelName = "Diploid" + afModelName; for (int i = 0; i < afClasses.size(); i++) { - Class afClass = afClasses.get(i); + Class afClass = afClasses.get(i); String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); if (afModelName.equalsIgnoreCase(key)) { try { Object args[] = new Object[]{UAC,N,logger,verboseWriter}; Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); - return (AlleleFrequencyCalculationModel)c.newInstance(args); + return (AlleleFrequencyCalculation)c.newInstance(args); } catch (Exception e) { - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel); } } } - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel); } public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 3e48520a7..cbc4c4401 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; -import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.TreeSet; @@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector { flatPriors = new double[1+2*samples.size()]; } AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); + DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index f89bcb2a7..92d6e686b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker { boolean failed = false; byte[] recordRef = vc.getReference().getBases(); for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { - if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) { + if ( recordRef[i] != ref[i] ) { failed = true; break; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 9664a5bde..15c17988c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.*; /** @@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; - /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory - * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. - */ - @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - protected int numRandom = 0; - /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker implements TreeR private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; - /* Private class used to store the intermediate variants in the integer random selection process */ - private static class RandomVariantStructure { - private VariantContext vc; - - RandomVariantStructure(VariantContext vcP) { - vc = vcP; - } - - public void set (VariantContext vcP) { - vc = vcP; - } - - } - public enum NumberAlleleRestriction { ALL, BIALLELIC, @@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker implements TreeR /* variables used by the SELECT RANDOM modules */ - private boolean SELECT_RANDOM_NUMBER = false; private boolean SELECT_RANDOM_FRACTION = false; - private int variantNumber = 0; - private int nVariantsAdded = 0; - private int positionToAdd = 0; - private RandomVariantStructure [] variantArray; //Random number generator for the genotypes to remove private Random randomGenotypes = new Random(); @@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker implements TreeR mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } - SELECT_RANDOM_NUMBER = numRandom > 0; - if (SELECT_RANDOM_NUMBER) { - logger.info("Selecting " + numRandom + " variants at random from the variant track"); - variantArray = new RandomVariantStructure[numRandom]; - } - SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); @@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker implements TreeR break; } } - if ( !failedJexlMatch ) { - if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub); - } - else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - if ( ! justRead ) - vcfWriter.add(sub); - } + if ( !failedJexlMatch && + !justRead && + ( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) { + vcfWriter.add(sub); } } } @@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker implements TreeR public void onTraversalDone(Integer result) { logger.info(result + " records processed."); - - if (SELECT_RANDOM_NUMBER) { - int positionToPrint = positionToAdd; - for (int i=0; i implements TreeR GenotypesContext newGC = sub.getGenotypes(); - // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes()); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags if ( vc.getNSamples() != sub.getNSamples() ) { @@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker implements TreeR if ( sawDP ) builder.attribute("DP", depth); } - - private void randomlyAddVariant(int rank, VariantContext vc) { - if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); - - else { - double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - double t = (1.0/(rank-numRandom+1)); - if ( v < t) { - variantArray[positionToAdd].set(vc); - nVariantsAdded++; - positionToAdd = nextCircularPosition(positionToAdd); - } - } - } - - private int nextCircularPosition(int cur) { - if ((cur + 1) == variantArray.length) - return 0; - return cur + 1; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index b9577ca9b..dd5264a1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; +import java.lang.reflect.Array; import java.util.*; /** @@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker { return records; } - private static void addFieldValue(Object val, List> result) { + private static void addFieldValue(final Object val, final List> result) { final int numResultRecords = result.size(); // if we're trying to create a single output record, add it if ( numResultRecords == 1 ) { - result.get(0).add(val.toString()); + result.get(0).add(prettyPrintObject(val)); } // if this field is a list of the proper size, add the appropriate entry to each record else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { @@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker { } } + private static String prettyPrintObject(final Object val) { + if ( val instanceof List ) + return prettyPrintObject(((List)val).toArray()); + + if ( !val.getClass().isArray() ) + return val.toString(); + + final int length = Array.getLength(val); + if ( length == 0 ) + return ""; + + final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0))); + for ( int i = 1; i < length; i++ ) { + sb.append(","); + sb.append(prettyPrintObject(Array.get(val, i))); + } + return sb.toString(); + } + + public static List> extractFields(VariantContext vc, List fields, boolean allowMissingData) { return extractFields(vc, fields, null, null, allowMissingData, false); } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 74b038032..f4a200af0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -236,6 +236,33 @@ public class Utils { } } + public static List append(final List left, T ... elts) { + final List l = new LinkedList(left); + l.addAll(Arrays.asList(elts)); + return l; + } + + /** + * Returns a string of the values in joined by separator, such as A,B,C + * + * @param separator + * @param doubles + * @return + */ + public static String join(String separator, double[] doubles) { + if ( doubles == null || doubles.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(doubles[0]); + for (int i = 1; i < doubles.length; ++i) { + ret.append(separator); + ret.append(doubles[i]); + } + return ret.toString(); + } + } + /** * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of * elti objects (note there's no actual space between sep and the elti elements). Returns diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 8abcf115a..81959c998 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -157,11 +157,8 @@ public class VariantContextUtils { builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); } - public static Genotype removePLs(Genotype g) { - if ( g.hasLikelihoods() ) - return new GenotypeBuilder(g).noPL().make(); - else - return g; + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { @@ -573,7 +570,7 @@ public class VariantContextUtils { } // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; @@ -581,7 +578,7 @@ public class VariantContextUtils { if ( ! genotypes.isEmpty() ) logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); - genotypes = stripPLs(genotypes); + genotypes = stripPLsAndAD(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; @@ -672,11 +669,11 @@ public class VariantContextUtils { return true; } - public static GenotypesContext stripPLs(GenotypesContext genotypes) { + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { - newGs.add(g.hasLikelihoods() ? removePLs(g) : g); + newGs.add(removePLsAndAD(g)); } return newGs; @@ -1343,10 +1340,7 @@ public class VariantContextUtils { public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed - // see whether we need to trim common reference base from all alleles - final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false); if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index f2d34fe85..9a987f161 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter { else if ( val instanceof List ) { result = formatVCFField(((List)val).toArray()); } else if ( val.getClass().isArray() ) { - int length = Array.getLength(val); + final int length = Array.getLength(val); if ( length == 0 ) return formatVCFField(null); - StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0))); + final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); for ( int i = 1; i < length; i++) { sb.append(","); sb.append(formatVCFField(Array.get(val, i))); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java deleted file mode 100644 index 0731d3fd8..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ /dev/null @@ -1,127 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - - -public class ExactAFCalculationModelUnitTest extends BaseTest { - - static double[] AA1, AB1, BB1; - static double[] AA2, AB2, AC2, BB2, BC2, CC2; - static final int numSamples = 3; - static double[] priors = new double[2*numSamples+1]; // flat priors - - @BeforeSuite - public void before() { - AA1 = new double[]{0.0, -20.0, -20.0}; - AB1 = new double[]{-20.0, 0.0, -20.0}; - BB1 = new double[]{-20.0, -20.0, 0.0}; - AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; - AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; - AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; - BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; - BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; - CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; - } - - private class GetGLsTest extends TestDataProvider { - GenotypesContext GLs; - int numAltAlleles; - String name; - - private GetGLsTest(String name, int numAltAlleles, Genotype... arg) { - super(GetGLsTest.class, name); - GLs = GenotypesContext.create(arg); - this.name = name; - this.numAltAlleles = numAltAlleles; - } - - public String toString() { - return String.format("%s input=%s", super.toString(), GLs); - } - } - - private static Genotype createGenotype(String name, double[] gls) { - return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make(); - } - - @DataProvider(name = "getGLs") - public Object[][] createGLsData() { - - // bi-allelic case - new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); - new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); - new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); - new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); - new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); - new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); - new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); - new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); - - // tri-allelic case - new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); - new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); - new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); - new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); - - return GetGLsTest.getTests(GetGLsTest.class); - } - - - @Test(dataProvider = "getGLs") - public void testGLs(GetGLsTest cfg) { - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int nameIndex = 1; - for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { - int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } - } - - @Test - public void testLargeGLs() { - - final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; - GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; - Assert.assertEquals(calculatedAlleleCount, 6); - } - - @Test - public void testMismatchedGLs() { - - final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0}; - final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0}; - GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1); - Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1f418f736..0388a3291 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095"); } private void testOutputParameters(final String args, final String md5) { @@ -438,4 +438,19 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29")); executeTest("test calling on reads with Ns in CIGAR", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("84486c88a0fd1ae996a6402490db8492")); + executeTest("test calling on a ReducedRead BAM", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index e14580ead..a8309c14e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d")); executeTest("test hg18 to hg19, unsorted", spec); } + + @Test + public void testLiftoverFilteringOfIndels() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header", + 1, + Arrays.asList("0909a953291a5e701194668c9b8833ab")); + executeTest("test liftover filtering of indels", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index ffd9c9b4a..34395e920 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823") + Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 2ffcd02e2..8186ffc7d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMultiAllelicOneRecord() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), - Arrays.asList("13dd36c08be6c800f23988e6000d963e")); + Arrays.asList("0ff49c08690f61a38614606a090f23ea")); executeTest("testMultiAllelicOneRecord", spec); } @@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest { executeTest("testGenotypeFieldsWithInline", spec); } + @Test(enabled = true) + public void testListFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + privateTestDir + "vcfexample.withMLE.vcf" + + " -T VariantsToTable" + + " -GF PL" + + " -o %s", + 1, + Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e")); + executeTest("testGenotypeFields", spec); + } + @Test(enabled = true) public void testMoltenOutput() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala index 09a24e782..1cd5a7512 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala @@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript { var bamFile: File = _ def script() { - val ug = new UnifiedGenotyper with RetryMemoryLimit - // First run with 1m - ug.memoryLimit = .001 - // On retry run with 1g - ug.retryMemoryFunction = (d => d * 1000) - ug.reference_sequence = referenceFile - ug.input_file = Seq(bamFile) - add(ug) + for (scatterCount <- 1 to 2) { + val ug = new UnifiedGenotyper with RetryMemoryLimit + // First run with 1m + ug.memoryLimit = .001 + // On retry run with 1g + ug.retryMemoryFunction = (d => d * 1000) + ug.reference_sequence = referenceFile + ug.input_file = Seq(bamFile) + ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount)) + ug.scatterCount = scatterCount + add(ug) + } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 0d0fab9d1..d0379d022 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging { private def createQueueHeader() : Seq[String] = { Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), "Copyright (c) 2012 The Broad Institute", - "Fro support and documentation go to http://www.broadinstitute.org/gatk") + "For support and documentation go to http://www.broadinstitute.org/gatk") } private def getQueueVersion : String = { diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 6cd4b06bc..9522ec86c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor /** * Merges BAM files using net.sf.picard.sam.MergeSamFiles. */ -class BamGatherFunction extends GatherFunction with PicardBamFunction { +class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit { this.javaMainClass = "net.sf.picard.sam.MergeSamFiles" this.assumeSorted = Some(true) protected def inputBams = gatherParts protected def outputBam = originalOutput - override def freezeFieldValues { + override def freezeFieldValues() { val originalGATK = originalFunction.asInstanceOf[CommandLineGATK] // Whatever the original function can handle, merging *should* do less. diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 739e6cc91..75be4d773 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -25,13 +25,13 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor /** * Merges a vcf text file. */ -class VcfGatherFunction extends CombineVariants with GatherFunction { +class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit { this.assumeIdenticalSamples = true this.suppressCommandLineHeader = true diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index 9257cc7c2..b22bb2b59 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun override def freezeFieldValues() { super.freezeFieldValues() if (outputIndex == null && output != null) - outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai") } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 84b625760..eb426d301 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -25,6 +25,7 @@ package org.broadinstitute.sting.queue.function import org.broadinstitute.sting.queue.util._ +import org.broadinstitute.sting.commandline.Argument /** * A command line that will be run in a pipeline. @@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging { def commandLine: String /** Upper memory limit */ + @Argument(doc="Memory limit", required=false) var memoryLimit: Option[Double] = None /** Resident memory limit */ + @Argument(doc="Resident memory limit", required=false) var residentLimit: Option[Double] = None /** Resident memory request */ + @Argument(doc="Resident memory request", required=false) var residentRequest: Option[Double] = None /** the number of SMP cores this job wants */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index b9cb8540f..6500360c0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { /** * Memory limit for the java executable, or if None will use the default memoryLimit. */ + @Argument(doc="Java memory limit", required=false) var javaMemoryLimit: Option[Double] = None /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 9f7932d39..aae846534 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport { var jobErrorFile: File = _ /** Errors (if any) from the last failed run of jobErrorFiles. */ + @Argument(doc="Job error lines", required=false) var jobErrorLines: Seq[String] = Nil /** * The number of times this function has previously been run. */ + @Argument(doc="Job retries", required=false) var retries = 0 /** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */ @@ -541,4 +543,11 @@ object QFunction { classFields } } + + /** + * Returns the Seq of fields for a QFunction class. + * @param clazz Class to retrieve fields for. + * @return the fields of the class. + */ + def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala index 8bba5551f..acc9a7203 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala @@ -24,17 +24,26 @@ package org.broadinstitute.sting.queue.function +import org.broadinstitute.sting.commandline.Argument + +object RetryMemoryLimit { + private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ ) + private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") +} + /** A mixin that on retry increases the memory limit when certain text is found. */ trait RetryMemoryLimit extends CommandLineFunction { /** How to increase the memory. By default doubles the memory. */ - var retryMemoryFunction: (Double => Double) = (2 * _) + var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction /** Once the threshold is passed, no more memory will be added to memory limit. */ + @Argument(doc="threshold to stop doubling the memory", required=false) var memoryLimitThreshold: Option[Double] = None /** Various strings to look for to determine we ran out of memory. */ - var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") + @Argument(doc="text to look for in the errors", required = false) + var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText override def freezeFieldValues() { super.freezeFieldValues() @@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction { this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold } + + override def copySettingsTo(function: QFunction) { + super.copySettingsTo(function) + function match { + case retryMemoryLimit: RetryMemoryLimit => + if (retryMemoryLimit.memoryLimitThreshold.isEmpty) + retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold + if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction) + retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction + if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText) + retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText + case _ => /* ignore */ + } + } + override def setupRetry() { super.setupRetry() if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala index 5b4f2b7e6..686188e72 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala @@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction} /** * Shadow clones another command line function. */ +object CloneFunction { + private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction]) +} + class CloneFunction extends CommandLineFunction { var originalFunction: ScatterGatherableFunction = _ var cloneIndex: Int = _ @@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction { var originalValues = Map.empty[ArgumentSource, Any] withScatterPartCount += 1 if (withScatterPartCount == 1) { - overriddenFields.foreach{ - case (field, overrideValue) => { + originalFunction.functionFields.foreach { + case (field) => { originalValues += field -> originalFunction.getFieldValue(field) - originalFunction.setFieldValue(field, overrideValue) + originalFunction.setFieldValue(field, getFieldValue(field)) } } } @@ -52,9 +56,11 @@ class CloneFunction extends CommandLineFunction { f() } finally { if (withScatterPartCount == 1) { - originalValues.foreach{ - case (name, value) => - originalFunction.setFieldValue(name, value) + originalFunction.functionFields.foreach { + case (field) => { + setFieldValue(field, originalFunction.getFieldValue(field)) + originalFunction.setFieldValue(field, originalValues(field)) + } } } withScatterPartCount -= 1 @@ -63,6 +69,8 @@ class CloneFunction extends CommandLineFunction { override def description = withScatterPart(() => originalFunction.description) override def shortDescription = withScatterPart(() => originalFunction.shortDescription) + override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) } + override protected def functionFieldClass = originalFunction.getClass def commandLine = withScatterPart(() => originalFunction.commandLine) @@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction { } override def getFieldValue(source: ArgumentSource): AnyRef = { - overriddenFields.get(source) match { - case Some(value) => value.asInstanceOf[AnyRef] - case None => { - val value = originalFunction.getFieldValue(source) - overriddenFields += source -> value - value - } + CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match { + case Some(cloneSource) => + super.getFieldValue(cloneSource) + case None => + overriddenFields.get(source) match { + case Some(value) => + value.asInstanceOf[AnyRef] + case None => { + val value = originalFunction.getFieldValue(source) + overriddenFields += source -> value + value + } + } } }