diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java new file mode 100644 index 000000000..714a4df18 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLocComparator; + +import java.util.TreeSet; + +/** + * A stash of regions that must be kept uncompressed in all samples + * + * In general, these are regions that were kept uncompressed by a tumor sample and we want to force + * all other samples (normals and/or tumors) to also keep these regions uncompressed + * + * User: carneiro + * Date: 10/15/12 + * Time: 4:08 PM + */ +public class CompressionStash extends TreeSet { + public CompressionStash() { + super(new GenomeLocComparator()); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 7c9fc101b..2c3439010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -55,11 +55,12 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 5810bc94f..b6761f4a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -222,6 +222,8 @@ public class ReduceReads extends ReadWalker, ReduceRea HashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). Long nextReadNumber = 1L; // The next number to use for the compressed read name. + CompressionStash compressionStash = new CompressionStash(); + SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -328,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 6a086c53b..82a433300 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -20,6 +20,7 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; + final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; @@ -33,7 +34,8 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction) { + final boolean allowPolyploidReduction, + final CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -44,6 +46,7 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -65,7 +68,7 @@ public class SingleSampleCompressor implements Compressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6fdf85317..22c40c542 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,7 +6,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; @@ -56,6 +55,7 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; + private CompressionStash compressionStash; /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +87,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -118,6 +118,7 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; + this.compressionStash = compressionStash; } /** @@ -145,7 +146,7 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private Pair getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { boolean foundStart = false; int variantRegionStartIndex = 0; for (int i=from; i(variantRegionStartIndex, i-1)); + return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true)); } } - return (foundStart) ? new Pair(variantRegionStartIndex, -1) : null; + return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null; } /** @@ -168,23 +169,22 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return a list with start/stops of variant regions following getNextVariantRegion description */ - private List> getAllVariantRegions(int from, int to, boolean[] variantSite) { - List> regions = new LinkedList>(); + private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) { + CompressionStash regions = new CompressionStash(); int index = from; while(index < to) { - Pair result = getNextVariantRegion(index, to, variantSite); + SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite); if (result == null) break; regions.add(result); - if (result.getSecond() < 0) + if (result.getStop() < 0) break; - index = result.getSecond() + 1; + index = result.getStop() + 1; } return regions; } - /** * Determines if the window can be slid given the new incoming read. * @@ -203,7 +203,7 @@ public class SlidingWindow { boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex); int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - List> regions = getAllVariantRegions(0, breakpoint, variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, variantSite); finalizedReads = closeVariantRegions(regions, false); List readsToRemove = new LinkedList(); @@ -567,26 +567,31 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - private List closeVariantRegions(List> regions, boolean forceClose) { + private List closeVariantRegions(CompressionStash regions, boolean forceClose) { List allReads = new LinkedList(); if (!regions.isEmpty()) { int lastStop = -1; - for (Pair region : regions) { - int start = region.getFirst(); - int stop = region.getSecond(); - if (stop < 0 && forceClose) - stop = windowHeader.size() - 1; - if (stop >= 0) { - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; + for (SimpleGenomeLoc region : regions) { + int start = region.getStart(); + int stop = region.getStop(); + + if (!region.isFinished()) { + if(forceClose) // region is unfinished but we're forcing the close of this window + stop = windowHeader.size() - 1; + else + continue; // region is unfinished and we're not forcing the close of this window } + + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); + lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + + for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! } return allReads; } @@ -626,7 +631,7 @@ public class SlidingWindow { if (!windowHeader.isEmpty()) { boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1); - List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); + CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), variantSite); finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index 68b068509..e9ed6b153 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -5,23 +5,22 @@ import org.apache.log4j.Logger; import org.apache.log4j.TTCCLayout; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import java.io.FileOutputStream; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 10/2/12 - * Time: 10:25 AM - * To change this template use File | Settings | File Templates. + * A simple GATK utility (i.e, runs from command-line) for assessing the performance of + * the exact model */ public class AFCalcPerformanceTest { final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class); @@ -190,7 +189,8 @@ public class AFCalcPerformanceTest { public enum Operation { ANALYZE, - SINGLE + SINGLE, + EXACT_LOG } public static void main(final String[] args) throws Exception { final TTCCLayout layout = new TTCCLayout(); @@ -204,10 +204,49 @@ public class AFCalcPerformanceTest { switch ( op ) { case ANALYZE: analyze(args); break; case SINGLE: profileBig(args); break; + case EXACT_LOG: exactLog(args); break; default: throw new IllegalAccessException("unknown operation " + op); } } + private static void exactLog(final String[] args) throws Exception { + final File ref = new File(args[1]); + final File exactLogFile = new File(args[2]); + final List startsToUse = new LinkedList(); + + for ( int i = 3; i < args.length; i++ ) + startsToUse.add(Integer.valueOf(args[i])); + + final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref); + final GenomeLocParser parser = new GenomeLocParser(seq); + final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile)); + final List loggedCalls = ExactCallLogger.readExactLog(reader, startsToUse, parser); + + for ( final ExactCallLogger.ExactCall call : loggedCalls ) { + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1, + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcTestBuilder.PriorType.human); + logger.info(call); + final SimpleTimer timer = new SimpleTimer().start(); + final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors()); + final long newNanoTime = timer.getElapsedTimeNano(); + if ( call.originalCall.anyPolymorphic(-1) || result.anyPolymorphic(-1) ) { + logger.info("**** ONE IS POLY"); + } + logger.info("\t\t getLog10PosteriorOfAFGT0: " + call.originalCall.getLog10PosteriorOfAFGT0() + " vs " + result.getLog10PosteriorOfAFGT0()); + final double speedup = call.runtime / (1.0 * newNanoTime); + logger.info("\t\t runtime: " + call.runtime + " vs " + newNanoTime + " speedup " + String.format("%.2f", speedup) + "x"); + for ( final Allele a : call.originalCall.getAllelesUsedInGenotyping() ) { + if ( a.isNonReference() ) { + final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : ""; + logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + } + } + } + } + private static void profileBig(final String[] args) throws Exception { final int nSamples = Integer.valueOf(args[1]); final int ac = Integer.valueOf(args[2]); @@ -234,7 +273,6 @@ public class AFCalcPerformanceTest { final List modelParams = Arrays.asList( new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10), // new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10), - new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100), new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000)); final boolean ONLY_HUMAN_PRIORS = false; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index b4d105507..cfb67164d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -45,12 +45,16 @@ public class AFCalcTestBuilder { human } + public int getNumAltAlleles() { + return numAltAlleles; + } + public int getnSamples() { return nSamples; } public AFCalc makeModel() { - return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2); + return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2); } public double[] makePriors() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 072f81db9..14c1cd59d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -348,12 +348,14 @@ public class LikelihoodCalculationEngine { } } } + // add all filtered reads to the NO_CALL list because they weren't given any likelihoods for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { - for( final Allele a : call.getFirst().getAlleles() ) + for( final Allele a : call.getFirst().getAlleles() ) { likelihoodMap.add(read, a, 0.0); + } } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index db8ea4eb8..89f251ed4 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -21,36 +21,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest { executeTest(testName, spec); } - @Test(enabled = false) + @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); + RRTest("testDefaultCompression ", L, "1f95f3193bd9f120a73c34a0087abaf6"); } - @Test(enabled = false) + @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); + RRTest("testMultipleIntervals ", intervals, "79213d6ac68d56d4d72dcf511223e424"); } - @Test(enabled = false) + @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "dab2aa8e3655139974bbe12a568363d9"); } - @Test(enabled = false) + @Test(enabled = true) public void testLowCompression() { RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); } - @Test(enabled = false) + @Test(enabled = true) public void testIndelCompression() { - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); + RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "1255245ed4ebeacda90f0dbb4e4da081"); } - @Test(enabled = false) + @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841"))); } /** @@ -61,20 +61,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest { * * This bam is simplified to replicate the exact bug with the three provided intervals. */ - @Test(enabled = false) + @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("4b590269cbe3574dbdd5bdc2bc6f5f1c"))); } /** * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get * filtered out. */ - @Test(enabled = false) + @Test(enabled = true) public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2"))); + executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java new file mode 100644 index 000000000..556b7451f --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java @@ -0,0 +1,87 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class AFCalcPerformanceUnitTest extends BaseTest { + @DataProvider(name = "ScalingTests") + public Object[][] makepolyTestProviderLotsOfAlleles() { + List tests = new ArrayList(); + + // list of all high-quality models in the system + final List biAllelicModels = Arrays.asList( + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcFactory.Calculation.EXACT_REFERENCE); + + final List multiAllelicModels = Arrays.asList( + AFCalcFactory.Calculation.EXACT_INDEPENDENT); + +// for ( final int nonTypePLs : Arrays.asList(100) ) { +// for ( final int nSamples : Arrays.asList(10000) ) { +// final List alleleCounts = Arrays.asList(50); +// for ( final int nAltAlleles : Arrays.asList(1) ) { + for ( final int nonTypePLs : Arrays.asList(100) ) { + for ( final int nSamples : Arrays.asList(100, 1000) ) { + final List alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 50, 500); + for ( final int nAltAlleles : Arrays.asList(1, 2, 3) ) { + final List models = nAltAlleles > 1 ? multiAllelicModels : biAllelicModels; + for ( final AFCalcFactory.Calculation model : models ) { + for ( final List ACs : Utils.makePermutations(alleleCounts, nAltAlleles, true) ) { + if ( MathUtils.sum(ACs) < nSamples * 2 ) { + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, nAltAlleles, model, AFCalcTestBuilder.PriorType.human); + tests.add(new Object[]{testBuilder, ACs, nonTypePLs}); + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Pair estNumberOfEvaluations(final AFCalcTestBuilder testBuilder, final VariantContext vc, final int nonTypePL) { + final int evalOverhead = 2; // 2 + final int maxEvalsPerSamplePerAC = 3; + + int minEvals = 0, maxEvals = 0; + + for ( final Allele alt : vc.getAlternateAlleles() ) { + final int AC = vc.getCalledChrCount(alt); + minEvals += AC + evalOverhead; // everyone is hom-var + maxEvals += AC * maxEvalsPerSamplePerAC + 10; + } + + return new Pair(minEvals, maxEvals); + } + + @Test(dataProvider = "ScalingTests") + private void testScaling(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL); + final AFCalcResult result = calc.getLog10PNonRef(vc, priors); + final Pair expectedNEvaluation = estNumberOfEvaluations(testBuilder, vc, nonTypePL); + final int minEvals = expectedNEvaluation.getFirst(); + final int maxEvals = expectedNEvaluation.getSecond(); + + logger.warn(" min " + minEvals + " obs " + result.getnEvaluations() + " max " + maxEvals + " for test " + testBuilder + " sum(ACs)=" + (int)MathUtils.sum(ACs)); + + Assert.assertTrue(result.getnEvaluations() >= minEvals, + "Actual number of evaluations " + result.getnEvaluations() + " < min number of evals " + minEvals); + Assert.assertTrue(result.getnEvaluations() <= maxEvals, + "Actual number of evaluations " + result.getnEvaluations() + " > max number of evals " + minEvals); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java new file mode 100644 index 000000000..1070642e9 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class AFCalcResultUnitTest extends BaseTest { + private static class MyTest { + final double[] Ls, expectedPosteriors; + + private MyTest(double[] ls, double[] expectedPosteriors) { + Ls = ls; + this.expectedPosteriors = expectedPosteriors; + } + + @Override + public String toString() { + return "Ls [" + Utils.join(",", Ls) + "] expectedPosteriors [" + Utils.join(",", expectedPosteriors) + "]"; + } + } + + @DataProvider(name = "TestComputePosteriors") + public Object[][] makeTestCombineGLs() { + List tests = new ArrayList(); + + tests.add(new Object[]{new MyTest(log10Even, log10Even)}); + + for ( double L0 = -1e9; L0 < 0.0; L0 /= 10.0 ) { + for ( double L1 = -1e2; L1 < 0.0; L1 /= 100.0 ) { + final double[] input = new double[]{L0, L1}; + final double[] expected = MathUtils.normalizeFromLog10(input, true); + tests.add(new Object[]{new MyTest(input, expected)}); + } + } + + for ( double bigBadL = -1e50; bigBadL < -1e200; bigBadL *= 10 ) { + // test that a huge bad likelihood remains, even with a massive better result + for ( final double betterL : Arrays.asList(-1000.0, -100.0, -10.0, -1.0, -0.1, -0.01, -0.001, 0.0)) { + tests.add(new Object[]{new MyTest(new double[]{bigBadL, betterL}, new double[]{bigBadL, 0.0})}); + tests.add(new Object[]{new MyTest(new double[]{betterL, bigBadL}, new double[]{0.0, bigBadL})}); + } + } + + // test that a modest bad likelihood with an ~0.0 value doesn't get lost + for ( final double badL : Arrays.asList(-10000.0, -1000.0, -100.0, -10.0)) { + tests.add(new Object[]{new MyTest(new double[]{badL, -1e-9}, new double[]{badL, 0.0})}); + tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); + } + + return tests.toArray(new Object[][]{}); + } + + + final static double[] log10Even = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); + final static Allele C = Allele.create("C"); + final static List alleles = Arrays.asList(Allele.create("A", true), C); + + @Test(enabled = true, dataProvider = "TestComputePosteriors") + private void testComputingPosteriors(final MyTest data) { + final AFCalcResult result = new AFCalcResult(new int[]{0}, 1, alleles, data.Ls, log10Even, Collections.singletonMap(C, -1.0)); + + Assert.assertEquals(result.getLog10PosteriorOfAFEq0(), data.expectedPosteriors[0], 1e-3, "AF = 0 not expected"); + Assert.assertEquals(result.getLog10PosteriorOfAFGT0(), data.expectedPosteriors[1], 1e-3, "AF > 0 not expected"); + + final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()}; + Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision"); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 4ac4692d7..25df0f6d2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -372,16 +372,14 @@ public class AFCalcUnitTest extends BaseTest { final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat; - final List constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED); - final double TOLERANCE = 0.5; final List initialPNonRefData = Arrays.asList( // bi-allelic sites new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true), - new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel), - new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel), - new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel), + new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false), + new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false), + new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false), new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true), new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true), @@ -448,7 +446,7 @@ public class AFCalcUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true & ! DEBUG_ONLY, dataProvider = "Models") + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models") public void testBiallelicPriors(final AFCalc model) { for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) { @@ -456,26 +454,29 @@ public class AFCalcUnitTest extends BaseTest { for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) { final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); - final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true); - GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); - final AFCalcResult resultTracker = cfg.execute(); - final int actualAC = resultTracker.getAlleleCountsOfMLE()[0]; + final double nonRefPrior = (1-refPrior) / 2; + final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior}), true); + if ( ! Double.isInfinite(priors[1]) ) { + GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); + final AFCalcResult resultTracker = cfg.execute(); + final int actualAC = resultTracker.getAlleleCountsOfMLE()[0]; - final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; - final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); - final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); - final double log10NonRefPost = Math.log10(nonRefPost); + final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); + final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); + final double log10NonRefPost = Math.log10(nonRefPost); - if ( ! Double.isInfinite(log10NonRefPost) ) - Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2); + if ( ! Double.isInfinite(log10NonRefPost) ) + Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2); - if ( nonRefPost >= 0.9 ) - Assert.assertTrue(resultTracker.isPolymorphic(C, -1)); + if ( nonRefPost >= 0.9 ) + Assert.assertTrue(resultTracker.isPolymorphic(C, -1)); - final int expectedMLEAC = 1; // the MLE is independent of the prior - Assert.assertEquals(actualAC, expectedMLEAC, - "actual AC with priors " + log10NonRefPrior + " not expected " - + expectedMLEAC + " priors " + Utils.join(",", priors)); + final int expectedMLEAC = 1; // the MLE is independent of the prior + Assert.assertEquals(actualAC, expectedMLEAC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedMLEAC + " priors " + Utils.join(",", priors)); + } } } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java deleted file mode 100644 index 31ec28af4..000000000 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java +++ /dev/null @@ -1,124 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -public class ConstrainedAFCalculationModelUnitTest extends BaseTest { - static Allele A = Allele.create("A", true); - static Allele C = Allele.create("C"); - static Allele G = Allele.create("G"); - - protected static Genotype makePL(final List expectedGT, int ... pls) { - return AFCalcUnitTest.makePL(expectedGT, pls); - } - - @DataProvider(name = "MaxACsToVisit") - public Object[][] makeMaxACsToVisit() { - List tests = new ArrayList(); - - final int nSamples = 10; - - for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) { - final int nChrom = (nSamples - nNonInformative) * 2; - for ( int i = 0; i < nChrom; i++ ) { - // bi-allelic - tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); - - // tri-allelic - for ( int j = 0; j < (nChrom - i); j++) - tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "MaxACsToVisit") - public void testMaxACsToVisit(final int nSamples, final List requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) { - final int nAlts = requestedACs.size(); - final AFCalcTestBuilder testBuilder - = new AFCalcTestBuilder(nSamples, nAlts, modelType, - AFCalcTestBuilder.PriorType.human); - - final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100); - final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); - - testExpectedACs(vc, maxACsToVisit); - } - - private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) { - // this is necessary because cannot ensure that the tester gives us back the - // requested ACs due to rounding errors - final List ACs = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - ACs.add(vc.getCalledChrCount(a)); - - for ( int i = 0; i < maxACsToVisit.length; i++ ) { - Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i); - } - } - - @DataProvider(name = "MaxACsGenotypes") - public Object[][] makeMaxACsForGenotype() { - List tests = new ArrayList(); - - final List AA = Arrays.asList(A, A); - final List AC = Arrays.asList(A, C); - final List CC = Arrays.asList(C, C); - final List AG = Arrays.asList(A, G); - final List GG = Arrays.asList(G, G); - final List CG = Arrays.asList(C, G); - - final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make(); - final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); - - tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)}); - tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)}); - tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)}); - - // make sure non-informative => 0 - tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)}); - tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)}); - - // multi-allelics - tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)}); - tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)}); - - // deal with non-informatives third alleles - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)}); - tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)}); - tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "MaxACsGenotypes") - private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) { - final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make(); - - final AFCalcTestBuilder testBuilder - = new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED, - AFCalcTestBuilder.PriorType.human); - - final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); - - testExpectedACs(vc, maxACsToVisit); - } -} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index ed164f245..391c99990 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -55,48 +55,14 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @DataProvider(name = "TestCombineGLsWithDrops") - public Object[][] makeTestCombineGLsWithDrops() { - List tests = new ArrayList(); - - final Set noDrops = Collections.emptySet(); - final Set drop1 = Collections.singleton(1); - final Set drop2 = Collections.singleton(2); - - // AA AB BB AC BC CC - // drop1 (B): AA AC CC - // drop2 (C): AA AB BB - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops}); - tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2}); - tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1}); - - tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops}); - tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops}); - tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2}); - tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1}); - - tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops}); - tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops}); - tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2}); - tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1}); - - return tests.toArray(new Object[][]{}); - } - private Genotype makePL(final int ... PLs) { return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); } @Test(enabled = true, dataProvider = "TestCombineGLs") private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { - testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.emptySet()); - } - - @Test(enabled = true, dataProvider = "TestCombineGLsWithDrops") - private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set allelesToDrop) { final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts); + final Genotype combined = calc.combineGLs(testg, altIndex, nAlts); Assert.assertEquals(combined.getPL(), expected.getPL(), "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); @@ -120,22 +86,21 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); final Genotype gACcombined = makePL(0, 2, 5); + final Genotype gACcombined2 = makePL(0, 1, 4); final Genotype gAGcombined = makePL(0, 4, 9); - final Genotype gACdropped = makePL(0, 1, 2); - final Genotype gAGdropped = makePL(0, 3, 5); // biallelic tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); // tri-allelic - tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())}); - tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())}); + tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())}); + tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())}); return tests.toArray(new Object[][]{}); } - @Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts") + @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts") private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); @@ -148,7 +113,8 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); for ( int j = 0; j < actual.getNSamples(); j++ ) - Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL()); + Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(), + "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL())); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java new file mode 100644 index 000000000..45e105751 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.gatk.walkers.compression.reducereads; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations + * with them. + * + * However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot + * be created easily for small tasks that do not require the rigors of the GenomeLocParser validation + * + * SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should + * only be used outside of the engine. + * + * User: carneiro + * Date: 10/16/12 + * Time: 2:07 PM + */ +public class SimpleGenomeLoc extends GenomeLoc { + private boolean finished; + + public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) { + super(contigName, contigIndex, start, stop); + this.finished = finished; + } + + public boolean isFinished() { + return finished; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index f87084a9c..07f88c9e3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -29,24 +29,16 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.Arrays; import java.util.List; /** * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods - * */ public abstract class AFCalc implements Cloneable { private final static Logger defaultLogger = Logger.getLogger(AFCalc.class); @@ -58,8 +50,8 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private PrintStream callReport = null; private final AFCalcResultTracker resultTracker; + private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); @@ -74,7 +66,7 @@ public abstract class AFCalc implements Cloneable { } public void enableProcessLog(final File exactCallsLog) { - initializeOutputFile(exactCallsLog); + exactCallLogger = new ExactCallLogger(exactCallsLog); } public void setLogger(Logger logger) { @@ -102,8 +94,8 @@ public abstract class AFCalc implements Cloneable { final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors); final long nanoTime = callTimer.getElapsedTimeNano(); - if ( callReport != null ) - printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero()); + if ( exactCallLogger != null ) + exactCallLogger.printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result); return result; } @@ -170,55 +162,8 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - - // --------------------------------------------------------------------------- - // - // Print information about the call to the calls log - // - // --------------------------------------------------------------------------- - - private void initializeOutputFile(final File outputFile) { - try { - if (outputFile != null) { - callReport = new PrintStream( new FileOutputStream(outputFile) ); - callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotCreateOutputFile(outputFile, e); - } - } - - private void printCallInfo(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final long runtimeNano, - final double log10PosteriorOfAFzero) { - printCallElement(vc, "type", "ignore", vc.getType()); - - int allelei = 0; - for ( final Allele a : vc.getAlleles() ) - printCallElement(vc, "allele", allelei++, a.getDisplayString()); - - for ( final Genotype g : vc.getGenotypes() ) - printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); - - for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ ) - printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); - - printCallElement(vc, "runtime.nano", "ignore", runtimeNano); - printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero); - - callReport.flush(); - } - - private void printCallElement(final VariantContext vc, - final Object variable, - final Object key, - final Object value) { - final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); - callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); - } - public AFCalcResultTracker getResultTracker() { return resultTracker; } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 981100eaa..7d67815cf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -30,10 +30,6 @@ public class AFCalcFactory { /** reference implementation of multi-allelic EXACT model */ EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. implementation */ - @Deprecated - EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1), - /** expt. implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index da7fd08ce..7cacb2060 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -31,10 +31,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Describes the results of the AFCalc @@ -217,6 +214,14 @@ public class AFCalcResult { return log10PriorsOfAC[AF1p]; } + @Override + public String toString() { + final List byAllele = new LinkedList(); + for ( final Allele a : getAllelesUsedInGenotyping() ) + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); + } + /** * Are we sufficiently confidence in being non-ref that the site is considered polymorphic? * @@ -233,6 +238,19 @@ public class AFCalcResult { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Are any of the alleles polymorphic w.r.t. #isPolymorphic? + * + * @param log10minPNonRef the confidence threshold, in log10 space + * @return true if any are poly, false otherwise + */ + public boolean anyPolymorphic(final double log10minPNonRef) { + for ( final Allele a : getAllelesUsedInGenotyping() ) + if ( a.isNonReference() && isPolymorphic(a, log10minPNonRef) ) + return true; + return false; + } + /** * Returns the log10 probability that allele is segregating * @@ -266,15 +284,7 @@ public class AFCalcResult { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i]; - - // necessary because the posteriors may be so skewed that the log-space normalized value isn't - // good, so we have to try both log-space normalization as well as the real-space normalization if the - // result isn't good - final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); - if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) ) - return logNormalized; - else - return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); + return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java deleted file mode 100644 index 36d53ceaa..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -@Deprecated -public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc { - protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - } - - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - final int[] maxACsToConsider = computeMaxACs(vc); - resultTracker.setAClimits(maxACsToConsider); - return new StateTracker(maxACsToConsider); - } - - /** - * Computes the maximum ACs we need to consider for each alt allele - * - * Walks over the genotypes in VC, and computes for each alt allele the maximum - * AC we need to consider in that alt allele dimension. Does the calculation - * based on the PLs in each genotype g, choosing to update the max AC for the - * alt alleles corresponding to that PL. Only takes the first lowest PL, - * if there are multiple genotype configurations with the same PL value. It - * takes values in the order of the alt alleles. - * - * @param vc the variant context we will compute max alt alleles for - * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the - * first alt allele. - */ - @Ensures("result != null") - protected final int[] computeMaxACs(final VariantContext vc) { - final int[] maxACs = new int[vc.getNAlleles()-1]; - - for ( final Genotype g : vc.getGenotypes() ) - updateMaxACs(g, maxACs); - - return maxACs; - } - - /** - * Update the maximum achievable allele counts in maxAC according to the PLs in g - * - * Selects the maximum genotype configuration from the PLs in g, and updates - * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates - * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for - * many number of alt alleles (determined by length of maxACs). - * - * If the max PL occurs at 0/0, updates nothing - * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have - * the same PL value, then updates the first one. - * - * Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL, - * then only first one (1) will be updated - * - * @param g the genotype to update - * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele) - */ - @Requires({ - "g != null", - "maxACs != null", - "goodMaxACs(maxACs)"}) - private void updateMaxACs(final Genotype g, final int[] maxACs) { - final int[] PLs = g.getLikelihoods().getAsPLs(); - - int minPLi = 0; - int minPL = PLs[0]; - - for ( int i = 0; i < PLs.length; i++ ) { - if ( PLs[i] < minPL ) { - minPL = PLs[i]; - minPLi = i; - } - } - - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi); - updateMaxACs(maxACs, pair.alleleIndex1); - updateMaxACs(maxACs, pair.alleleIndex2); - } - - /** - * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref) - * - * If alleleI == 0 => doesn't update anything - * else maxACs[alleleI - 1]++ - * - * @param maxACs array of max alt allele ACs - * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles. - */ - @Requires({ - "alleleI >= 0", - "(alleleI - 1) < maxACs.length", - "goodMaxACs(maxACs)"}) - private void updateMaxACs(final int[] maxACs, final int alleleI) { - if ( alleleI > 0 ) - maxACs[alleleI-1]++; - } - - private static boolean goodMaxACs(final int[] maxACs) { - return MathUtils.sum(maxACs) >= 0; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 8b12dff61..49915c515 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,7 +36,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker); + protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { + return new StateTracker(); + } @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java new file mode 100644 index 000000000..f13fe4429 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -0,0 +1,179 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Requires; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.*; +import java.util.*; + +/** + * Allows us to write out and read in information about exact calls (site, alleles, PLs, etc) in tabular format + * + * Once opened, calls can be writen to disk with printCallInfo + */ +public class ExactCallLogger implements Cloneable { + private PrintStream callReport = null; + + /** + * Create a new ExactCallLogger writing it's output to outputFile + * + * @param outputFile + */ + public ExactCallLogger(final File outputFile) { + try { + callReport = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile), 10000000)); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + /** + * Summarizes information about an exact call that happened + */ + public static class ExactCall { + final VariantContext vc; + final long runtime; + final AFCalcResult originalCall; + + public ExactCall(VariantContext vc, final long runtime, final AFCalcResult originalCall) { + this.vc = vc; + this.runtime = runtime; + this.originalCall = originalCall; + } + + @Override + public String toString() { + return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s", + vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), + originalCall.getLog10PosteriorOfAFGT0(), + new AutoFormattingTime(runtime / 1e9).toString()); + } + } + + protected final void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final AFCalcResult result) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for (final Allele a : vc.getAlleles()) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for (final Genotype g : vc.getGenotypes()) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0()); + printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0()); + + for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { + if ( allele.isNonReference() ) { + printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); + printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + } + } + + callReport.flush(); + } + + @Requires({"vc != null", "variable != null", "key != null", "value != null", "callReport != null"}) + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + + /** + * Read in a list of ExactCall objects from reader, keeping only those + * with starts in startsToKeep or all sites (if this is empty) + * + * @param reader a just-opened reader sitting at the start of the file + * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should be kept + * @param parser a genome loc parser to create genome locs + * @return a list of ExactCall objects in reader + * @throws IOException + */ + public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { + if ( reader == null ) throw new IllegalArgumentException("reader cannot be null"); + if ( startsToKeep == null ) throw new IllegalArgumentException("startsToKeep cannot be null"); + if ( parser == null ) throw new IllegalArgumentException("GenomeLocParser cannot be null"); + + List calls = new LinkedList(); + + // skip the header line + reader.readLine(); + + // skip the first "type" line + reader.readLine(); + + while (true) { + final VariantContextBuilder builder = new VariantContextBuilder(); + final List alleles = new ArrayList(); + final List genotypes = new ArrayList(); + final double[] posteriors = new double[2]; + final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); + final List mle = new ArrayList(); + final Map log10pNonRefByAllele = new HashMap(); + long runtimeNano = -1; + + GenomeLoc currentLoc = null; + while (true) { + final String line = reader.readLine(); + if (line == null) + return calls; + + final String[] parts = line.split("\t"); + final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); + final String variable = parts[1]; + final String key = parts[2]; + final String value = parts[3]; + + if (currentLoc == null) + currentLoc = lineLoc; + + if (variable.equals("type")) { + if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { + builder.alleles(alleles); + final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; + builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); + builder.genotypes(genotypes); + final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + calls.add(new ExactCall(builder.make(), runtimeNano, result)); + } + break; + } else if (variable.equals("allele")) { + final boolean isRef = key.equals("0"); + alleles.add(Allele.create(value, isRef)); + } else if (variable.equals("PL")) { + final GenotypeBuilder gb = new GenotypeBuilder(key); + gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); + genotypes.add(gb.make()); + } else if (variable.equals("log10PosteriorOfAFEq0")) { + posteriors[0] = Double.valueOf(value); + } else if (variable.equals("log10PosteriorOfAFGt0")) { + posteriors[1] = Double.valueOf(value); + } else if (variable.equals("MLE")) { + mle.add(Integer.valueOf(value)); + } else if (variable.equals("pNonRefByAllele")) { + final Allele a = Allele.create(key); + log10pNonRefByAllele.put(a, Double.valueOf(value)); + } else if (variable.equals("runtime.nano")) { + runtimeNano = Long.valueOf(value); + } else { + // nothing to do + } + } + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 0ac964c9c..c0edee291 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -32,31 +32,97 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; -public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { +/** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * This class implements the conditional likelihoods summation for any number of alt + * alleles, where each alt allele has its EXACT probability of segregating calculated by + * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * After each allele has its probability calculated we compute the joint posterior + * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i + * prior for the ith least likely allele. + */ + public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + /** + * The min. confidence of an allele to be included in the joint posterior. + */ + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + + private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + /** + * Sorts AFCalcResults by their posteriors of AF > 0, so the + */ private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0()); + return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); - final ReferenceDiploidExactAFCalc refModel; + /** + * The AFCalc model we are using to do the bi-allelic computation + */ + final AFCalc biAlleleExactModel; protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); - } - - @Override - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return refModel.makeMaxLikelihood(vc, resultTracker); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); } + /** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ private static class MyAFCalcResult extends AFCalcResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ final List supporting; private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { @@ -68,121 +134,89 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc); - final List independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors); + final List independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors); final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); - return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors); } - protected final double computelog10LikelihoodOfRef(final VariantContext vc) { - // this value just the likelihood of AF == 0 in the special constrained multi-allelic calculation - final List allGLs = getGLs(vc.getGenotypes(), false); - double log10LikelihoodOfHomRef = 0.0; - - // TODO -- can be easily optimized (currently looks at all GLs via getGLs) - for ( int i = 0; i < allGLs.size(); i++ ) { - final double[] GLs = allGLs.get(i); - log10LikelihoodOfHomRef += GLs[0]; - //log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0]; - } - - return log10LikelihoodOfHomRef; - } /** - * Computes the conditional bi-allelic exact results + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC * - * Suppose vc contains 2 alt allele: A* with C and T. This function first computes: - * - * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] - * - * it then computes the conditional probability on AF_c == 0: - * - * (2) P(D | AF_t > 0 && AF_c == 0) - * - * Thinking about this visually, we have the following likelihood matrix where each cell is - * the P(D | AF_c == i && AF_t == j): - * - * 0 AF_c > 0 - * ----------------- - * 0 | | - * |--|------------- - * a | | - * f | | - * _ | | - * t | | - * > | | - * 0 | | - * - * What we really want to know how - * - * (3) P(D | AF_c == 0 & AF_t == 0) - * - * compares with - * - * (4) P(D | AF_c > 0 || AF_t > 0) - * - * This is effectively asking for the value in the upper left vs. the sum of all cells. - * - * The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the - * band at the top where AF_t > 0 and AF_c == 0 - * - * So (4) is actually (1) + (2). - * - * (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating - * - * (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything] - * (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use] - * - * This function implements the conditional likelihoods summation for any number of alt - * alleles (not just the tri-allelic case), where each subsequent variant context is - * further constrained such that each already considered allele x has AF_x == 0 in the - * compute. - * - * @param vc - * @param log10AlleleFrequencyPriors - * @return + * @param vc the VariantContext we want to analyze + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc */ - protected List computeAlleleConditionalExact(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { final List results = new LinkedList(); for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { - final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); results.add(resultTracker); } return results; } - protected List makeAlleleConditionalContexts(final VariantContext vc) { + /** + * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results + */ + private static boolean goodIndependentResult(final VariantContext vc, final List results) { + if ( results.size() != vc.getNAlleles() - 1) return false; + for ( int i = 0; i < results.size(); i++ ) { + if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) + return false; + if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) + return false; + } + + return true; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc) { final int nAltAlleles = vc.getNAlleles() - 1; final List vcs = new LinkedList(); - final List afZeroAlleles = new LinkedList(); for ( int altI = 0; altI < nAltAlleles; altI++ ) { - final Allele altAllele = vc.getAlternateAllele(altI); - final List biallelic = Arrays.asList(vc.getReference(), altAllele); - vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1)); - //afZeroAlleles.add(altAllele); + vcs.add(biallelicCombinedGLs(vc, altI + 1)); } return vcs; } - protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List biallelic, final List afZeroAlleles, final int allele2) { + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param altAlleleIndex index of the alt allele, from 0 == first alt allele + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { if ( rootVC.isBiallelic() ) { - if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles); return rootVC; } else { - final Set allelesToDiscard = new HashSet(rootVC.getAlleleIndices(afZeroAlleles)); final int nAlts = rootVC.getNAlleles() - 1; final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); for ( final Genotype g : rootVC.getGenotypes() ) - biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts)); + biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); - vcb.alleles(biallelic); + final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); + vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); vcb.genotypes(biallelicGenotypes); return vcb.make(); } @@ -203,30 +237,16 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { * XB = AB + BC * BB = BB * - * Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is - * useful in the case where you want to drop alleles (not combine them), such as above: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2) - * - * XX = AA (since X = A and C is dropped) - * XB = AB - * BB = BB - * - * This allows us to recover partial GLs the correspond to any allele in allelesToDiscard having strictly - * AF == 0. - * * @param original the original multi-allelic genotype * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 * @param nAlts the total number of alt alleles * @return a new biallelic genotype with appropriate PLs */ - @Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"}) + @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) - protected Genotype combineGLs(final Genotype original, final int altIndex, final Set allelesToDiscard, final int nAlts ) { + protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { if ( original.isNonInformative() ) - return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make(); + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); @@ -236,10 +256,6 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { for ( int index = 0; index < normalizedPr.length; index++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); - // just continue if we shouldn't include the pair because it's in the discard set - if ( discardAllelePair(pair, allelesToDiscard) ) - continue; - if ( pair.alleleIndex1 == altIndex ) { if ( pair.alleleIndex2 == altIndex ) // hom-alt case @@ -263,11 +279,7 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); } - protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set allelesToDiscard) { - return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2); - } - - protected List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { final ArrayList sorted = new ArrayList(conditionalPNonRefResults); // sort the results, so the most likely allele is first @@ -291,10 +303,11 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * + * TODO -- add more docs + * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, - final double log10LikelihoodsOfACEq0, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); @@ -302,8 +315,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in real space so we need to store values to sum up later - final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles]; + // this value is a sum in log space + double log10PosteriorOfACEq0Sum = 0.0; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); @@ -316,7 +329,8 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); // the AF > 0 case requires us to store the normalized likelihood for later summation - log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0(); + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) + log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -325,14 +339,19 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } - // the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles - final double[] log10LikelihoodsOfAC = new double[]{ - log10LikelihoodsOfACEq0, - MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)}; + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, + // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently + // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 + final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { + // L + prior = posterior => L = poster - prior + log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], + log10PosteriorOfACGt0 - log10PriorsOfAC[1] + }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java index 4de983508..b4e7b2ab1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -1,13 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index a1d6907a2..3740d5d7c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -596,7 +596,6 @@ public class MathUtils { if (keepInLogSpace) { for (int i = 0; i < array.length; i++) { array[i] -= maxValue; - array[i] = Math.max(array[i], LOG10_P_OF_ZERO); } return array; } @@ -613,8 +612,11 @@ public class MathUtils { sum += normalized[i]; for (int i = 0; i < array.length; i++) { double x = normalized[i] / sum; - if (takeLog10OfOutput) - x = Math.max(Math.log10(x), LOG10_P_OF_ZERO); + if (takeLog10OfOutput) { + x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } normalized[i] = x; } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java index 617391714..6fda0245b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -34,7 +34,12 @@ import java.io.PrintStream; * to the provided output stream. For testing/debugging purposes. * * Log entries are of the following form (fields are tab-separated): - * LABEL VALUE KEY1 KEY2 ... KEY_N + * LABEL OPERATION VALUE KEY1 KEY2 ... KEY_N + * + * A header line is written before the log entries giving the dimensions of this NestedIntegerArray. + * It has the form: + * + * # LABEL SIZE_OF_FIRST_DIMENSION SIZE_OF_SECOND_DIMENSION ... SIZE_OF_NTH_DIMENSION * * @author David Roazen */ @@ -43,6 +48,9 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { private PrintStream log; private String logEntryLabel; + public static final String HEADER_LINE_PREFIX = "# "; + public enum NestedIntegerArrayOperation { GET, PUT }; + /** * * @param log output stream to which to log update operations @@ -57,6 +65,37 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { } this.log = log; this.logEntryLabel = logEntryLabel != null ? logEntryLabel : ""; + + // Write the header line recording the dimensions of this NestedIntegerArray: + StringBuilder logHeaderLine = new StringBuilder(); + + logHeaderLine.append(HEADER_LINE_PREFIX); + logHeaderLine.append(this.logEntryLabel); + for ( int dimension : dimensions ) { + logHeaderLine.append("\t"); + logHeaderLine.append(dimension); + } + + this.log.println(logHeaderLine.toString()); + } + + @Override + public T get( final int... keys ) { + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.GET); + logEntry.append("\t"); // empty field for the datum value + + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + log.println(logEntry.toString()); + + return super.get(keys); } @Override @@ -67,6 +106,8 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { logEntry.append(logEntryLabel); logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.PUT); + logEntry.append("\t"); logEntry.append(value); for ( int key : keys ) { logEntry.append("\t"); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index aa801c2b9..67e80cf3c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -298,12 +298,16 @@ public abstract class Genotype implements Comparable { * @return true if all samples PLs are equal and == 0 */ public boolean isNonInformative() { - for ( final int PL : getPL() ) { - if ( PL != 0 ) - return false; - } + if ( getPL() == null ) + return true; + else { + for ( final int PL : getPL() ) { + if ( PL != 0 ) + return false; + } - return true; + return true; + } } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e2ea47d9c..df088a4ad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -349,7 +349,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("beee9457d7cea42006ac45400db5e873")); + Arrays.asList("f3ff7fe0f15f31eadd726c711d6bf3de")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index 24ffde9c3..c1b28314c 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct }); + tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 5b84bfd16..2afa66d9c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -24,7 +24,6 @@ package org.broadinstitute.sting.queue -import function.QFunction import java.io.File import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ @@ -96,18 +95,18 @@ class QCommandLine extends CommandLineProgram with Logging { new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) } - private lazy val qStatusMessengerPluginManager = { - new PluginManager[QStatusMessenger](classOf[QStatusMessenger]) + private lazy val qCommandPlugin = { + new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } - ClassFieldCache.parsingEngine = new ParsingEngine(this) - /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { - val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes() + ClassFieldCache.parsingEngine = this.parser + + val allCommandPlugins = qCommandPlugin.createAllTypes() if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) @@ -115,14 +114,24 @@ class QCommandLine extends CommandLineProgram with Logging { settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") qGraph.initializeWithSettings(settings) - for (statusMessenger <- allStatusMessengers) { - loadArgumentsIntoObject(statusMessenger) + for (commandPlugin <- allCommandPlugins) { + loadArgumentsIntoObject(commandPlugin) } - for (statusMessenger <- allStatusMessengers) { - statusMessenger.started() + for (commandPlugin <- allCommandPlugins) { + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.started() } + // TODO: Default command plugin argument? + val remoteFileConverter = ( + for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) + yield commandPlugin.remoteFileConverter + ).headOption.getOrElse(null) + + if (remoteFileConverter != null) + loadArgumentsIntoObject(remoteFileConverter) + val allQScripts = qScriptPluginManager.createAllTypes() for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) @@ -137,10 +146,15 @@ class QCommandLine extends CommandLineProgram with Logging { case e: Exception => throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) } + + if (remoteFileConverter != null) { + if (remoteFileConverter.convertToRemoteEnabled) + script.mkRemoteOutputs(remoteFileConverter) + } + script.functions.foreach(qGraph.add(_)) logger.info("Added " + script.functions.size + " functions") } - // Execute the job graph qGraph.run() @@ -162,14 +176,16 @@ class QCommandLine extends CommandLineProgram with Logging { if (!success) { logger.info("Done with errors") qGraph.logFailed() - for (statusMessenger <- allStatusMessengers) - statusMessenger.exit("Done with errors") + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.exit("Done with errors") 1 } else { if (settings.run) { allQScripts.foreach(_.pushOutputs()) - for (statusMessenger <- allStatusMessengers) - statusMessenger.done(allQScripts.map(_.remoteOutputs)) + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.done(allQScripts.map(_.remoteOutputs)) } 0 } @@ -189,7 +205,7 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSources = { var plugins = Seq.empty[Class[_]] plugins ++= qScriptPluginManager.getPlugins - plugins ++= qStatusMessengerPluginManager.getPlugins + plugins ++= qCommandPlugin.getPlugins plugins.toArray } @@ -200,11 +216,10 @@ class QCommandLine extends CommandLineProgram with Logging { override def getArgumentSourceName(source: Class[_]) = { if (classOf[QScript].isAssignableFrom(source)) qScriptPluginManager.getName(source.asSubclass(classOf[QScript])) - else if (classOf[QStatusMessenger].isAssignableFrom(source)) - qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger])) + else if (classOf[QCommandPlugin].isAssignableFrom(source)) + qCommandPlugin.getName(source.asSubclass(classOf[QCommandPlugin])) else null - } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala new file mode 100644 index 000000000..499c31554 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -0,0 +1,9 @@ +package org.broadinstitute.sting.queue + +import engine.QStatusMessenger +import util.RemoteFileConverter + +trait QCommandPlugin { + def statusMessenger: QStatusMessenger = null + def remoteFileConverter: RemoteFileConverter = null +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 2dcfb916c..3df61b1e3 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -108,6 +108,24 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon functions.foreach( f => add(f) ) } + /** + * Convert all @Output files to remote output files. + * @param remoteFileConverter Converter for files to remote files. + */ + def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) { + for (field <- outputFields) { + val fieldFile = ClassFieldCache.getFieldFile(this, field) + if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) { + val fieldName = ClassFieldCache.fullName(field) + val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName) + ClassFieldCache.setFieldValue(this, field, remoteFile) + } + } + } + + /** + * Pull all remote files to the local disk. + */ def pullInputs() { val inputs = ClassFieldCache.getFieldFiles(this, inputFields) for (remoteFile <- filterRemoteFiles(inputs)) { @@ -116,6 +134,9 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * Push all remote files from the local disk. + */ def pushOutputs() { val outputs = ClassFieldCache.getFieldFiles(this, outputFields) for (remoteFile <- filterRemoteFiles(outputs)) { @@ -124,6 +145,10 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } } + /** + * List out the remote outputs + * @return the RemoteFile outputs by argument source + */ def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala index 870dd5617..ae3db6860 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala @@ -180,4 +180,15 @@ object ClassFieldCache { case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) } + + // + // other utilities + // + + /** + * Retrieves the fullName of the argument + * @param field ArgumentSource to check + * @return Full name of the argument source + */ + def fullName(field: ArgumentSource) = field.createArgumentDefinitions().get(0).fullName } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala new file mode 100644 index 000000000..c77c242d0 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFileConverter.scala @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File + +trait RemoteFileConverter { + type RemoteFileType <: RemoteFile + + /** + * If this remote file creator is capable of converting to a remote file. + * @return true if ready to convert + */ + def convertToRemoteEnabled: Boolean + + /** + * Converts to a remote file + * @param file The original file + * @param name A "name" to use for the remote file + * @return The new version of this remote file. + */ + def convertToRemote(file: File, name: String): RemoteFileType +}