From 4714ccc28456625853791d56167f114b74eeba22 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Wed, 21 Nov 2012 10:47:35 -0500 Subject: [PATCH 02/31] change the way CombineVariants check the priority arguments in order to throw error when the genotypeMergeOption argument is set to PRIORITIZE but PRIORITY_STRING is not provided --- .../walkers/variantutils/CombineVariants.java | 21 +++++++++---------- .../variantcontext/VariantContextUtils.java | 5 ++++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index b1d8dc91d..e710befc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -134,7 +134,7 @@ public class CombineVariants extends RodWalker implements Tree protected VariantContextWriter vcfWriter = null; @Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false) - public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE; + public VariantContextUtils.GenotypeMergeType genotypeMergeOption = null; @Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false) public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED; @@ -200,13 +200,13 @@ public class CombineVariants extends RodWalker implements Tree } else logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); - if ( PRIORITY_STRING == null ) { + validateAnnotateUnionArguments(); + if ( PRIORITY_STRING == null && genotypeMergeOption == null) { genotypeMergeOption = VariantContextUtils.GenotypeMergeType.UNSORTED; //PRIORITY_STRING = Utils.join(",", vcfRods.keySet()); Deleted by Ami (7/10/12) - logger.info("Priority string not provided, using arbitrary genotyping order: " + PRIORITY_STRING); + logger.info("Priority string not provided, using arbitrary genotyping order: "+priority); } - validateAnnotateUnionArguments(); samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) @@ -228,16 +228,15 @@ public class CombineVariants extends RodWalker implements Tree if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ) + if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ){ priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); - else - priority = new ArrayList(rodNames); + if ( rodNames.size() != priority.size() ) + throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); - if ( rodNames.size() != priority.size() ) - throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); + if ( ! rodNames.containsAll(priority) ) + throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); + } - if ( ! rodNames.containsAll(priority) ) - throw new UserException.BadArgumentValue("rod_priority_list", "Not all priority elements provided as input RODs: " + PRIORITY_STRING); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 81959c998..614b234e9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -840,8 +840,11 @@ public class VariantContextUtils { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + if ( mergeOption == GenotypeMergeType.UNSORTED ){ + if (priorityListOfVCs != null ) + logger.info("Priority string was provided but is not used since GenotypeMergeType is UNSORTED"); return new ArrayList(unsortedVCs); + } else { ArrayList sorted = new ArrayList(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); From 969c995298f2b6cd17872f6887464e8709614f45 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Sat, 1 Dec 2012 00:08:19 -0500 Subject: [PATCH 03/31] work under development - catVariants. Changes to AssessRRQuals based on Eric todo comments. bug fix in CombineVariants --- .../gatk/walkers/qc/AssessReducedQuals.java | 44 ++++++++++++------- .../variantcontext/VariantContextUtils.java | 8 ++-- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 78bcf1228..2c70d44e2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -45,7 +45,6 @@ import java.util.List; public class AssessReducedQuals extends LocusWalker implements TreeReducible { private static final String reduced = "reduced"; - private static final String original = "original"; private static final int originalQualsIndex = 0; private static final int reducedQualsIndex = 1; @@ -55,14 +54,14 @@ public class AssessReducedQuals extends LocusWalker implem @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false) public int qual_epsilon = 0; - @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug mode on") // TODO -- best to make this optional - public int debugLevel = 0; // TODO -- best to make this an enum or boolean + @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug level: NO_DEBUG, PRINT_LOCI,PRINT_PILEUPS", required = false) + public DebugLevel debugLevel = DebugLevel.NO_DEBUG; @Output protected PrintStream out; public void initialize() { - if (debugLevel != 0) + if (debugLevel != DebugLevel.NO_DEBUG) out.println(" Debug mode" + "Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " + "Debug:\tqual_epsilon: "+qual_epsilon); @@ -78,20 +77,20 @@ public class AssessReducedQuals extends LocusWalker implem boolean reportLocus; final int[] quals = getPileupQuals(context.getBasePileup()); - if (debugLevel != 0) + if (debugLevel != DebugLevel.NO_DEBUG) out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]); final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon); final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum); final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum); final int OriginalReducedQualDiff = calcOriginalQuals - calcReducedQuals; reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon; - if(debugLevel != 0 && reportLocus) + if(debugLevel != DebugLevel.NO_DEBUG && reportLocus) out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff); return reportLocus ? ref.getLocus() : null; } - private final int[] getPileupQuals(final ReadBackedPileup readPileup) { + private int[] getPileupQuals(final ReadBackedPileup readPileup) { final int[] quals = new int[2]; String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n", @@ -99,30 +98,29 @@ public class AssessReducedQuals extends LocusWalker implem for( PileupElement p : readPileup ){ final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p,tags) ){ + if ( isGoodRead(p) ){ final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); final int tagIndex = getTagIndex(tags); quals[tagIndex] += tempQual; - if(debugLevel == 2) + if(debugLevel == DebugLevel.PRINT_PILEUPS) printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n"; } } - if(debugLevel == 2){ + if(debugLevel == DebugLevel.PRINT_PILEUPS){ out.println(printPileup[originalQualsIndex]); out.println(printPileup[reducedQualsIndex]); } return quals; } - // TODO -- arguments/variables should be final, not method declaration - private final boolean isGoodRead(PileupElement p, List tags){ - // TODO -- this isn't quite right. You don't need the tags here. Instead, you want to check whether the read itself (which - // TODO -- you can get from the PileupElement) is a reduced read (not all reads from the reduced bam are reduced) and only - // TODO -- for them do you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). - return !p.isDeletion() && (tags.contains(reduced) || (tags.contains(original) && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); + + private boolean isGoodRead(PileupElement p){ + // TODO -- You want to check whether the read itself is a reduced read and only + // TODO -- for them you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). + return !p.isDeletion() && ((p.getRead().isReducedRead()) || (!p.getRead().isReducedRead() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); } - private final int getTagIndex(List tags){ + private int getTagIndex(List tags){ return tags.contains(reduced) ? 1 : 0; } @@ -170,4 +168,16 @@ public class AssessReducedQuals extends LocusWalker implem out.println(sum); return value; } + + public enum DebugLevel { + NO_DEBUG, + /** + * Print locus level information (such as locus quals) and loci with unmatch quals + */ + PRINT_LOCI, + /** + * Print the pileup infomarion of the reduced bam files and the original bam files + */ + PRINT_PILEUPS + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 614b234e9..3b6bd0182 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -451,7 +451,7 @@ public class VariantContextUtils { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; - if ( annotateOrigin && priorityListOfVCs == null ) + if ( annotateOrigin && priorityListOfVCs == null && genotypeMergeOptions == GenotypeMergeType.PRIORITIZE) throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) @@ -597,7 +597,7 @@ public class VariantContextUtils { if ( annotateOrigin ) { // we care about where the call came from String setValue; - if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered + if ( nFiltered == 0 && variantSources.size() == preFilteredVCs.size() ) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; @@ -840,9 +840,9 @@ public class VariantContextUtils { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - if ( mergeOption == GenotypeMergeType.UNSORTED ){ + if ( mergeOption != GenotypeMergeType.PRIORITIZE ){ if (priorityListOfVCs != null ) - logger.info("Priority string was provided but is not used since GenotypeMergeType is UNSORTED"); + logger.info("Priority string was provided but is not used since GenotypeMergeType is not PRIORITIZE"); return new ArrayList(unsortedVCs); } else { From 2b601571e764c7ff9fc9afb9e3b11dcb21fa01e6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Dec 2012 20:59:07 -0500 Subject: [PATCH 06/31] Better error handling in NanoScheduler -- The previous nanoscheduler would deadlock in the case where an Error, not an Exception, was thrown. Errors, like out of memory, would cause the whole system to die. This bugfix resolves that issue --- .../utils/nanoScheduler/InputProducer.java | 9 +++++- .../utils/nanoScheduler/NanoScheduler.java | 29 +++++++++++++++++-- .../nanoScheduler/NanoSchedulerUnitTest.java | 26 ++++++++++++----- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index bd99a9266..45c7c5096 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -103,6 +103,8 @@ class InputProducer implements Runnable { } else { // get the next value, and return it final InputType input = inputReader.next(); + if ( input == null ) + throw new IllegalStateException("inputReader.next() returned a null value, breaking our contract"); inputTimer.stop(); nRead++; return input; @@ -121,6 +123,9 @@ class InputProducer implements Runnable { final InputType value = readNextItem(); if ( value == null ) { + if ( ! readLastValue ) + throw new IllegalStateException("value == null but readLastValue is false!"); + // add the EOF object so our consumer knows we are done in all inputs // note that we do not increase inputID here, so that variable indicates the ID // of the last real value read from the queue @@ -133,8 +138,10 @@ class InputProducer implements Runnable { } latch.countDown(); - } catch (Exception ex) { + } catch (Throwable ex) { errorTracker.notifyOfError(ex); + } finally { +// logger.info("Exiting input thread readLastValue = " + readLastValue); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index d83a23c0f..6d769c2cf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -320,6 +320,7 @@ public class NanoScheduler { while ( true ) { // check that no errors occurred while we were waiting handleErrors(); +// checkForDeadlocks(); try { final ReduceType result = reduceResult.get(100, TimeUnit.MILLISECONDS); @@ -341,6 +342,26 @@ public class NanoScheduler { } } +// private void checkForDeadlocks() { +// if ( deadLockCheckCounter++ % 100 == 0 ) { +// logger.info("Checking for deadlocks..."); +// final ThreadMXBean bean = ManagementFactory.getThreadMXBean(); +// final long[] threadIds = bean.findDeadlockedThreads(); // Returns null if no threads are deadlocked. +// +// if (threadIds != null) { +// final ThreadInfo[] infos = bean.getThreadInfo(threadIds); +// +// logger.error("!!! Deadlock detected !!!!"); +// for (final ThreadInfo info : infos) { +// logger.error("Thread " + info); +// for ( final StackTraceElement elt : info.getStackTrace() ) { +// logger.error("\t" + elt.toString()); +// } +// } +// } +// } +// } + private void handleErrors() { if ( errorTracker.hasAnErrorOccurred() ) { masterExecutor.shutdownNow(); @@ -408,7 +429,8 @@ public class NanoScheduler { // wait for all of the input and map threads to finish return waitForCompletion(inputProducer, reducer); - } catch (Exception ex) { + } catch (Throwable ex) { +// logger.warn("Reduce job got exception " + ex); errorTracker.notifyOfError(ex); return initialValue; } @@ -495,7 +517,7 @@ public class NanoScheduler { // enqueue the result into the mapResultQueue result = new MapResult(mapValue, jobID); - if ( jobID % bufferSize == 0 && progressFunction != null ) + if ( progressFunction != null ) progressFunction.progress(input); } else { // push back the EOF marker so other waiting threads can read it @@ -508,7 +530,8 @@ public class NanoScheduler { mapResultQueue.put(result); final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue); - } catch (Exception ex) { + } catch (Throwable ex) { +// logger.warn("Map job got exception " + ex); errorTracker.notifyOfError(ex); } finally { // we finished a map job, release the job queue semaphore diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index af2e18ad9..d415b8b4c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -243,7 +243,7 @@ public class NanoSchedulerUnitTest extends BaseTest { for ( final int nThreads : Arrays.asList(8) ) { for ( final boolean addDelays : Arrays.asList(true, false) ) { final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(bufSize, nThreads, 1, 1000000, false); - final int maxN = addDelays ? 10000 : 100000; + final int maxN = addDelays ? 1000 : 10000; for ( int nElementsBeforeError = 0; nElementsBeforeError < maxN; nElementsBeforeError += Math.max(nElementsBeforeError / 10, 1) ) { tests.add(new Object[]{nElementsBeforeError, test, addDelays}); } @@ -259,17 +259,22 @@ public class NanoSchedulerUnitTest extends BaseTest { executeTestErrorThrowingInput(10, new NullPointerException(), exampleTest, false); } - @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 10000) + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 1000) public void testInputErrorIsThrown_RSE() throws InterruptedException { executeTestErrorThrowingInput(10, new ReviewedStingException("test"), exampleTest, false); } - @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 10000, invocationCount = 1) - public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 1000, invocationCount = 1) + public void testInputRuntimeExceptionDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { executeTestErrorThrowingInput(nElementsBeforeError, new NullPointerException(), test, addDelays); } - private void executeTestErrorThrowingInput(final int nElementsBeforeError, final RuntimeException ex, final NanoSchedulerBasicTest test, final boolean addDelays) { + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 1000, invocationCount = 1) + public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + executeTestErrorThrowingInput(nElementsBeforeError, new Error(), test, addDelays); + } + + private void executeTestErrorThrowingInput(final int nElementsBeforeError, final Throwable ex, final NanoSchedulerBasicTest test, final boolean addDelays) { logger.warn("executeTestErrorThrowingInput " + nElementsBeforeError + " ex=" + ex + " test=" + test + " addInputDelays=" + addDelays); final NanoScheduler nanoScheduler = test.makeScheduler(); nanoScheduler.execute(new ErrorThrowingIterator(nElementsBeforeError, ex, addDelays), test.makeMap(), test.initReduce(), test.makeReduce()); @@ -279,9 +284,9 @@ public class NanoSchedulerUnitTest extends BaseTest { final int nElementsBeforeError; final boolean addDelays; int i = 0; - final RuntimeException ex; + final Throwable ex; - private ErrorThrowingIterator(final int nElementsBeforeError, RuntimeException ex, boolean addDelays) { + private ErrorThrowingIterator(final int nElementsBeforeError, Throwable ex, boolean addDelays) { this.nElementsBeforeError = nElementsBeforeError; this.ex = ex; this.addDelays = addDelays; @@ -290,7 +295,12 @@ public class NanoSchedulerUnitTest extends BaseTest { @Override public boolean hasNext() { return true; } @Override public Integer next() { if ( i++ > nElementsBeforeError ) { - throw ex; + if ( ex instanceof Error ) + throw (Error)ex; + else if ( ex instanceof RuntimeException ) + throw (RuntimeException)ex; + else + throw new RuntimeException("Bad exception " + ex); } else if ( addDelays ) { maybeDelayMe(i); return i; From 465694078e5fdc8704fa12b588f1a66a0cc97783 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Dec 2012 22:08:01 -0500 Subject: [PATCH 07/31] Major performance improvement to the GATK engine -- The NanoSchedule timing code (in NSRuntimeProfile) was crazy expensive, but never showed up in the profilers. Removed all of the timing code from the NanoScheduler, the NSRuntimeProfile itself, and updated the unit tests. -- For tools that largely pass through data quickly, this change reduces runtimes by as much as 10x. For the RealignerTargetCreator example, the runtime before this commit was 3 hours, and after is 30 minutes (6x improvement). -- Took this opportunity to improve the GATK ProgressMeter. NotifyOfProgress now just keeps track of the maximum position seen, and a separate daemon thread ProgressMeterDaemon periodically wakes up and prints the current progress. This removes all inner loop calls to the GATK timers. -- The history of the bug started here: http://gatkforums.broadinstitute.org/discussion/comment/2402#Comment_2402 --- .../sting/gatk/executive/MicroScheduler.java | 4 -- .../broadinstitute/sting/utils/GenomeLoc.java | 10 +++ .../utils/nanoScheduler/InputProducer.java | 12 ---- .../utils/nanoScheduler/NSRuntimeProfile.java | 67 ------------------- .../utils/nanoScheduler/NanoScheduler.java | 53 +-------------- .../sting/utils/nanoScheduler/Reducer.java | 9 --- .../utils/progressmeter/ProgressMeter.java | 65 ++++++++++++------ .../progressmeter/ProgressMeterDaemon.java | 60 +++++++++++++++++ .../nanoScheduler/InputProducerUnitTest.java | 5 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 11 --- .../utils/nanoScheduler/ReducerUnitTest.java | 5 +- 11 files changed, 122 insertions(+), 179 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 38170040a..8d0cefaa4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -43,7 +43,6 @@ import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; @@ -346,9 +345,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { for ( final TraversalEngine te : allCreatedTraversalEngines) te.shutdown(); - // horrible hack to print nano scheduling information across all nano schedulers, if any were used - NanoScheduler.printCombinedRuntimeProfile(); - allCreatedTraversalEngines.clear(); availableTraversalEngines.clear(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 4d2c26a79..ec82cdef2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -495,4 +495,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome public long sizeOfOverlap( final GenomeLoc that ) { return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); } + + /** + * Returns the maximum GenomeLoc of this and other + * @param other another non-null genome loc + * @return the max of this and other + */ + public GenomeLoc max(final GenomeLoc other) { + final int cmp = this.compareTo(other); + return cmp == -1 ? other : this; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index 45c7c5096..0e0237412 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import java.util.Iterator; import java.util.concurrent.BlockingQueue; @@ -19,11 +18,6 @@ class InputProducer implements Runnable { */ final Iterator inputReader; - /** - * Our timer (may be null) that we use to track our input costs - */ - final SimpleTimer inputTimer; - /** * Where we put our input values for consumption */ @@ -51,16 +45,13 @@ class InputProducer implements Runnable { public InputProducer(final Iterator inputReader, final MultiThreadedErrorTracker errorTracker, - final SimpleTimer inputTimer, final BlockingQueue outputQueue) { if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( errorTracker == null ) throw new IllegalArgumentException("errorTracker cannot be null"); - if ( inputTimer == null ) throw new IllegalArgumentException("inputTimer cannot be null"); if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); this.inputReader = inputReader; this.errorTracker = errorTracker; - this.inputTimer = inputTimer; this.outputQueue = outputQueue; } @@ -94,18 +85,15 @@ class InputProducer implements Runnable { * @throws InterruptedException */ private synchronized InputType readNextItem() throws InterruptedException { - inputTimer.restart(); if ( ! inputReader.hasNext() ) { // we are done, mark ourselves as such and return null readLastValue = true; - inputTimer.stop(); return null; } else { // get the next value, and return it final InputType input = inputReader.next(); if ( input == null ) throw new IllegalStateException("inputReader.next() returned a null value, breaking our contract"); - inputTimer.stop(); nRead++; return input; } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java deleted file mode 100644 index 0926b4c50..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.AutoFormattingTime; -import org.broadinstitute.sting.utils.SimpleTimer; - -/** - * Holds runtime profile (input, read, map) times as tracked by NanoScheduler - * - * User: depristo - * Date: 9/10/12 - * Time: 8:31 PM - */ -public class NSRuntimeProfile { - final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); - final SimpleTimer inputTimer = new SimpleTimer("input"); - final SimpleTimer mapTimer = new SimpleTimer("map"); - final SimpleTimer reduceTimer = new SimpleTimer("reduce"); - - /** - * Combine the elapsed time information from other with this profile - * - * @param other a non-null profile - */ - public void combine(final NSRuntimeProfile other) { - outsideSchedulerTimer.addElapsed(other.outsideSchedulerTimer); - inputTimer.addElapsed(other.inputTimer); - mapTimer.addElapsed(other.mapTimer); - reduceTimer.addElapsed(other.reduceTimer); - } - - /** - * Print the runtime profiling to logger - * - * @param logger - */ - public void log(final Logger logger) { - log1(logger, "Input time", inputTimer); - log1(logger, "Map time", mapTimer); - log1(logger, "Reduce time", reduceTimer); - log1(logger, "Outside time", outsideSchedulerTimer); - } - - /** - * @return the total runtime for all functions of this nano scheduler - */ - //@Ensures("result >= 0.0") - public double totalRuntimeInSeconds() { - return inputTimer.getElapsedTime() - + mapTimer.getElapsedTime() - + reduceTimer.getElapsedTime() - + outsideSchedulerTimer.getElapsedTime(); - } - - /** - * Print to logger.info timing information from timer, with name label - * - * @param label the name of the timer to display. Should be human readable - * @param timer the timer whose elapsed time we will display - */ - //@Requires({"label != null", "timer != null"}) - private void log1(final Logger logger, final String label, final SimpleTimer timer) { - final double myTimeInSec = timer.getElapsedTime(); - final double myTimePercent = myTimeInSec / totalRuntimeInSeconds() * 100; - logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 6d769c2cf..4cc91faa4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -57,16 +57,6 @@ public class NanoScheduler { boolean debug = false; private NSProgressFunction progressFunction = null; - /** - * Tracks the combined runtime profiles across all created nano schedulers - */ - final static private NSRuntimeProfile combinedNSRuntimeProfiler = new NSRuntimeProfile(); - - /** - * The profile specific to this nano scheduler - */ - final private NSRuntimeProfile myNSRuntimeProfile = new NSRuntimeProfile(); - /** * Create a new nanoscheduler with the desire characteristics requested by the argument * @@ -94,9 +84,6 @@ public class NanoScheduler { this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d")); } - - // start timing the time spent outside of the nanoScheduler - myNSRuntimeProfile.outsideSchedulerTimer.start(); } /** @@ -123,11 +110,6 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { - myNSRuntimeProfile.outsideSchedulerTimer.stop(); - - // add my timing information to the combined NS runtime profile - combinedNSRuntimeProfiler.combine(myNSRuntimeProfile); - if ( nThreads > 1 ) { shutdownExecutor("inputExecutor", inputExecutor); shutdownExecutor("mapExecutor", mapExecutor); @@ -137,19 +119,6 @@ public class NanoScheduler { shutdown = true; } - public void printRuntimeProfile() { - myNSRuntimeProfile.log(logger); - } - - public static void printCombinedRuntimeProfile() { - if ( combinedNSRuntimeProfiler.totalRuntimeInSeconds() > 0.1 ) - combinedNSRuntimeProfiler.log(logger); - } - - protected double getTotalRuntime() { - return myNSRuntimeProfile.totalRuntimeInSeconds(); - } - /** * Helper function to cleanly shutdown an execution service, checking that the execution * state is clean when it's done. @@ -245,8 +214,6 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - myNSRuntimeProfile.outsideSchedulerTimer.stop(); - ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { result = executeSingleThreaded(inputReader, map, initialValue, reduce); @@ -254,7 +221,6 @@ public class NanoScheduler { result = executeMultiThreaded(inputReader, map, initialValue, reduce); } - myNSRuntimeProfile.outsideSchedulerTimer.restart(); return result; } @@ -273,28 +239,19 @@ public class NanoScheduler { while ( true ) { // start timer to ensure that both hasNext and next are caught by the timer - myNSRuntimeProfile.inputTimer.restart(); if ( ! inputReader.hasNext() ) { - myNSRuntimeProfile.inputTimer.stop(); break; } else { final InputType input = inputReader.next(); - myNSRuntimeProfile.inputTimer.stop(); // map - myNSRuntimeProfile.mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); - myNSRuntimeProfile.mapTimer.stop(); - if ( i++ % this.bufferSize == 0 && progressFunction != null ) + if ( progressFunction != null ) progressFunction.progress(input); // reduce - myNSRuntimeProfile.reduceTimer.restart(); sum = reduce.apply(mapValue, sum); - myNSRuntimeProfile.reduceTimer.stop(); } } @@ -401,7 +358,7 @@ public class NanoScheduler { // Create the input producer and start it running final InputProducer inputProducer = - new InputProducer(inputReader, errorTracker, myNSRuntimeProfile.inputTimer, inputQueue); + new InputProducer(inputReader, errorTracker, inputQueue); inputExecutor.submit(inputProducer); // a priority queue that stores up to bufferSize elements @@ -410,7 +367,7 @@ public class NanoScheduler { new PriorityBlockingQueue>(); final Reducer reducer - = new Reducer(reduce, errorTracker, myNSRuntimeProfile.reduceTimer, initialValue); + = new Reducer(reduce, errorTracker, initialValue); try { int nSubmittedJobs = 0; @@ -508,11 +465,7 @@ public class NanoScheduler { final InputType input = inputWrapper.getValue(); // map - myNSRuntimeProfile.mapTimer.restart(); - final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); - if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); - myNSRuntimeProfile.mapTimer.stop(); // enqueue the result into the mapResultQueue result = new MapResult(mapValue, jobID); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index 92c1018eb..5cae28187 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -4,7 +4,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import java.util.concurrent.CountDownLatch; import java.util.concurrent.PriorityBlockingQueue; @@ -34,7 +33,6 @@ class Reducer { final CountDownLatch countDownLatch = new CountDownLatch(1); final NSReduceFunction reduce; - final SimpleTimer reduceTimer; final MultiThreadedErrorTracker errorTracker; /** @@ -61,20 +59,16 @@ class Reducer { * reduceTimer * * @param reduce the reduce function to apply - * @param reduceTimer the timer to time the reduce function call * @param initialSum the initial reduce sum */ public Reducer(final NSReduceFunction reduce, final MultiThreadedErrorTracker errorTracker, - final SimpleTimer reduceTimer, final ReduceType initialSum) { if ( errorTracker == null ) throw new IllegalArgumentException("Error tracker cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); - if ( reduceTimer == null ) throw new IllegalArgumentException("reduceTimer cannot be null"); this.errorTracker = errorTracker; this.reduce = reduce; - this.reduceTimer = reduceTimer; this.sum = initialSum; } @@ -125,10 +119,7 @@ class Reducer { nReducesNow++; // apply reduce, keeping track of sum - reduceTimer.restart(); sum = reduce.apply(result.getValue(), sum); - reduceTimer.stop(); - } numJobsReduced++; diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index a8715e242..b69283b9d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.progressmeter; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -143,6 +144,12 @@ public class ProgressMeter { /** We use the SimpleTimer to time our run */ private final SimpleTimer timer = new SimpleTimer(); + private GenomeLoc maxGenomeLoc = null; + private String positionMessage = "starting"; + private long nTotalRecordsProcessed = 0; + + final ProgressMeterDaemon progressMeterDaemon; + /** * Create a new ProgressMeter * @@ -177,21 +184,15 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer + progressMeterDaemon = new ProgressMeterDaemon(this); start(); } /** - * Forward request to notifyOfProgress - * - * Assumes that one cycle has been completed - * - * @param loc our current location. Null means "in unmapped reads" - * @param nTotalRecordsProcessed the total number of records we've processed + * Start up the progress meter, printing initialization message and starting up the + * daemon thread for periodic printing. */ - public void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { - notifyOfProgress(loc, false, nTotalRecordsProcessed); - } - + @Requires("progressMeterDaemon != null") private synchronized void start() { timer.start(); lastProgressPrintTime = timer.currentTime(); @@ -199,6 +200,8 @@ public class ProgressMeter { logger.info("[INITIALIZATION COMPLETE; STARTING PROCESSING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", processingUnitName, processingUnitName)); + + progressMeterDaemon.start(); } /** @@ -216,19 +219,41 @@ public class ProgressMeter { * Synchronized to ensure that even with multiple threads calling notifyOfProgress we still * get one clean stream of meter logs. * + * Note this thread doesn't actually print progress, unless must print is true, but just registers + * the progress itself. A separate printing daemon periodically polls the meter to print out + * progress + * * @param loc Current location, can be null if you are at the end of the processing unit - * @param mustPrint If true, will print out info, regardless of time interval * @param nTotalRecordsProcessed the total number of records we've processed */ - private synchronized void notifyOfProgress(final GenomeLoc loc, boolean mustPrint, final long nTotalRecordsProcessed) { + public synchronized void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + // weird comparison to ensure that loc == null (in unmapped reads) is keep before maxGenomeLoc == null (on startup) + this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc)); + this.nTotalRecordsProcessed = Math.max(this.nTotalRecordsProcessed, nTotalRecordsProcessed); + + // a pretty name for our position + this.positionMessage = maxGenomeLoc == null + ? "unmapped reads" + : String.format("%s:%d", maxGenomeLoc.getContig(), maxGenomeLoc.getStart()); + } + + /** + * Actually try to print out progress + * + * This function may print out if the progress print is due, but if not enough time has elapsed + * since the last print we will not print out information. + * + * @param mustPrint if true, progress will be printed regardless of the last time we printed progress + */ + protected synchronized void printProgress(final boolean mustPrint) { final long curTime = timer.currentTime(); final boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); final boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - final ProgressMeterData progressData = takeProgressSnapshot(loc, nTotalRecordsProcessed); + final ProgressMeterData progressData = takeProgressSnapshot(maxGenomeLoc, nTotalRecordsProcessed); final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.getElapsedSeconds()); final AutoFormattingTime bpRate = new AutoFormattingTime(progressData.secondsPerMillionBP()); @@ -241,13 +266,8 @@ public class ProgressMeter { lastProgressPrintTime = curTime; updateLoggerPrintFrequency(estTotalRuntime.getTimeInSeconds()); - // a pretty name for our position - final String posName = loc == null - ? (mustPrint ? "done" : "unmapped reads") - : String.format("%s:%d", loc.getContig(), loc.getStart()); - logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", - posName, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, + positionMessage, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -296,13 +316,18 @@ public class ProgressMeter { */ public void notifyDone(final long nTotalRecordsProcessed) { // print out the progress meter - notifyOfProgress(null, true, nTotalRecordsProcessed); + this.nTotalRecordsProcessed = nTotalRecordsProcessed; + this.positionMessage = "done"; + printProgress(true); logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", timer.getElapsedTime(), timer.getElapsedTime() / 60, timer.getElapsedTime() / 3600)); if ( performanceLog != null ) performanceLog.close(); + + // shutdown our daemon thread + progressMeterDaemon.done(); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java new file mode 100644 index 000000000..16887400a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.utils.progressmeter; + +/** + * Daemon thread that periodically prints the progress of the progress meter + * + * User: depristo + * Date: 12/4/12 + * Time: 9:16 PM + */ +public final class ProgressMeterDaemon extends Thread { + /** + * How frequently should we poll and print progress? + */ + private final static long POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + + /** + * Are we to continue periodically printing status, or should we shut down? + */ + boolean done = false; + + /** + * The meter we will call print on + */ + final ProgressMeter meter; + + /** + * Create a new ProgressMeterDaemon printing progress for meter + * @param meter the progress meter to print progress of + */ + public ProgressMeterDaemon(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); + this.meter = meter; + setDaemon(true); + setName("ProgressMeterDaemon"); + } + + /** + * Tells this daemon thread to shutdown at the next opportunity, as the progress + * metering is complete. + */ + public final void done() { + this.done = true; + } + + /** + * Start up the ProgressMeterDaemon, polling every tens of seconds to print, if + * necessary, the provided progress meter. Never exits until the JVM is complete, + * or done() is called, as the thread is a daemon thread + */ + public void run() { + while (! done) { + meter.printProgress(false); + try { + Thread.sleep(POLL_FREQUENCY_MILLISECONDS); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java index 6c59f1585..489adab6b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -46,7 +45,7 @@ public class InputProducerUnitTest extends BaseTest { final LinkedBlockingDeque.InputValue> readQueue = new LinkedBlockingDeque.InputValue>(queueSize); - final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), readQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); @@ -94,7 +93,7 @@ public class InputProducerUnitTest extends BaseTest { final LinkedBlockingDeque.InputValue> readQueue = new LinkedBlockingDeque.InputValue>(); - final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), readQueue); final ExecutorService es = Executors.newSingleThreadExecutor(); es.submit(ip); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index d415b8b4c..61e8ec0a1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -188,17 +188,6 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); - - // TODO -- need to enable only in the case where there's serious time spend in - // TODO -- read /map / reduce, otherwise the "outside" timer doesn't add up - final double myTimeEstimate = timer.getElapsedTime(); - final double tolerance = 0.1; - if ( false && myTimeEstimate > 0.1 ) { - Assert.assertTrue(nanoScheduler.getTotalRuntime() > myTimeEstimate * tolerance, - "NanoScheduler said that the total runtime was " + nanoScheduler.getTotalRuntime() - + " but the overall test time was " + myTimeEstimate + ", beyond our tolerance factor of " - + tolerance); - } } @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java index 39133d1ed..6c17aa78d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -93,7 +92,7 @@ public class ReducerUnitTest extends BaseTest { final List>> jobGroups = Utils.groupList(allJobs, groupSize); final ReduceSumTest reduce = new ReduceSumTest(); - final Reducer reducer = new Reducer(reduce, new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + final Reducer reducer = new Reducer(reduce, new MultiThreadedErrorTracker(), 0); final TestWaitingForFinalReduce waitingThread = new TestWaitingForFinalReduce(reducer, expectedSum(allJobs)); final ExecutorService es = Executors.newSingleThreadExecutor(); @@ -155,7 +154,7 @@ public class ReducerUnitTest extends BaseTest { private void runSettingJobIDTwice() throws Exception { final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); - final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); reducer.setTotalJobCount(10); reducer.setTotalJobCount(15); From d0cab795b7784bca88e5543d24b85e760ef60549 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Dec 2012 14:49:01 -0500 Subject: [PATCH 08/31] Got caught in the middle of a bad integration test, that was fixed in independent push. Moved test bam into testdata. --- .../gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9e9c7e37e..7459d131b 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -436,7 +436,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, Arrays.asList("32f18ba50406cd8c8069ba07f2f89558")); executeTest("test calling on reads with Ns in CIGAR", spec); } From 234ff645566cce18307dc4a8d2588bb4ce9ea249 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 5 Dec 2012 15:52:57 -0500 Subject: [PATCH 09/31] Changes to AssessNA12878 to allow for 100s of input callsets to assess against the database. --- .../src/org/broadinstitute/sting/gatk/report/GATKReport.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 7ae2bb453..605a6680f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -343,7 +343,7 @@ public class GATKReport { GATKReportTable table = tables.firstEntry().getValue(); if ( table.getNumColumns() != values.length ) - throw new ReviewedStingException("The number of arguments in writeRow() " + values.length + " must match the number of columns in the table" + table.getNumColumns()); + throw new ReviewedStingException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); final int rowIndex = table.getNumRows(); for ( int i = 0; i < values.length; i++ ) From dbf721968d8196f4ea425c27b74fb013db64b199 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Dec 2012 21:35:27 -0500 Subject: [PATCH 10/31] PrintReads large-scale test to protect against another major low-level performance issue --- .../walkers/PrintReadsLargeScaleTest.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java new file mode 100755 index 000000000..ad7ac56f9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsLargeScaleTest.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.ArrayList; + +public class PrintReadsLargeScaleTest extends WalkerTest { + @Test( timeOut = 1000 * 60 * 60 * 20 ) // 60 seconds * 60 seconds / minute * 20 minutes + public void testRealignerTargetCreator() { + WalkerTestSpec spec = new WalkerTestSpec( + "-R " + b37KGReference + + " -T PrintReads" + + " -I " + evaluationDataLocation + "CEUTrio.HiSeq.WEx.b37.NA12892.clean.dedup.recal.1.bam" + + " -o /dev/null", + 0, + new ArrayList(0)); + executeTest("testPrintReadsWholeExomeChr1", spec); + } +} From 406adb8d44079b6f2928f0432eaf792a764b3eb6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Dec 2012 23:15:36 -0500 Subject: [PATCH 11/31] The allele biased downsampling should not abort if there's a reduced read. Rather it should always keep the RR and downsample only original reads in the pileup. --- .../gatk/downsampling/AlleleBiasedDownsamplingUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index a61614481..94f6ff649 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -56,11 +56,14 @@ public class AlleleBiasedDownsamplingUtils { for ( int i = 0; i < 4; i++ ) alleleStratifiedElements[i] = new ArrayList(); + // keep all of the reduced reads + final ArrayList reducedReadPileups = new ArrayList(); + // start by stratifying the reads by the alleles they represent at this position for( final PileupElement pe : pileup ) { - // abort if we have a reduced read - we do not want to remove it! + // we do not want to remove a reduced read if ( pe.getRead().isReducedRead() ) - return pileup; + reducedReadPileups.add(pe); final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); if ( baseIndex != -1 ) @@ -76,6 +79,7 @@ public class AlleleBiasedDownsamplingUtils { return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); } }); + elementsToKeep.addAll(reducedReadPileups); // make a listing of allele counts final int[] alleleCounts = new int[4]; From cc4e3ec58927014f95758ed28d20b1715ec23276 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 5 Dec 2012 12:45:02 -0500 Subject: [PATCH 13/31] Update TODO list --- .../traversals/TraverseActiveRegionsTest.java | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index a65b0cb45..b08210ea6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -109,6 +109,8 @@ public class TraverseActiveRegionsTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + // TODO: test shard boundaries + intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); @@ -139,8 +141,6 @@ public class TraverseActiveRegionsTest extends BaseTest { List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - - // TODO: more tests and edge cases } private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { @@ -171,8 +171,6 @@ public class TraverseActiveRegionsTest extends BaseTest { Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); - - // TODO: more tests and edge cases } private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { @@ -279,8 +277,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -348,8 +344,6 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -418,8 +412,11 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); getRead(region, "simple20"); + } - // TODO: more tests and edge cases + @Test + public void testUnmappedReads() { + // TODO } private void verifyReadNotPlaced(ActiveRegion region, String readName) { From 3b0e3767bfbe4de06139f5de5dbd37519775983d Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 7 Dec 2012 13:46:41 -0500 Subject: [PATCH 14/31] Add a test for a read that extends off the end of chr1 --- .../traversals/TraverseActiveRegionsTest.java | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index b08210ea6..69907d485 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -117,6 +117,8 @@ public class TraverseActiveRegionsTest extends BaseTest { intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); @@ -128,6 +130,7 @@ public class TraverseActiveRegionsTest extends BaseTest { reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("end_of_chr1", "1", 249250600, 249250700)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); // required by LocusIteratorByState, and I prefer to list them in test case order above @@ -229,6 +232,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -243,6 +247,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -254,6 +259,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -265,6 +271,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -276,6 +295,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); } @@ -296,6 +316,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -310,6 +331,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -321,6 +343,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -332,6 +355,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -343,6 +379,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); } @@ -364,6 +401,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -378,6 +416,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -389,6 +428,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -400,6 +440,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -411,6 +464,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); } From bf8421eeb72c3c52f4fe595b90bd725c4d20a08b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Dec 2012 16:02:58 -0500 Subject: [PATCH 16/31] Fixes GSA-671 / AFCalcResult.log10pNonRefByAllele should really be log10pRefByAllele -- The current implementation of AFCalcResult contains a map from allele -> log10pNonRef. The only use of this field is to support the isPolymorphic function per allele. The call to this function looks like isPolymorphic(allele, QUAL). The QUAL is a phred-scaled threshold where you want to include alleles where the log10pNonRef >= QUAL (appropriately transformed). The problem is that when log10pNonRef is large, it quickly gets set to 0, while it's complementary log10pRef value has a meaningful log10 value. For example, if log10pRef = -100 (not an uncommonly large value), log10pNonRef = 0.0. -- In order to preserve precision and allow us to more finally differentiate high QUAL from low QUAL (but still poly) sites we should store log10pRef values instead, and test that log10pRef <= threshold. -- See https://jira.broadinstitute.org/browse/GSA-671 for more information. --- .../afcalc/AFCalcPerformanceTest.java | 4 +- .../afcalc/AFCalcResultUnitTest.java | 6 +-- .../genotyper/afcalc/AFCalcUnitTest.java | 2 +- ...dentAllelesDiploidExactAFCalcUnitTest.java | 2 +- .../genotyper/afcalc/AFCalcResult.java | 38 +++++++++++-------- .../genotyper/afcalc/ExactCallLogger.java | 10 ++--- .../IndependentAllelesDiploidExactAFCalc.java | 10 ++--- .../afcalc/OriginalDiploidExactAFCalc.java | 6 +-- .../genotyper/afcalc/StateTracker.java | 8 ++-- 9 files changed, 47 insertions(+), 39 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index e9ed6b153..0a3512aa6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -240,8 +240,8 @@ public class AFCalcPerformanceTest { if ( a.isNonReference() ) { final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : ""; logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); - final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; - logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost); } } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 96e055e92..016926e12 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -83,8 +83,8 @@ public class AFCalcResultUnitTest extends BaseTest { List tests = new ArrayList(); final List pValues = new LinkedList(); - for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) - for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999, 1 - 1e-4, 1 - 1e-5, 1 - 1e-6) ) + for ( final double espilon : Arrays.asList(-1e-7, 0.0, 1e-7) ) pValues.add(p + espilon); for ( final double pNonRef : pValues ) { @@ -106,7 +106,7 @@ public class AFCalcResultUnitTest extends BaseTest { alleles, MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), log10Even, - Collections.singletonMap(C, Math.log10(pNonRef))); + Collections.singletonMap(C, Math.log10(1 - pNonRef))); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 2d346e548..7ee909fe0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -681,7 +681,7 @@ public class AFCalcUnitTest extends BaseTest { // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt)); - Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt)); + Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFEq0ForAllele(alt)); } } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index 391c99990..663471106 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -148,7 +148,7 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { for ( int i = 0; i < log10LAlleles.size(); i++ ) { final double log10LAllele1 = log10LAlleles.get(i); final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0)); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); originalPriors.add(result1); pNonRefN.add(log10pNonRef*(i+1)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index dbb0e8cdd..d6a5cb16d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -52,7 +51,7 @@ public class AFCalcResult { private final double[] log10PriorsOfAC; private final double[] log10PosteriorsOfAC; - private final Map log10pNonRefByAllele; + private final Map log10pRefByAllele; /** * The AC values for all ALT alleles at the MLE @@ -74,16 +73,16 @@ public class AFCalcResult { final List allelesUsedInGenotyping, final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC, - final Map log10pNonRefByAllele) { + final Map log10pRefByAllele) { if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); - if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); - if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( log10pRefByAllele == null ) throw new IllegalArgumentException("log10pRefByAllele cannot be null"); + if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); @@ -94,7 +93,7 @@ public class AFCalcResult { this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); - this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + this.log10pRefByAllele = new HashMap(log10pRefByAllele); } /** @@ -104,7 +103,7 @@ public class AFCalcResult { * @return */ public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { - return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); } /** @@ -219,7 +218,7 @@ public class AFCalcResult { public String toString() { final List byAllele = new LinkedList(); for ( final Allele a : getAllelesUsedInGenotyping() ) - if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFEq0ForAllele(a))); return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); } @@ -237,7 +236,7 @@ public class AFCalcResult { */ @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { - return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; + return getLog10PosteriorOfAFEq0ForAllele(allele) < log10minPNonRef; } /** @@ -245,7 +244,7 @@ public class AFCalcResult { */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); - final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + final double log10Threshold = minPNonRefPhredScaledQual / -10; return isPolymorphic(allele, log10Threshold); } @@ -263,7 +262,16 @@ public class AFCalcResult { } /** - * Returns the log10 probability that allele is segregating + * Returns the log10 probability that allele is not segregating + * + * Note that this function is p not segregating so that we can store + * internally the log10 value of AF == 0, which grows very quickly + * negative and yet has sufficient resolution for high confidence tests. + * For example, if log10pRef == -100, not an unreasonably high number, + * if we tried to store log10pNonRef we'd be looking at 1 - 10^-100, which + * quickly underflows to 1. So the logic here is backward from what + * you really want (the p of segregating) but we do that for numerical + * reasons * * Unlike the sites-level annotation, this calculation is specific to allele, and can be * used to separately determine how much evidence there is that allele is independently @@ -272,11 +280,11 @@ public class AFCalcResult { * evidence for one allele but not so much for any other allele * * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping - * @return the log10 probability that allele is segregating at this site + * @return the log10 probability that allele is not segregating at this site */ @Ensures("MathUtils.goodLog10Probability(result)") - public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { - final Double log10pNonRef = log10pNonRefByAllele.get(allele); + public double getLog10PosteriorOfAFEq0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); return log10pNonRef; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index f13fe4429..b138ddf70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -77,7 +77,7 @@ public class ExactCallLogger implements Cloneable { for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { if ( allele.isNonReference() ) { printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); - printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + printCallElement(vc, "pRefByAllele", allele, result.getLog10PosteriorOfAFEq0ForAllele(allele)); } } @@ -123,7 +123,7 @@ public class ExactCallLogger implements Cloneable { final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); final List mle = new ArrayList(); - final Map log10pNonRefByAllele = new HashMap(); + final Map log10pRefByAllele = new HashMap(); long runtimeNano = -1; GenomeLoc currentLoc = null; @@ -148,7 +148,7 @@ public class ExactCallLogger implements Cloneable { builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); - final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; @@ -165,9 +165,9 @@ public class ExactCallLogger implements Cloneable { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); - } else if (variable.equals("pNonRefByAllele")) { + } else if (variable.equals("pRefByAllele")) { final Allele a = Allele.create(key); - log10pNonRefByAllele.put(a, Double.valueOf(value)); + log10pRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index d0b801a20..937ef2ffc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -125,8 +125,8 @@ import java.util.*; */ final List supporting; - private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); this.supporting = supporting; } } @@ -323,7 +323,7 @@ import java.util.*; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); final int[] alleleCountsOfMLE = new int[nAltAlleles]; final double[] log10PriorsOfAC = new double[2]; - final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + final Map log10pRefByAllele = new HashMap(nAltAlleles); // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; @@ -348,7 +348,7 @@ import java.util.*; log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); // trivial -- update the number of evaluations nEvaluations += sortedResultWithThetaNPriors.nEvaluations; @@ -384,6 +384,6 @@ import java.util.*; MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // priors incorporate multiple alt alleles, must be normalized MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pNonRefByAllele, sortedResultsWithThetaNPriors); + log10pRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index fc26111e0..67cc79646 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -30,13 +30,13 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); + final double log10PRef = log10Posteriors[1] > log10Posteriors[0] ? MathUtils.LOG10_P_OF_ZERO : 0.0; + final Map log10pRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PRef); return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), MathUtils.normalizeFromLog10(log10Likelihoods, true), MathUtils.normalizeFromLog10(log10Priors, true), - log10pNonRefByAllele); + log10pRefByAllele); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index b82ec1d29..ad6361a3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -165,14 +165,14 @@ final class StateTracker { final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + final Map log10pRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); + final double log10PRef = alleleCountsOfMAP[i] > 0 ? -10000 : 0; // TODO -- a total hack but in effect what the old behavior was + log10pRefByAllele.put(allele, log10PRef); } - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pRefByAllele); } // -------------------------------------------------------------------------------- From 574d5b467fccaf48faedf8e1108afc5f930750f8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 9 Dec 2012 02:09:34 -0500 Subject: [PATCH 18/31] Bug fix for indel HMM: protect against situation where long reads (e.g. Sanger) in a pileup can lead to a read starting after the haplotype end for a given haplotype. --- .../afcalc/AFCalcPerformanceTest.java | 4 +- .../afcalc/AFCalcResultUnitTest.java | 6 +- .../genotyper/afcalc/AFCalcUnitTest.java | 2 +- ...dentAllelesDiploidExactAFCalcUnitTest.java | 2 +- .../sting/gatk/report/GATKReport.java | 2 +- .../genotyper/afcalc/AFCalcResult.java | 38 ++++++---- .../genotyper/afcalc/ExactCallLogger.java | 10 +-- .../IndependentAllelesDiploidExactAFCalc.java | 10 +-- .../afcalc/OriginalDiploidExactAFCalc.java | 6 +- .../genotyper/afcalc/StateTracker.java | 8 +-- .../indels/PairHMMIndelErrorModel.java | 7 +- .../traversals/TraverseActiveRegionsTest.java | 69 ++++++++++++++++--- 12 files changed, 113 insertions(+), 51 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java index e9ed6b153..0a3512aa6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -240,8 +240,8 @@ public class AFCalcPerformanceTest { if ( a.isNonReference() ) { final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : ""; logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE); - final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; - logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost); + final String warningmePost = call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) == 0 && result.getLog10PosteriorOfAFEq0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : ""; + logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFEq0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFEq0ForAllele(a) + warningmePost); } } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 96e055e92..016926e12 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -83,8 +83,8 @@ public class AFCalcResultUnitTest extends BaseTest { List tests = new ArrayList(); final List pValues = new LinkedList(); - for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) - for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999, 1 - 1e-4, 1 - 1e-5, 1 - 1e-6) ) + for ( final double espilon : Arrays.asList(-1e-7, 0.0, 1e-7) ) pValues.add(p + espilon); for ( final double pNonRef : pValues ) { @@ -106,7 +106,7 @@ public class AFCalcResultUnitTest extends BaseTest { alleles, MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), log10Even, - Collections.singletonMap(C, Math.log10(pNonRef))); + Collections.singletonMap(C, Math.log10(1 - pNonRef))); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 2d346e548..7ee909fe0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -681,7 +681,7 @@ public class AFCalcUnitTest extends BaseTest { // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt)); - Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt)); + Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFEq0ForAllele(alt)); } } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index 391c99990..663471106 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -148,7 +148,7 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { for ( int i = 0; i < log10LAlleles.size(); i++ ) { final double log10LAllele1 = log10LAlleles.get(i); final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); - final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0)); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, -10000.0)); originalPriors.add(result1); pNonRefN.add(log10pNonRef*(i+1)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 7ae2bb453..605a6680f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -343,7 +343,7 @@ public class GATKReport { GATKReportTable table = tables.firstEntry().getValue(); if ( table.getNumColumns() != values.length ) - throw new ReviewedStingException("The number of arguments in writeRow() " + values.length + " must match the number of columns in the table" + table.getNumColumns()); + throw new ReviewedStingException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); final int rowIndex = table.getNumRows(); for ( int i = 0; i < values.length; i++ ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index dbb0e8cdd..d6a5cb16d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -52,7 +51,7 @@ public class AFCalcResult { private final double[] log10PriorsOfAC; private final double[] log10PosteriorsOfAC; - private final Map log10pNonRefByAllele; + private final Map log10pRefByAllele; /** * The AC values for all ALT alleles at the MLE @@ -74,16 +73,16 @@ public class AFCalcResult { final List allelesUsedInGenotyping, final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC, - final Map log10pNonRefByAllele) { + final Map log10pRefByAllele) { if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); - if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); - if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( log10pRefByAllele == null ) throw new IllegalArgumentException("log10pRefByAllele cannot be null"); + if ( log10pRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pRefByAllele has the wrong number of elements: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pRefByAllele doesn't contain all of the alleles used in genotyping: log10pRefByAllele " + log10pRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); @@ -94,7 +93,7 @@ public class AFCalcResult { this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); - this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + this.log10pRefByAllele = new HashMap(log10pRefByAllele); } /** @@ -104,7 +103,7 @@ public class AFCalcResult { * @return */ public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { - return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); } /** @@ -219,7 +218,7 @@ public class AFCalcResult { public String toString() { final List byAllele = new LinkedList(); for ( final Allele a : getAllelesUsedInGenotyping() ) - if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFEq0ForAllele(a))); return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele)); } @@ -237,7 +236,7 @@ public class AFCalcResult { */ @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { - return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; + return getLog10PosteriorOfAFEq0ForAllele(allele) < log10minPNonRef; } /** @@ -245,7 +244,7 @@ public class AFCalcResult { */ public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); - final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + final double log10Threshold = minPNonRefPhredScaledQual / -10; return isPolymorphic(allele, log10Threshold); } @@ -263,7 +262,16 @@ public class AFCalcResult { } /** - * Returns the log10 probability that allele is segregating + * Returns the log10 probability that allele is not segregating + * + * Note that this function is p not segregating so that we can store + * internally the log10 value of AF == 0, which grows very quickly + * negative and yet has sufficient resolution for high confidence tests. + * For example, if log10pRef == -100, not an unreasonably high number, + * if we tried to store log10pNonRef we'd be looking at 1 - 10^-100, which + * quickly underflows to 1. So the logic here is backward from what + * you really want (the p of segregating) but we do that for numerical + * reasons * * Unlike the sites-level annotation, this calculation is specific to allele, and can be * used to separately determine how much evidence there is that allele is independently @@ -272,11 +280,11 @@ public class AFCalcResult { * evidence for one allele but not so much for any other allele * * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping - * @return the log10 probability that allele is segregating at this site + * @return the log10 probability that allele is not segregating at this site */ @Ensures("MathUtils.goodLog10Probability(result)") - public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { - final Double log10pNonRef = log10pNonRefByAllele.get(allele); + public double getLog10PosteriorOfAFEq0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); return log10pNonRef; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java index f13fe4429..b138ddf70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java @@ -77,7 +77,7 @@ public class ExactCallLogger implements Cloneable { for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { if ( allele.isNonReference() ) { printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele)); - printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + printCallElement(vc, "pRefByAllele", allele, result.getLog10PosteriorOfAFEq0ForAllele(allele)); } } @@ -123,7 +123,7 @@ public class ExactCallLogger implements Cloneable { final double[] posteriors = new double[2]; final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); final List mle = new ArrayList(); - final Map log10pNonRefByAllele = new HashMap(); + final Map log10pRefByAllele = new HashMap(); long runtimeNano = -1; GenomeLoc currentLoc = null; @@ -148,7 +148,7 @@ public class ExactCallLogger implements Cloneable { builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); builder.genotypes(genotypes); final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); - final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pRefByAllele); calls.add(new ExactCall(builder.make(), runtimeNano, result)); } break; @@ -165,9 +165,9 @@ public class ExactCallLogger implements Cloneable { posteriors[1] = Double.valueOf(value); } else if (variable.equals("MLE")) { mle.add(Integer.valueOf(value)); - } else if (variable.equals("pNonRefByAllele")) { + } else if (variable.equals("pRefByAllele")) { final Allele a = Allele.create(key); - log10pNonRefByAllele.put(a, Double.valueOf(value)); + log10pRefByAllele.put(a, Double.valueOf(value)); } else if (variable.equals("runtime.nano")) { runtimeNano = Long.valueOf(value); } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index d0b801a20..937ef2ffc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -125,8 +125,8 @@ import java.util.*; */ final List supporting; - private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); this.supporting = supporting; } } @@ -323,7 +323,7 @@ import java.util.*; final int nAltAlleles = sortedResultsWithThetaNPriors.size(); final int[] alleleCountsOfMLE = new int[nAltAlleles]; final double[] log10PriorsOfAC = new double[2]; - final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + final Map log10pRefByAllele = new HashMap(nAltAlleles); // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; @@ -348,7 +348,7 @@ import java.util.*; log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior - log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); // trivial -- update the number of evaluations nEvaluations += sortedResultWithThetaNPriors.nEvaluations; @@ -384,6 +384,6 @@ import java.util.*; MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // priors incorporate multiple alt alleles, must be normalized MathUtils.normalizeFromLog10(log10PriorsOfAC, true), - log10pNonRefByAllele, sortedResultsWithThetaNPriors); + log10pRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index fc26111e0..67cc79646 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -30,13 +30,13 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); + final double log10PRef = log10Posteriors[1] > log10Posteriors[0] ? MathUtils.LOG10_P_OF_ZERO : 0.0; + final Map log10pRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PRef); return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), MathUtils.normalizeFromLog10(log10Likelihoods, true), MathUtils.normalizeFromLog10(log10Priors, true), - log10pNonRefByAllele); + log10pRefByAllele); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index b82ec1d29..ad6361a3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -165,14 +165,14 @@ final class StateTracker { final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + final Map log10pRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); + final double log10PRef = alleleCountsOfMAP[i] > 0 ? -10000 : 0; // TODO -- a total hack but in effect what the old behavior was + log10pRefByAllele.put(allele, log10PRef); } - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pRefByAllele); } // -------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 7b797432d..848aaf8a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -287,6 +287,9 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) { startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes; } + else if (startLocationInRefForHaplotypes > ref.getWindow().getStop()) { + startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely; + } if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) { stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context @@ -338,6 +341,8 @@ public class PairHMMIndelErrorModel { if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); @@ -347,8 +352,6 @@ public class PairHMMIndelErrorModel { System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index a65b0cb45..69907d485 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -109,12 +109,16 @@ public class TraverseActiveRegionsTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + // TODO: test shard boundaries + intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); @@ -126,6 +130,7 @@ public class TraverseActiveRegionsTest extends BaseTest { reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("end_of_chr1", "1", 249250600, 249250700)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); // required by LocusIteratorByState, and I prefer to list them in test case order above @@ -139,8 +144,6 @@ public class TraverseActiveRegionsTest extends BaseTest { List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - - // TODO: more tests and edge cases } private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { @@ -171,8 +174,6 @@ public class TraverseActiveRegionsTest extends BaseTest { Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); - - // TODO: more tests and edge cases } private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { @@ -231,6 +232,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -245,6 +247,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -256,6 +259,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -267,6 +271,19 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -278,9 +295,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -300,6 +316,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -314,6 +331,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -325,6 +343,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -336,6 +355,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -347,9 +379,8 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); - - // TODO: more tests and edge cases } @Test @@ -370,6 +401,7 @@ public class TraverseActiveRegionsTest extends BaseTest { // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // end_of_chr1: Primary in 1:249250600-249250621 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -384,6 +416,7 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); @@ -395,6 +428,7 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); @@ -406,6 +440,19 @@ public class TraverseActiveRegionsTest extends BaseTest { getRead(region, "boundary_unequal"); getRead(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); + verifyReadNotPlaced(region, "simple20"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 249250600, 249250621)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + getRead(region, "end_of_chr1"); verifyReadNotPlaced(region, "simple20"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -417,9 +464,13 @@ public class TraverseActiveRegionsTest extends BaseTest { verifyReadNotPlaced(region, "boundary_unequal"); verifyReadNotPlaced(region, "extended_and_np"); verifyReadNotPlaced(region, "outside_intervals"); + verifyReadNotPlaced(region, "end_of_chr1"); getRead(region, "simple20"); + } - // TODO: more tests and edge cases + @Test + public void testUnmappedReads() { + // TODO } private void verifyReadNotPlaced(ActiveRegion region, String readName) { From 3a420d163e0e07b23ca29acedd83c572534877de Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Sun, 9 Dec 2012 23:40:03 -0500 Subject: [PATCH 19/31] (1) changes in catVariants (work still under development) (2) changes to CV to throw an error when GenotypeMergeType is PRIORITIZE but no priority (rod_priority_list) is not given. Reported by TechnicalVault on the forum on Nov 14 2012 --- .../walkers/variantutils/CombineVariants.java | 2 +- .../variantcontext/VariantContextUtils.java | 45 ++----------------- 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index e710befc9..1d4913769 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -228,7 +228,7 @@ public class CombineVariants extends RodWalker implements Tree if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ){ + if ( PRIORITY_STRING != null || genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ){ priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); if ( rodNames.size() != priority.size() ) throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 14fe0a184..81959c998 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -451,7 +451,7 @@ public class VariantContextUtils { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; - if ( annotateOrigin && priorityListOfVCs == null && genotypeMergeOptions == GenotypeMergeType.PRIORITIZE) + if ( annotateOrigin && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) @@ -597,7 +597,7 @@ public class VariantContextUtils { if ( annotateOrigin ) { // we care about where the call came from String setValue; - if ( nFiltered == 0 && variantSources.size() == preFilteredVCs.size() ) // nothing was unfiltered + if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; @@ -840,11 +840,8 @@ public class VariantContextUtils { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - if ( mergeOption != GenotypeMergeType.PRIORITIZE ){ - if (priorityListOfVCs != null ) - logger.info("Priority string was provided but is not used since GenotypeMergeType is not PRIORITIZE"); + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) return new ArrayList(unsortedVCs); - } else { ArrayList sorted = new ArrayList(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); @@ -982,40 +979,6 @@ public class VariantContextUtils { private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc) { - if ( ! vc.isVariant() || vc.isBiallelic() ) - // non variant or biallelics already satisfy the contract - return Collections.singletonList(vc); - else { - final List biallelics = new LinkedList(); - - for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - final List alleles = Arrays.asList(vc.getReference(), alt); - builder.alleles(alleles); - builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); - calculateChromosomeCounts(builder, true); - biallelics.add(reverseTrimAlleles(builder.make())); - } - - return biallelics; - } - } - /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * @@ -1270,7 +1233,7 @@ public class VariantContextUtils { * @param testString String to test * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's */ - public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { + protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { int numRepeats = 0; for (int start = 0; start < testString.length; start += repeatUnit.length) { int end = start + repeatUnit.length; From 46edab6d6adc0e787a9f939a9f3b5e1b9e1bc212 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 26 Nov 2012 12:44:48 -0500 Subject: [PATCH 20/31] Use the new downsampling implementation by default -Switch back to the old implementation, if needed, with --use_legacy_downsampler -LocusIteratorByStateExperimental becomes the new LocusIteratorByState, and the original LocusIteratorByState becomes LegacyLocusIteratorByState -Similarly, the ExperimentalReadShardBalancer becomes the new ReadShardBalancer, with the old one renamed to LegacyReadShardBalancer -Performance improvements: locus traversals used to be 20% slower in the new downsampling implementation, now they are roughly the same speed. -Tests show a very high level of concordance with UG calls from the previous implementation, with some new calls and edge cases that still require more examination. -With the new implementation, can now use -dcov with ReadWalkers to set a limit on the max # of reads per alignment start position per sample. Appropriate value for ReadWalker dcov may be in the single digits for some tools, but this too requires more investigation. --- ...GenotyperGeneralPloidyIntegrationTest.java | 4 +- .../UnifiedGenotyperIntegrationTest.java | 10 +- .../sting/gatk/GenomeAnalysisEngine.java | 18 +- .../sting/gatk/WalkerManager.java | 32 +- .../arguments/GATKArgumentCollection.java | 11 +- .../gatk/datasources/providers/LocusView.java | 9 +- .../gatk/datasources/reads/BAMScheduler.java | 7 +- .../reads/ExperimentalReadShardBalancer.java | 228 -------- .../reads/LegacyReadShardBalancer.java | 129 +++++ .../datasources/reads/ReadShardBalancer.java | 187 ++++-- .../gatk/datasources/reads/SAMDataSource.java | 71 ++- .../gatk/downsampling/DownsamplingMethod.java | 48 +- .../downsampling/PassThroughDownsampler.java | 106 ++++ .../sting/gatk/executive/WindowMaker.java | 13 +- ...l.java => LegacyLocusIteratorByState.java} | 531 +++++++++++++---- .../gatk/iterators/LocusIteratorByState.java | 533 ++++-------------- ...r.java => LegacyReservoirDownsampler.java} | 6 +- .../reads/DownsamplerBenchmark.java | 5 +- ...st.java => ReadShardBalancerUnitTest.java} | 8 +- ...> LegacyLocusIteratorByStateUnitTest.java} | 438 +++++--------- .../LocusIteratorByStateUnitTest.java | 404 +++++++++---- .../traversals/TraverseReadsUnitTest.java | 4 +- .../LegacyReservoirDownsamplerUnitTest.java | 16 +- 23 files changed, 1490 insertions(+), 1328 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java rename public/java/src/org/broadinstitute/sting/gatk/iterators/{LocusIteratorByStateExperimental.java => LegacyLocusIteratorByState.java} (61%) rename public/java/src/org/broadinstitute/sting/utils/{ReservoirDownsampler.java => LegacyReservoirDownsampler.java} (94%) rename public/java/test/org/broadinstitute/sting/gatk/datasources/reads/{ExperimentalReadShardBalancerUnitTest.java => ReadShardBalancerUnitTest.java} (97%) rename public/java/test/org/broadinstitute/sting/gatk/iterators/{LocusIteratorByStateExperimentalUnitTest.java => LegacyLocusIteratorByStateUnitTest.java} (50%) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 73bc8fba6..f26194e00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -80,11 +80,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7459d131b..f2b2dfb7d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("d20c7a143b899f0239bf64b652ad3edb")); + Arrays.asList("97df6c2a8d390d43b9bdf56c979d9b09")); executeTest("test Multiple SNP alleles", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); + Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); executeTest("test mismatched PLs", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("32f18ba50406cd8c8069ba07f2f89558")); + Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,13 +451,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("c1077662411164182c5f75478344f83d")); + Arrays.asList("092e42a712afb660ec79ff11c55933e2")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "dee6590e3b7079890bc3a9cb372c297e"); + testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b7000e0ee..1187039bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -445,13 +445,17 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // Legacy downsampler can only be selected via the command line, not via walker annotations + boolean useLegacyDownsampler = argCollection.useLegacyDownsampler; DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); - DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useLegacyDownsampler); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useLegacyDownsampler); - return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); + method.checkCompatibilityWithWalker(walker); + return method; } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -580,9 +584,9 @@ public class GenomeAnalysisEngine { throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); } - // Use the experimental ReadShardBalancer if experimental downsampling is enabled - ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ? - new ExperimentalReadShardBalancer() : + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : new ReadShardBalancer(); if(intervals == null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 28b5f918d..1a9162e51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -305,11 +305,23 @@ public class WalkerManager extends PluginManager { * Gets the type of downsampling method requested by the walker. If an alternative * downsampling method is specified on the command-line, the command-line version will * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation + * @param walker The walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useLegacyDownsampler) { + return getDownsamplingMethod(walker.getClass(), useLegacyDownsampler); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @param useLegacyDownsampler If true, use the legacy downsampling implementation + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useLegacyDownsampler) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -317,7 +329,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useLegacyDownsampler); } return downsamplingMethod; @@ -331,18 +343,6 @@ public class WalkerManager extends PluginManager { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @param useExperimentalDownsampling If true, use the experimental downsampling implementation - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { - return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); - } - /** * Create a name for this type of walker. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d0f3e91e0..d9c7c9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -162,12 +162,11 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.", required = false) public Integer downsampleCoverage = null; - @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) - @Hidden - public boolean enableExperimentalDownsampling = false; + @Argument(fullName = "use_legacy_downsampler", shortName = "use_legacy_downsampler", doc = "Use the legacy downsampling implementation instead of the newer, less-tested implementation", required = false) + public boolean useLegacyDownsampler = false; /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify @@ -178,7 +177,7 @@ public class GATKArgumentCollection { if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, useLegacyDownsampler); } /** @@ -192,7 +191,7 @@ public class GATKArgumentCollection { downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; - enableExperimentalDownsampling = method.useExperimentalDownsampling; + useLegacyDownsampler = method.useLegacyDownsampler; } // -------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index cd3403f2f..c12bce208 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -136,11 +136,12 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: - if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && - sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + // The old ALL_READS downsampling implementation -- use only if legacy downsampling was requested: + if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && + sourceInfo.getDownsamplingMethod().toCoverage != null ) { - current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + current.downsampleToCoverage(sourceInfo.getDownsamplingMethod().toCoverage); } // Indicate that the next operation will need to advance. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index 8ee7e0439..cb33c5ab8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -134,12 +134,11 @@ public class BAMScheduler implements Iterator { // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling // TODO: clean this up once the experimental downsampling engine fork collapses - if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) { - currentPosition = dataSource.getInitialReaderPositions(); + if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useLegacyDownsampler ) { + currentPosition = dataSource.getCurrentPosition(); } else { - currentPosition = dataSource.getCurrentPosition(); - + currentPosition = dataSource.getInitialReaderPositions(); } for(SAMReaderID reader: dataSource.getReaderIDs()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java deleted file mode 100644 index 0440c7eae..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.picard.util.PeekableIterator; -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. - * - * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig - * together into one monolithic FilePointer, create one persistent set of read iterators over - * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to - * fill read shards with reads. - * - * This strategy has several important advantages: - * - * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole - * contig will have regions that overlap with other FilePointers on the same contig, due - * to the limited granularity of BAM index data. By creating only one FilePointer per contig, - * we avoid having to track how much of each file region we've visited (as we did in the - * former implementation), we avoid expensive non-sequential access patterns in the files, - * and we avoid having to repeatedly re-create our iterator chain for every small region - * of interest. - * - * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single - * persistent set of read iterators (which include the downsampling iterator(s)) per contig, - * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never - * loses crucial state information while downsampling within a contig. - * - * TODO: There is also at least one important disadvantage: - * - * 1. We load more BAM index data into memory at once, and this work is done upfront before processing - * the next contig, creating a delay before traversal of each contig. This delay may be - * compensated for by the gains listed in #1 above, and we may be no worse off overall in - * terms of total runtime, but we need to verify this empirically. - * - * @author David Roazen - */ -public class ExperimentalReadShardBalancer extends ShardBalancer { - - private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class); - - /** - * Convert iterators of file pointers into balanced iterators of shards. - * @return An iterator over balanced shards. - */ - public Iterator iterator() { - return new Iterator() { - /** - * The cached shard to be returned next. Prefetched in the peekable iterator style. - */ - private Shard nextShard = null; - - /** - * The file pointer currently being processed. - */ - private FilePointer currentContigFilePointer = null; - - /** - * Iterator over the reads from the current contig's file pointer. The same iterator will be - * used to fill all shards associated with a given file pointer - */ - private PeekableIterator currentContigReadsIterator = null; - - /** - * How many FilePointers have we pulled from the filePointers iterator? - */ - private int totalFilePointersConsumed = 0; - - /** - * Have we encountered a monolithic FilePointer? - */ - private boolean encounteredMonolithicFilePointer = false; - - - { - createNextContigFilePointer(); - advance(); - } - - public boolean hasNext() { - return nextShard != null; - } - - public Shard next() { - if ( ! hasNext() ) - throw new NoSuchElementException("No next read shard available"); - Shard currentShard = nextShard; - advance(); - return currentShard; - } - - private void advance() { - nextShard = null; - - // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away - while ( nextShard == null && currentContigFilePointer != null ) { - - // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): - if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { - - // Close the old, exhausted chain of iterators to release resources - currentContigReadsIterator.close(); - - // Advance to the FilePointer for the next contig - createNextContigFilePointer(); - - // We'll need to create a fresh iterator for this file pointer when we create the first - // shard for it below. - currentContigReadsIterator = null; - } - - // At this point our currentContigReadsIterator may be null or non-null depending on whether or not - // this is our first shard for this file pointer. - if ( currentContigFilePointer != null ) { - Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - - // Create a new reads iterator only when we've just advanced to the file pointer for the next - // contig. It's essential that the iterators persist across all shards that share the same contig - // to allow the downsampling to work properly. - if ( currentContigReadsIterator == null ) { - currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); - } - - if ( currentContigReadsIterator.hasNext() ) { - shard.fill(currentContigReadsIterator); - nextShard = shard; - } - } - } - } - - /** - * Aggregate all FilePointers for the next contig together into one monolithic FilePointer - * to avoid boundary issues with visiting the same file regions more than once (since more - * granular FilePointers will have regions that overlap with other nearby FilePointers due - * to the nature of BAM indices). - * - * By creating one persistent set of iterators per contig we also avoid boundary artifacts - * in the engine-level downsampling. - * - * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for - * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely - * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should - * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for - * TODO: locus traversals). - */ - private void createNextContigFilePointer() { - currentContigFilePointer = null; - List nextContigFilePointers = new ArrayList(); - - logger.info("Loading BAM index data for next contig"); - - while ( filePointers.hasNext() ) { - - // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that - // it is the ONLY FilePointer we ever encounter - if ( encounteredMonolithicFilePointer ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); - } - if ( filePointers.peek().isMonolithic() ) { - if ( totalFilePointersConsumed > 0 ) { - throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); - } - encounteredMonolithicFilePointer = true; - logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); - } - - // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the - // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge - if ( nextContigFilePointers.isEmpty() || - (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && - nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || - (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { - - nextContigFilePointers.add(filePointers.next()); - totalFilePointersConsumed++; - } - else { - break; // next FilePointer is on a different contig or has different mapped/unmapped status, - // save it for next time - } - } - - if ( ! nextContigFilePointers.isEmpty() ) { - currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); - } - - if ( currentContigFilePointer != null ) { - logger.info("Done loading BAM index data for next contig"); - logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); - } - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - }; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java new file mode 100644 index 000000000..f5b4fba8e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LegacyReadShardBalancer.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.GATKBAMFileSpan; +import net.sf.samtools.SAMFileSpan; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +/** + * Divide up large file pointers containing reads into more manageable subcomponents. + * + * TODO: delete this class once the experimental downsampling engine fork collapses + */ +public class LegacyReadShardBalancer extends ShardBalancer { + /** + * Convert iterators of file pointers into balanced iterators of shards. + * @return An iterator over balanced shards. + */ + public Iterator iterator() { + return new Iterator() { + /** + * The cached shard to be returned next. Prefetched in the peekable iterator style. + */ + private Shard nextShard = null; + + /** + * The file pointer currently being processed. + */ + private FilePointer currentFilePointer; + + /** + * Ending position of the last shard in the file. + */ + private Map position = readsDataSource.getCurrentPosition(); + + { + if(filePointers.hasNext()) + currentFilePointer = filePointers.next(); + advance(); + } + + public boolean hasNext() { + return nextShard != null; + } + + public Shard next() { + if(!hasNext()) + throw new NoSuchElementException("No next read shard available"); + Shard currentShard = nextShard; + advance(); + return currentShard; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); + } + + private void advance() { + Map shardPosition; + nextShard = null; + + Map selectedReaders = new HashMap(); + while(selectedReaders.size() == 0 && currentFilePointer != null) { + shardPosition = currentFilePointer.fileSpans; + + for(SAMReaderID id: shardPosition.keySet()) { + SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); + selectedReaders.put(id,fileSpan); + } + + if(!isEmpty(selectedReaders)) { + Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); + readsDataSource.fillShard(shard); + + if(!shard.isBufferEmpty()) { + nextShard = shard; + break; + } + } + + selectedReaders.clear(); + currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; + } + + position = readsDataSource.getCurrentPosition(); + } + + /** + * Detects whether the list of file spans contain any read data. + * @param selectedSpans Mapping of readers to file spans. + * @return True if file spans are completely empty; false otherwise. + */ + private boolean isEmpty(Map selectedSpans) { + for(SAMFileSpan fileSpan: selectedSpans.values()) { + if(!fileSpan.isEmpty()) + return false; + } + return true; + } + }; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java index 18fafb95d..8cee535b0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, The Broad Institute + * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -24,20 +24,49 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.GATKBAMFileSpan; -import net.sf.samtools.SAMFileSpan; +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; /** - * Divide up large file pointers containing reads into more manageable subcomponents. + * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards. * - * TODO: delete this class once the experimental downsampling engine fork collapses + * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig + * together into one monolithic FilePointer, create one persistent set of read iterators over + * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to + * fill read shards with reads. + * + * This strategy has several important advantages: + * + * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole + * contig will have regions that overlap with other FilePointers on the same contig, due + * to the limited granularity of BAM index data. By creating only one FilePointer per contig, + * we avoid having to track how much of each file region we've visited (as we did in the + * former implementation), we avoid expensive non-sequential access patterns in the files, + * and we avoid having to repeatedly re-create our iterator chain for every small region + * of interest. + * + * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single + * persistent set of read iterators (which include the downsampling iterator(s)) per contig, + * the downsampling process is never interrupted by FilePointer or Shard boundaries, and never + * loses crucial state information while downsampling within a contig. + * + * TODO: There is also at least one important disadvantage: + * + * 1. We load more BAM index data into memory at once, and this work is done upfront before processing + * the next contig, creating a delay before traversal of each contig. This delay may be + * compensated for by the gains listed in #1 above, and we may be no worse off overall in + * terms of total runtime, but we need to verify this empirically. + * + * @author David Roazen */ public class ReadShardBalancer extends ShardBalancer { + + private static Logger logger = Logger.getLogger(ReadShardBalancer.class); + /** * Convert iterators of file pointers into balanced iterators of shards. * @return An iterator over balanced shards. @@ -52,16 +81,27 @@ public class ReadShardBalancer extends ShardBalancer { /** * The file pointer currently being processed. */ - private FilePointer currentFilePointer; + private FilePointer currentContigFilePointer = null; /** - * Ending position of the last shard in the file. + * Iterator over the reads from the current contig's file pointer. The same iterator will be + * used to fill all shards associated with a given file pointer */ - private Map position = readsDataSource.getCurrentPosition(); + private PeekableIterator currentContigReadsIterator = null; + + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + { - if(filePointers.hasNext()) - currentFilePointer = filePointers.next(); + createNextContigFilePointer(); advance(); } @@ -70,58 +110,117 @@ public class ReadShardBalancer extends ShardBalancer { } public Shard next() { - if(!hasNext()) + if ( ! hasNext() ) throw new NoSuchElementException("No next read shard available"); Shard currentShard = nextShard; advance(); return currentShard; } - public void remove() { - throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); - } - private void advance() { - Map shardPosition; nextShard = null; - Map selectedReaders = new HashMap(); - while(selectedReaders.size() == 0 && currentFilePointer != null) { - shardPosition = currentFilePointer.fileSpans; + // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away + while ( nextShard == null && currentContigFilePointer != null ) { - for(SAMReaderID id: shardPosition.keySet()) { - SAMFileSpan fileSpan = new GATKBAMFileSpan(shardPosition.get(id).removeContentsBefore(position.get(id))); - selectedReaders.put(id,fileSpan); + // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one): + if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) { + + // Close the old, exhausted chain of iterators to release resources + currentContigReadsIterator.close(); + + // Advance to the FilePointer for the next contig + createNextContigFilePointer(); + + // We'll need to create a fresh iterator for this file pointer when we create the first + // shard for it below. + currentContigReadsIterator = null; } - if(!isEmpty(selectedReaders)) { - Shard shard = new ReadShard(parser,readsDataSource,selectedReaders,currentFilePointer.locations,currentFilePointer.isRegionUnmapped); - readsDataSource.fillShard(shard); + // At this point our currentContigReadsIterator may be null or non-null depending on whether or not + // this is our first shard for this file pointer. + if ( currentContigFilePointer != null ) { + Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped); - if(!shard.isBufferEmpty()) { + // Create a new reads iterator only when we've just advanced to the file pointer for the next + // contig. It's essential that the iterators persist across all shards that share the same contig + // to allow the downsampling to work properly. + if ( currentContigReadsIterator == null ) { + currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard)); + } + + if ( currentContigReadsIterator.hasNext() ) { + shard.fill(currentContigReadsIterator); nextShard = shard; - break; } } - - selectedReaders.clear(); - currentFilePointer = filePointers.hasNext() ? filePointers.next() : null; } - - position = readsDataSource.getCurrentPosition(); } /** - * Detects whether the list of file spans contain any read data. - * @param selectedSpans Mapping of readers to file spans. - * @return True if file spans are completely empty; false otherwise. + * Aggregate all FilePointers for the next contig together into one monolithic FilePointer + * to avoid boundary issues with visiting the same file regions more than once (since more + * granular FilePointers will have regions that overlap with other nearby FilePointers due + * to the nature of BAM indices). + * + * By creating one persistent set of iterators per contig we also avoid boundary artifacts + * in the engine-level downsampling. + * + * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for + * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely + * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should + * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for + * TODO: locus traversals). */ - private boolean isEmpty(Map selectedSpans) { - for(SAMFileSpan fileSpan: selectedSpans.values()) { - if(!fileSpan.isEmpty()) - return false; + private void createNextContigFilePointer() { + currentContigFilePointer = null; + List nextContigFilePointers = new ArrayList(); + + logger.info("Loading BAM index data for next contig"); + + while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the + // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge + if ( nextContigFilePointers.isEmpty() || + (! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped && + nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) || + (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { + + nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; + } + else { + break; // next FilePointer is on a different contig or has different mapped/unmapped status, + // save it for next time + } } - return true; + + if ( ! nextContigFilePointers.isEmpty() ) { + currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser); + } + + if ( currentContigFilePointer != null ) { + logger.info("Done loading BAM index data for next contig"); + logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer)); + } + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from shard balancing iterator"); } }; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 88de3ac9b..e99814278 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -466,7 +466,7 @@ public class SAMDataSource { /** * Legacy method to fill the given buffering shard with reads. * - * Shard.fill() is used instead of this method when experimental downsampling is enabled + * Shard.fill() is used instead of this method unless legacy downsampling is enabled * * TODO: delete this method once the experimental downsampling engine fork collapses * @@ -638,7 +638,8 @@ public class SAMDataSource { readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), readProperties.getReadTransformers(), - readProperties.defaultBaseQualities()); + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); } private class BAMCodecIterator implements CloseableIterator { @@ -695,6 +696,7 @@ public class SAMDataSource { * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. * @param supplementalFilters additional filters to apply to the reads. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. */ protected StingSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, @@ -705,7 +707,8 @@ public class SAMDataSource { Boolean noValidationOfReadOrder, Collection supplementalFilters, List readTransformers, - byte defaultBaseQualities) { + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { // ************************************************************************************************ // // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // @@ -714,12 +717,26 @@ public class SAMDataSource { wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); + // If we're using the new downsampling implementation, apply downsampling iterators at this + // point in the read stream for most (but not all) cases + if ( ! readProperties.getDownsamplingMethod().useLegacyDownsampler ) { + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + if ( ! assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } } - // Use the old fractional downsampler only if we're not using experimental downsampling: - if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + // Use the old fractional downsampler only if we're using legacy downsampling: + // TODO: remove this statement (and associated classes) once the downsampling engine fork collapses + if ( readProperties.getDownsamplingMethod().useLegacyDownsampler && downsamplingFraction != null ) wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, @@ -741,19 +758,37 @@ public class SAMDataSource { } protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); - - return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); - return new DownsamplingReadsIterator(wrappedIterator, downsampler); + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. + + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } } return wrappedIterator; diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java index ae1d98ce0..b3f636fd6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -50,9 +50,9 @@ public class DownsamplingMethod { public final Double toFraction; /** - * Use the new experimental downsampling? + * Use the legacy downsampling implementation instead of the newer implementation? */ - public final boolean useExperimentalDownsampling; + public final boolean useLegacyDownsampler; /** * Expresses no downsampling applied at all. @@ -69,11 +69,11 @@ public class DownsamplingMethod { */ public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useLegacyDownsampler ) { this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; this.toCoverage = toCoverage; this.toFraction = toFraction; - this.useExperimentalDownsampling = useExperimentalDownsampling; + this.useLegacyDownsampler = useLegacyDownsampler; if ( type == DownsampleType.NONE ) { toCoverage = null; @@ -101,19 +101,19 @@ public class DownsamplingMethod { if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); } + } - // Some restrictions only exist for the old downsampling implementation: - if ( ! useExperimentalDownsampling ) { - // By sample downsampling does not work with a fraction of reads in the old downsampling implementation - if( type == DownsampleType.BY_SAMPLE && toFraction != null ) - throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + public void checkCompatibilityWithWalker( Walker walker ) { + boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( ! isLocusTraversal && useLegacyDownsampler && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage for read-based traversals (eg., ReadWalkers) is not supported in the legacy downsampling implementation. " + + "The newer downsampling implementation does not have this limitation."); } - // Some restrictions only exist for the new downsampling implementation: - if ( useExperimentalDownsampling ) { - if ( type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); - } + if ( isLocusTraversal && ! useLegacyDownsampler && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not yet supported in the new downsampling implementation (though it is supported for ReadWalkers). " + + "You can run with --use_legacy_downsampler for a broken and poorly-maintained implementation of ALL_READS to-coverage downsampling, but this is not recommended."); } } @@ -124,30 +124,34 @@ public class DownsamplingMethod { builder.append("No downsampling"); } else { - builder.append(String.format("Method: %s ", type)); + builder.append(String.format("Method: %s, ", type)); if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d ", toCoverage)); + builder.append(String.format("Target Coverage: %d, ", toCoverage)); } else { - builder.append(String.format("Target Fraction: %.2f ", toFraction)); + builder.append(String.format("Target Fraction: %.2f, ", toFraction)); } - if ( useExperimentalDownsampling ) { - builder.append("Using Experimental Downsampling"); + if ( useLegacyDownsampler ) { + builder.append("Using the legacy downsampling implementation"); + } + else { + builder.append("Using the new downsampling implementation"); } } return builder.toString(); } - public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useLegacyDownsampler ) { if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, - null, useExperimentalDownsampling); + null, useLegacyDownsampler); } else { - return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + // Downsampling is off by default for non-locus-based traversals + return new DownsamplingMethod(DownsampleType.NONE, null, null, useLegacyDownsampler); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java new file mode 100644 index 000000000..008ffde3b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PassThroughDownsampler.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no + * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. + * Useful for situations where you want to disable downsampling, but still need to use + * the downsampler interface. + * + * @author David Roazen + */ +public class PassThroughDownsampler implements ReadsDownsampler { + + private ArrayList selectedReads; + + public PassThroughDownsampler() { + clear(); + } + + public void submit( T newRead ) { + // All reads pass-through, no reads get downsampled + selectedReads.add(newRead); + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return selectedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List downsampledItems = selectedReads; + clear(); + return downsampledItems; + } + + public boolean hasPendingItems() { + return false; + } + + public T peekFinalized() { + return selectedReads.isEmpty() ? null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return 0; + } + + public void signalEndOfInput() { + // NO-OP + } + + public void clear() { + selectedReads = new ArrayList(); + } + + public void reset() { + // NO-OP + } + + public boolean requiresCoordinateSortOrder() { + return false; + } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 6c0dc9769..735a4dce5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,9 +4,9 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -83,17 +83,18 @@ public class WindowMaker implements Iterable, I this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: - this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? - new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + // Use the legacy version of LocusIteratorByState if legacy downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ? + new PeekableIterator(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)) : - new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } public Iterator iterator() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java similarity index 61% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java index 557cbd009..5c833de4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java @@ -31,14 +31,14 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.Downsampler; -import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.LegacyReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -50,11 +50,11 @@ import java.util.*; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis */ -public class LocusIteratorByStateExperimental extends LocusIterator { +public class LegacyLocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +69,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - protected static class SAMRecordState { + static private class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,7 +213,6 @@ public class LocusIteratorByStateExperimental extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; - private boolean performLevelingDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -221,15 +220,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + public LegacyLocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator); - - this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; + this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -290,13 +285,11 @@ public class LocusIteratorByStateExperimental extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); - - // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; - for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); + hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -405,20 +398,34 @@ public class LocusIteratorByStateExperimental extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - protected class ReadStateManager { + private class ReadStateManager { private final PeekableIterator iterator; + private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); + private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source) { + public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { this.iterator = new PeekableIterator(source); - - for (final String sample : samples) { - readStatesBySample.put(sample, new PerSampleReadStateManager()); + this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; + switch (this.downsamplingMethod.type) { + case BY_SAMPLE: + if (downsamplingMethod.toCoverage == null) + throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); + this.targetCoverage = downsamplingMethod.toCoverage; + break; + default: + this.targetCoverage = Integer.MAX_VALUE; } - samplePartitioner = new SamplePartitioner(); + Map readSelectors = new HashMap(); + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); + } + + samplePartitioner = new SamplePartitioner(readSelectors); } /** @@ -442,6 +449,7 @@ public class LocusIteratorByStateExperimental extends LocusIterator { public void remove() { wrappedIterator.remove(); + totalReadStates--; } }; } @@ -469,6 +477,17 @@ public class LocusIteratorByStateExperimental extends LocusIterator { return readStatesBySample.get(sample).size(); } + /** + * The extent of downsampling; basically, the furthest base out which has 'fallen + * victim' to the downsampler. + * + * @param sample Sample, downsampled independently. + * @return Integer stop of the furthest undownsampled region. + */ + public int getDownsamplingExtent(final String sample) { + return readStatesBySample.get(sample).getDownsamplingExtent(); + } + public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -501,13 +520,61 @@ public class LocusIteratorByStateExperimental extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } + samplePartitioner.complete(); for (final String sample : samples) { - Collection newReads = samplePartitioner.getReadsForSample(sample); - PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - addReadsToSample(statesBySample, newReads); - } + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); + Collection newReads = new ArrayList(aggregator.getSelectedReads()); + + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + int numReads = statesBySample.size(); + int downsamplingExtent = aggregator.getDownsamplingExtent(); + + if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { + long readLimit = aggregator.getNumReadsSeen(); + addReadsToSample(statesBySample, newReads, readLimit); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } else { + int[] counts = statesBySample.getCountsPerAlignmentStart(); + int[] updatedCounts = new int[counts.length]; + System.arraycopy(counts, 0, updatedCounts, 0, counts.length); + + boolean readPruned = true; + while (numReads + newReads.size() > targetCoverage && readPruned) { + readPruned = false; + for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { + if (updatedCounts[alignmentStart] > 1) { + updatedCounts[alignmentStart]--; + numReads--; + readPruned = true; + } + } + } + + if (numReads == targetCoverage) { + updatedCounts[0]--; + numReads--; + } + + BitSet toPurge = new BitSet(readStates.size()); + int readOffset = 0; + + for (int i = 0; i < updatedCounts.length; i++) { + int n = counts[i]; + int k = updatedCounts[i]; + + for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) + toPurge.set(readOffset + purgedElement); + + readOffset += counts[i]; + } + downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); + + addReadsToSample(statesBySample, newReads, targetCoverage - numReads); + statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); + } + } samplePartitioner.reset(); } @@ -516,134 +583,380 @@ public class LocusIteratorByStateExperimental extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. + * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - + int readCount = 0; for (SAMRecord read : reads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); + if (readCount < maxReads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + readCount++; + } } - readStates.addStatesAtNextAlignmentStart(newReadStates); } - protected class PerSampleReadStateManager implements Iterable { - private List> readStatesByAlignmentStart = new LinkedList>(); - private int thisSampleReadStates = 0; - private Downsampler> levelingDownsampler = - performLevelingDownsampling ? - new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : - null; + private class PerSampleReadStateManager implements Iterable { + private final Queue readStates = new LinkedList(); + private final Deque readStateCounter = new LinkedList(); + private int downsamplingExtent = 0; public void addStatesAtNextAlignmentStart(Collection states) { - if ( states.isEmpty() ) { - return; - } - - readStatesByAlignmentStart.add(new LinkedList(states)); - thisSampleReadStates += states.size(); + readStates.addAll(states); + readStateCounter.add(new Counter(states.size())); totalReadStates += states.size(); - - if ( levelingDownsampler != null ) { - levelingDownsampler.submit(readStatesByAlignmentStart); - levelingDownsampler.signalEndOfInput(); - - thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); - levelingDownsampler.reset(); - } } public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); + return readStates.isEmpty(); } public SAMRecordState peek() { - return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + return readStates.peek(); } public int size() { - return thisSampleReadStates; + return readStates.size(); + } + + public void specifyNewDownsamplingExtent(int downsamplingExtent) { + this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public int[] getCountsPerAlignmentStart() { + int[] counts = new int[readStateCounter.size()]; + int index = 0; + for (Counter counter : readStateCounter) + counts[index++] = counter.getCount(); + return counts; } public Iterator iterator() { return new Iterator() { - private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); - private LinkedList currentPositionReadStates = null; - private Iterator currentPositionReadStatesIterator = null; + private Iterator wrappedIterator = readStates.iterator(); public boolean hasNext() { - return alignmentStartIterator.hasNext() || - (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + return wrappedIterator.hasNext(); } public SAMRecordState next() { - if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { - currentPositionReadStates = alignmentStartIterator.next(); - currentPositionReadStatesIterator = currentPositionReadStates.iterator(); - } - - return currentPositionReadStatesIterator.next(); + return wrappedIterator.next(); } public void remove() { - currentPositionReadStatesIterator.remove(); - thisSampleReadStates--; - totalReadStates--; - - if ( currentPositionReadStates.isEmpty() ) { - alignmentStartIterator.remove(); - } + wrappedIterator.remove(); + Counter counter = readStateCounter.peek(); + counter.decrement(); + if (counter.getCount() == 0) + readStateCounter.remove(); } }; } + + /** + * Purge the given elements from the bitset. If an element in the bitset is true, purge + * the corresponding read state. + * + * @param elements bits from the set to purge. + * @return the extent of the final downsampled read. + */ + public int purge(final BitSet elements) { + int downsamplingExtent = 0; + + if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + + Iterator readStateIterator = readStates.iterator(); + + Iterator counterIterator = readStateCounter.iterator(); + Counter currentCounter = counterIterator.next(); + + int readIndex = 0; + long alignmentStartCounter = currentCounter.getCount(); + + int toPurge = elements.nextSetBit(0); + int removedCount = 0; + + while (readStateIterator.hasNext() && toPurge >= 0) { + SAMRecordState state = readStateIterator.next(); + downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + + if (readIndex == toPurge) { + readStateIterator.remove(); + currentCounter.decrement(); + if (currentCounter.getCount() == 0) + counterIterator.remove(); + removedCount++; + toPurge = elements.nextSetBit(toPurge + 1); + } + + readIndex++; + alignmentStartCounter--; + if (alignmentStartCounter == 0 && counterIterator.hasNext()) { + currentCounter = counterIterator.next(); + alignmentStartCounter = currentCounter.getCount(); + } + } + + totalReadStates -= removedCount; + + return downsamplingExtent; + } } } /** - * Note: stores reads by sample ID string, not by sample object + * Note: assuming that, whenever we downsample, we downsample to an integer capacity. */ - private class SamplePartitioner { - private Map> readsBySample; - private long readsSeen = 0; + static private class Counter { + private int count; - public SamplePartitioner() { - readsBySample = new HashMap>(); - - for ( String sample : samples ) { - readsBySample.put(sample, new ArrayList()); - } + public Counter(int count) { + this.count = count; } - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).add(read); - readsSeen++; + public int getCount() { + return count; } - public long getNumReadsSeen() { - return readsSeen; - } - - public Collection getReadsForSample(String sampleName) { - if ( ! readsBySample.containsKey(sampleName) ) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for ( Collection perSampleReads : readsBySample.values() ) - perSampleReads.clear(); - readsSeen = 0; + public void decrement() { + count--; } } -} \ No newline at end of file +} + +/** + * Selects reads passed to it based on a criteria decided through inheritance. + * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. + */ +interface ReadSelector { + /** + * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * + * @param read the read to evaluate. + */ + public void submitRead(SAMRecord read); + + /** + * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * + * @param read the read previously rejected. + */ + public void notifyReadRejected(SAMRecord read); + + /** + * Signal the selector that read additions are complete. + */ + public void complete(); + + /** + * Retrieve the number of reads seen by this selector so far. + * + * @return number of reads seen. + */ + public long getNumReadsSeen(); + + /** + * Return the number of reads accepted by this selector so far. + * + * @return number of reads selected. + */ + public long getNumReadsSelected(); + + /** + * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the + * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at + * position 3 whose cigar string is 76M, the value of this parameter will be 78. + * + * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. + */ + public int getDownsamplingExtent(); + + /** + * Get the reads selected by this selector. + * + * @return collection of reads selected by this selector. + */ + public Collection getSelectedReads(); + + /** + * Reset this collection to its pre-gathered state. + */ + public void reset(); +} + +/** + * Select every read passed in. + */ +class AllReadsSelector implements ReadSelector { + private Collection reads = new LinkedList(); + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public void submitRead(SAMRecord read) { + reads.add(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reads; + } + + public void reset() { + reads.clear(); + readsSeen = 0; + downsamplingExtent = 0; + } +} + + +/** + * Select N reads randomly from the input stream. + */ +class NRandomReadSelector implements ReadSelector { + private final LegacyReservoirDownsampler reservoir; + private final ReadSelector chainedSelector; + private long readsSeen = 0; + private int downsamplingExtent = 0; + + public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { + this.reservoir = new LegacyReservoirDownsampler((int) readLimit); + this.chainedSelector = chainedSelector; + } + + public void submitRead(SAMRecord read) { + SAMRecord displaced = reservoir.add(read); + if (displaced != null && chainedSelector != null) { + chainedSelector.notifyReadRejected(read); + downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); + } + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + for (SAMRecord read : reservoir.getDownsampledContents()) + chainedSelector.submitRead(read); + if (chainedSelector != null) + chainedSelector.complete(); + } + + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return reservoir.size(); + } + + public int getDownsamplingExtent() { + return downsamplingExtent; + } + + public Collection getSelectedReads() { + return reservoir.getDownsampledContents(); + } + + public void reset() { + reservoir.clear(); + downsamplingExtent = 0; + if (chainedSelector != null) + chainedSelector.reset(); + } +} + +/** + * Note: stores reads by sample ID string, not by sample object + */ +class SamplePartitioner implements ReadSelector { + private final Map readsBySample; + private long readsSeen = 0; + + public SamplePartitioner(Map readSelectors) { + readsBySample = readSelectors; + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submitRead(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public int getDownsamplingExtent() { + int downsamplingExtent = 0; + for (ReadSelector storage : readsBySample.values()) + downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); + return downsamplingExtent; + } + + public Collection getSelectedReads() { + throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); + } + + public ReadSelector getSelectedReads(String sampleName) { + if (!readsBySample.containsKey(sampleName)) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for (ReadSelector storage : readsBySample.values()) + storage.reset(); + readsSeen = 0; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 46e84798a..4f8d7d3f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,14 +31,11 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.ReservoirDownsampler; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -54,7 +51,7 @@ public class LocusIteratorByState extends LocusIterator { /** * our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class); // ----------------------------------------------------------------------------------------------------------------- // @@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator { private final ArrayList samples; private final ReadStateManager readStates; - static private class SAMRecordState { + protected static class SAMRecordState { SAMRecord read; int readOffset = -1; // how far are we offset from the start of the read bases? int genomeOffset = -1; // how far are we offset from the alignment start on the genome? @@ -213,6 +210,7 @@ public class LocusIteratorByState extends LocusIterator { //final boolean DEBUG2 = false && DEBUG; private ReadProperties readInfo; private AlignmentContext nextAlignmentContext; + private boolean performDownsampling; // ----------------------------------------------------------------------------------------------------------------- // @@ -224,7 +222,18 @@ public class LocusIteratorByState extends LocusIterator { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod()); + + // LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're + // downsampling to coverage by sample. SAMDataSource will have refrained from applying + // any downsamplers to the read stream in this case, in the expectation that LIBS will + // manage the downsampling. The reason for this is twofold: performance (don't have to + // split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling + // of reads (eg., using half of a read, and throwing the rest away). + this.performDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + this.readStates = new ReadStateManager(samIterator); // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when // there's no read data. So we need to throw this error only when samIterator.hasNext() is true @@ -285,11 +294,13 @@ public class LocusIteratorByState extends LocusIterator { final GenomeLoc location = getLocation(); final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? boolean hasBeenSampled = false; + for (final String sample : samples) { final Iterator iterator = readStates.iterator(sample); final List pile = new ArrayList(readStates.size(sample)); - hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); int size = 0; // number of elements in this sample's pileup int nDeletions = 0; // number of deletions in this sample's pileup @@ -398,34 +409,20 @@ public class LocusIteratorByState extends LocusIterator { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - private class ReadStateManager { + protected class ReadStateManager { private final PeekableIterator iterator; - private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; private final Map readStatesBySample = new HashMap(); - private final int targetCoverage; private int totalReadStates = 0; - public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) { + public ReadStateManager(Iterator source) { this.iterator = new PeekableIterator(source); - this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE; - switch (this.downsamplingMethod.type) { - case BY_SAMPLE: - if (downsamplingMethod.toCoverage == null) - throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample"); - this.targetCoverage = downsamplingMethod.toCoverage; - break; - default: - this.targetCoverage = Integer.MAX_VALUE; - } - Map readSelectors = new HashMap(); for (final String sample : samples) { readStatesBySample.put(sample, new PerSampleReadStateManager()); - readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector()); } - samplePartitioner = new SamplePartitioner(readSelectors); + samplePartitioner = new SamplePartitioner(performDownsampling); } /** @@ -449,7 +446,6 @@ public class LocusIteratorByState extends LocusIterator { public void remove() { wrappedIterator.remove(); - totalReadStates--; } }; } @@ -477,17 +473,6 @@ public class LocusIteratorByState extends LocusIterator { return readStatesBySample.get(sample).size(); } - /** - * The extent of downsampling; basically, the furthest base out which has 'fallen - * victim' to the downsampler. - * - * @param sample Sample, downsampled independently. - * @return Integer stop of the furthest undownsampled region. - */ - public int getDownsamplingExtent(final String sample) { - return readStatesBySample.get(sample).getDownsamplingExtent(); - } - public SAMRecordState getFirst() { for (final String sample : samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); @@ -520,61 +505,15 @@ public class LocusIteratorByState extends LocusIterator { samplePartitioner.submitRead(iterator.next()); } } - samplePartitioner.complete(); + + samplePartitioner.doneSubmittingReads(); for (final String sample : samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); - - Collection newReads = new ArrayList(aggregator.getSelectedReads()); - + Collection newReads = samplePartitioner.getReadsForSample(sample); PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); - int numReads = statesBySample.size(); - int downsamplingExtent = aggregator.getDownsamplingExtent(); - - if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) { - long readLimit = aggregator.getNumReadsSeen(); - addReadsToSample(statesBySample, newReads, readLimit); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } else { - int[] counts = statesBySample.getCountsPerAlignmentStart(); - int[] updatedCounts = new int[counts.length]; - System.arraycopy(counts, 0, updatedCounts, 0, counts.length); - - boolean readPruned = true; - while (numReads + newReads.size() > targetCoverage && readPruned) { - readPruned = false; - for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) { - if (updatedCounts[alignmentStart] > 1) { - updatedCounts[alignmentStart]--; - numReads--; - readPruned = true; - } - } - } - - if (numReads == targetCoverage) { - updatedCounts[0]--; - numReads--; - } - - BitSet toPurge = new BitSet(readStates.size()); - int readOffset = 0; - - for (int i = 0; i < updatedCounts.length; i++) { - int n = counts[i]; - int k = updatedCounts[i]; - - for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k)) - toPurge.set(readOffset + purgedElement); - - readOffset += counts[i]; - } - downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge)); - - addReadsToSample(statesBySample, newReads, targetCoverage - numReads); - statesBySample.specifyNewDownsamplingExtent(downsamplingExtent); - } + addReadsToSample(statesBySample, newReads); } + samplePartitioner.reset(); } @@ -583,380 +522,140 @@ public class LocusIteratorByState extends LocusIterator { * * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. - * @param maxReads Maximum number of reads to add. */ - private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) { + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { if (reads.isEmpty()) return; Collection newReadStates = new LinkedList(); - int readCount = 0; + for (SAMRecord read : reads) { - if (readCount < maxReads) { - SAMRecordState state = new SAMRecordState(read); - state.stepForwardOnGenome(); - newReadStates.add(state); - readCount++; - } + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); } + readStates.addStatesAtNextAlignmentStart(newReadStates); } - private class PerSampleReadStateManager implements Iterable { - private final Queue readStates = new LinkedList(); - private final Deque readStateCounter = new LinkedList(); - private int downsamplingExtent = 0; + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performDownsampling ? + new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; public void addStatesAtNextAlignmentStart(Collection states) { - readStates.addAll(states); - readStateCounter.add(new Counter(states.size())); + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } } public boolean isEmpty() { - return readStates.isEmpty(); + return readStatesByAlignmentStart.isEmpty(); } public SAMRecordState peek() { - return readStates.peek(); + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); } public int size() { - return readStates.size(); - } - - public void specifyNewDownsamplingExtent(int downsamplingExtent) { - this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public int[] getCountsPerAlignmentStart() { - int[] counts = new int[readStateCounter.size()]; - int index = 0; - for (Counter counter : readStateCounter) - counts[index++] = counter.getCount(); - return counts; + return thisSampleReadStates; } public Iterator iterator() { return new Iterator() { - private Iterator wrappedIterator = readStates.iterator(); + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; public boolean hasNext() { - return wrappedIterator.hasNext(); + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); } public SAMRecordState next() { - return wrappedIterator.next(); + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); } public void remove() { - wrappedIterator.remove(); - Counter counter = readStateCounter.peek(); - counter.decrement(); - if (counter.getCount() == 0) - readStateCounter.remove(); + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } } }; } + } + } - /** - * Purge the given elements from the bitset. If an element in the bitset is true, purge - * the corresponding read state. - * - * @param elements bits from the set to purge. - * @return the extent of the final downsampled read. - */ - public int purge(final BitSet elements) { - int downsamplingExtent = 0; + /** + * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler. + * + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; - if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent; + public SamplePartitioner( boolean downsampleReads ) { + readsBySample = new HashMap>(); - Iterator readStateIterator = readStates.iterator(); + for ( String sample : samples ) { + readsBySample.put(sample, + downsampleReads ? new ReservoirDownsampler(readInfo.getDownsamplingMethod().toCoverage) : + new PassThroughDownsampler()); + } + } - Iterator counterIterator = readStateCounter.iterator(); - Counter currentCounter = counterIterator.next(); + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submit(read); + } - int readIndex = 0; - long alignmentStartCounter = currentCounter.getCount(); + public void doneSubmittingReads() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().signalEndOfInput(); + } + } - int toPurge = elements.nextSetBit(0); - int removedCount = 0; + public Collection getReadsForSample(String sampleName) { + if ( ! readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); - while (readStateIterator.hasNext() && toPurge >= 0) { - SAMRecordState state = readStateIterator.next(); - downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd()); + return readsBySample.get(sampleName).consumeFinalizedItems(); + } - if (readIndex == toPurge) { - readStateIterator.remove(); - currentCounter.decrement(); - if (currentCounter.getCount() == 0) - counterIterator.remove(); - removedCount++; - toPurge = elements.nextSetBit(toPurge + 1); - } - - readIndex++; - alignmentStartCounter--; - if (alignmentStartCounter == 0 && counterIterator.hasNext()) { - currentCounter = counterIterator.next(); - alignmentStartCounter = currentCounter.getCount(); - } - } - - totalReadStates -= removedCount; - - return downsamplingExtent; + public void reset() { + for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) { + perSampleReads.getValue().clear(); + perSampleReads.getValue().reset(); } } } - - /** - * Note: assuming that, whenever we downsample, we downsample to an integer capacity. - */ - static private class Counter { - private int count; - - public Counter(int count) { - this.count = count; - } - - public int getCount() { - return count; - } - - public void decrement() { - count--; - } - } -} - -/** - * Selects reads passed to it based on a criteria decided through inheritance. - * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this. - */ -interface ReadSelector { - /** - * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. - * - * @param read the read to evaluate. - */ - public void submitRead(SAMRecord read); - - /** - * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. - * - * @param read the read previously rejected. - */ - public void notifyReadRejected(SAMRecord read); - - /** - * Signal the selector that read additions are complete. - */ - public void complete(); - - /** - * Retrieve the number of reads seen by this selector so far. - * - * @return number of reads seen. - */ - public long getNumReadsSeen(); - - /** - * Return the number of reads accepted by this selector so far. - * - * @return number of reads selected. - */ - public long getNumReadsSelected(); - - /** - * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the - * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at - * position 3 whose cigar string is 76M, the value of this parameter will be 78. - * - * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0. - */ - public int getDownsamplingExtent(); - - /** - * Get the reads selected by this selector. - * - * @return collection of reads selected by this selector. - */ - public Collection getSelectedReads(); - - /** - * Reset this collection to its pre-gathered state. - */ - public void reset(); -} - -/** - * Select every read passed in. - */ -class AllReadsSelector implements ReadSelector { - private Collection reads = new LinkedList(); - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public void submitRead(SAMRecord read) { - reads.add(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reads; - } - - public void reset() { - reads.clear(); - readsSeen = 0; - downsamplingExtent = 0; - } -} - - -/** - * Select N reads randomly from the input stream. - */ -class NRandomReadSelector implements ReadSelector { - private final ReservoirDownsampler reservoir; - private final ReadSelector chainedSelector; - private long readsSeen = 0; - private int downsamplingExtent = 0; - - public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { - this.reservoir = new ReservoirDownsampler((int) readLimit); - this.chainedSelector = chainedSelector; - } - - public void submitRead(SAMRecord read) { - SAMRecord displaced = reservoir.add(read); - if (displaced != null && chainedSelector != null) { - chainedSelector.notifyReadRejected(read); - downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd()); - } - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - readsSeen++; - } - - public void complete() { - for (SAMRecord read : reservoir.getDownsampledContents()) - chainedSelector.submitRead(read); - if (chainedSelector != null) - chainedSelector.complete(); - } - - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return reservoir.size(); - } - - public int getDownsamplingExtent() { - return downsamplingExtent; - } - - public Collection getSelectedReads() { - return reservoir.getDownsampledContents(); - } - - public void reset() { - reservoir.clear(); - downsamplingExtent = 0; - if (chainedSelector != null) - chainedSelector.reset(); - } -} - -/** - * Note: stores reads by sample ID string, not by sample object - */ -class SamplePartitioner implements ReadSelector { - private final Map readsBySample; - private long readsSeen = 0; - - public SamplePartitioner(Map readSelectors) { - readsBySample = readSelectors; - } - - public void submitRead(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).submitRead(read); - readsSeen++; - } - - public void notifyReadRejected(SAMRecord read) { - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - if (readsBySample.containsKey(sampleName)) - readsBySample.get(sampleName).notifyReadRejected(read); - readsSeen++; - } - - public void complete() { - // NO-OP. - } - - public long getNumReadsSeen() { - return readsSeen; - } - - public long getNumReadsSelected() { - return readsSeen; - } - - public int getDownsamplingExtent() { - int downsamplingExtent = 0; - for (ReadSelector storage : readsBySample.values()) - downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent()); - return downsamplingExtent; - } - - public Collection getSelectedReads() { - throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); - } - - public ReadSelector getSelectedReads(String sampleName) { - if (!readsBySample.containsKey(sampleName)) - throw new NoSuchElementException("Sample name not found"); - return readsBySample.get(sampleName); - } - - public void reset() { - for (ReadSelector storage : readsBySample.values()) - storage.reset(); - readsSeen = 0; - } - -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java rename to public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java index a758df431..ba863310d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/utils/LegacyReservoirDownsampler.java @@ -8,6 +8,8 @@ import java.util.Collection; import java.util.Iterator; /** + * THIS IMPLEMENTATION IS BROKEN AND WILL BE REMOVED ONCE THE DOWNSAMPLING ENGINE FORK COLLAPSES + * * Randomly downsample from a stream of elements. This algorithm is a direct, * naive implementation of reservoir downsampling as described in "Random Downsampling * with a Reservoir" (Vitter 1985). At time of writing, this paper is located here: @@ -16,7 +18,7 @@ import java.util.Iterator; * @author mhanna * @version 0.1 */ -public class ReservoirDownsampler { +public class LegacyReservoirDownsampler { /** * The reservoir of elements tracked by this downsampler. */ @@ -31,7 +33,7 @@ public class ReservoirDownsampler { * Create a new downsampler with the given source iterator and given comparator. * @param maxElements What is the maximum number of reads that can be returned in any call of this */ - public ReservoirDownsampler(final int maxElements) { + public LegacyReservoirDownsampler(final int maxElements) { if(maxElements < 0) throw new ReviewedStingException("Unable to work with an negative size collection of elements"); this.reservoir = new ArrayList(maxElements); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 61c1c51b4..8186f69cf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -32,11 +32,10 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -85,7 +84,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java similarity index 97% rename from public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java index 0807f36dc..73664141f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancerUnitTest.java @@ -45,10 +45,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -public class ExperimentalReadShardBalancerUnitTest extends BaseTest { +public class ReadShardBalancerUnitTest extends BaseTest { /** - * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries * at inappropriate places, such as within an alignment start position */ private static class ExperimentalReadShardBalancerTest extends TestDataProvider { @@ -74,7 +74,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { this.stackSize = stackSize; this.numUnmappedReads = numUnmappedReads; - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, false); this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", @@ -96,7 +96,7 @@ public class ExperimentalReadShardBalancerUnitTest extends BaseTest { new ArrayList(), false); - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); SAMRecord readAtEndOfLastShard = null; int totalReadsSeen = 0; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java index a49a602c6..44d182661 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByStateUnitTest.java @@ -7,10 +7,8 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -21,14 +19,17 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** - * testing of the experimental version of LocusIteratorByState + * testing of the LEGACY version of LocusIteratorByState */ -public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { +public class LegacyLocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; - private LocusIteratorByStateExperimental li; + private LegacyLocusIteratorByState li; private GenomeLocParser genomeLocParser; @BeforeClass @@ -37,8 +38,8 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { - return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + private LegacyLocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LegacyLocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @Test @@ -328,184 +329,14 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// - - /////////////////////////////////////// - // Read State Manager Tests // - /////////////////////////////////////// - - private class PerSampleReadStateManagerTest extends TestDataProvider { - private List readCountsPerAlignmentStart; - private List reads; - private List> recordStatesByAlignmentStart; - private int removalInterval; - - public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { - super(PerSampleReadStateManagerTest.class); - - this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; - this.removalInterval = removalInterval; - - reads = new ArrayList(); - recordStatesByAlignmentStart = new ArrayList>(); - - setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", - getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); - } - - public void run() { - LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); - LocusIteratorByStateExperimental.ReadStateManager readStateManager = - libs.new ReadStateManager(new ArrayList().iterator()); - LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = - readStateManager.new PerSampleReadStateManager(); - - makeReads(); - - for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { - perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); - } - - // read state manager should have the right number of reads - Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); - - Iterator originalReadsIterator = reads.iterator(); - Iterator recordStateIterator = perSampleReadStateManager.iterator(); - int recordStateCount = 0; - int numReadStatesRemoved = 0; - - // Do a first-pass validation of the record state iteration by making sure we get back everything we - // put in, in the same order, doing any requested removals of read states along the way - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - recordStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - SAMRecord originalRead = originalReadsIterator.next(); - - // The read we get back should be literally the same read in memory as we put in - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - - // If requested, remove a read state every removalInterval states - if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { - recordStateIterator.remove(); - numReadStatesRemoved++; - } - } - - Assert.assertFalse(originalReadsIterator.hasNext()); - - // If we removed any read states, do a second pass through the read states to make sure the right - // states were removed - if ( numReadStatesRemoved > 0 ) { - Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); - - originalReadsIterator = reads.iterator(); - recordStateIterator = perSampleReadStateManager.iterator(); - int readCount = 0; - int readStateCount = 0; - - // Match record states with the reads that should remain after removal - while ( recordStateIterator.hasNext() ) { - LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); - readStateCount++; - SAMRecord readFromPerSampleReadStateManager = readState.getRead(); - - Assert.assertTrue(originalReadsIterator.hasNext()); - - SAMRecord originalRead = originalReadsIterator.next(); - readCount++; - - if ( readCount % removalInterval == 0 ) { - originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded - readCount++; - } - - // The read we get back should be literally the same read in memory as we put in (after accounting for removals) - Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); - } - - Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); - } - - // Allow memory used by this test to be reclaimed - readCountsPerAlignmentStart = null; - reads = null; - recordStatesByAlignmentStart = null; - } - - private void makeReads() { - int alignmentStart = 1; - - for ( int readsThisStack : readCountsPerAlignmentStart ) { - ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); - ArrayList stackRecordStates = new ArrayList(); - - for ( SAMRecord read : stackReads ) { - stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); - } - - reads.addAll(stackReads); - recordStatesByAlignmentStart.add(stackRecordStates); - } - } - } - - @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") - public Object[][] createPerSampleReadStateManagerTests() { - for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), - Arrays.asList(2), - Arrays.asList(10), - Arrays.asList(1, 1), - Arrays.asList(2, 2), - Arrays.asList(10, 10), - Arrays.asList(1, 10), - Arrays.asList(10, 1), - Arrays.asList(1, 1, 1), - Arrays.asList(2, 2, 2), - Arrays.asList(10, 10, 10), - Arrays.asList(1, 1, 1, 1, 1, 1), - Arrays.asList(10, 10, 10, 10, 10, 10), - Arrays.asList(1, 2, 10, 1, 2, 10) - ) ) { - - for ( int removalInterval : Arrays.asList(0, 2, 3) ) { - new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); - } - } - - return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); - } - - @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") - public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } - - /////////////////////////////////////// - // End Read State Manager Tests // - /////////////////////////////////////// - - - - /////////////////////////////////////// - // Helper methods / classes // - /////////////////////////////////////// - private static ReadProperties createTestReadProperties() { - return createTestReadProperties(null); - } - - private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - downsamplingMethod, + null, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -513,136 +344,137 @@ public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { (byte) -1 ); } +} - private static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; +class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() {} - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } + public FakeCloseableIterator(Iterator it) { + iterator = it; } - private static final class LIBS_position { + @Override + public void close() {} - SAMRecord read; + @Override + public boolean hasNext() { + return iterator.hasNext(); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; + @Override + public T next() { + return iterator.next(); + } - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) - return false; - - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } - - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) - break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; - - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); - } - - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); - - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; - } + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } +} + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 83913fa76..4e4126ad4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -7,8 +7,10 @@ import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,13 +21,10 @@ import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; +import java.util.*; /** - * testing of the LocusIteratorByState + * testing of the new (non-legacy) version of LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { private static SAMFileHeader header; @@ -329,14 +328,184 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // End comprehensive LIBS/PileupElement tests // //////////////////////////////////////////////// + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByState libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByState.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByState.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, - null, + downsamplingMethod, new ValidationExclusion(), Collections.emptyList(), Collections.emptyList(), @@ -344,137 +513,136 @@ public class LocusIteratorByStateUnitTest extends BaseTest { (byte) -1 ); } -} -class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; - public FakeCloseableIterator(Iterator it) { - iterator = it; + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } } - @Override - public void close() {} + private static final class LIBS_position { - @Override - public boolean hasNext() { - return iterator.hasNext(); - } + SAMRecord read; - @Override - public T next() { - return iterator.next(); - } + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } -} + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + boolean sawMop = false; -final class LIBS_position { + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } - SAMRecord read; + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } - final int numOperators; - int currentOperatorIndex = 0; - int currentPositionOnOperator = 0; - int currentReadOffset = 0; - - boolean isBeforeDeletionStart = false; - boolean isBeforeDeletedBase = false; - boolean isAfterDeletionEnd = false; - boolean isAfterDeletedBase = false; - boolean isBeforeInsertion = false; - boolean isAfterInsertion = false; - boolean isNextToSoftClip = false; - - boolean sawMop = false; - - public LIBS_position(final SAMRecord read) { - this.read = read; - numOperators = read.getCigar().numCigarElements(); - } - - public int getCurrentReadOffset() { - return Math.max(0, currentReadOffset - 1); - } - - /** - * Steps forward on the genome. Returns false when done reading the read, true otherwise. - */ - public boolean stepForwardOnGenome() { - if ( currentOperatorIndex == numOperators ) - return false; - - CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); - if ( currentPositionOnOperator >= curElement.getLength() ) { - if ( ++currentOperatorIndex == numOperators ) + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) return false; - curElement = read.getCigar().getCigarElement(currentOperatorIndex); - currentPositionOnOperator = 0; - } + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; - switch ( curElement.getOperator() ) { - case I: // insertion w.r.t. the reference - if ( !sawMop ) + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; break; - case S: // soft clip - currentReadOffset += curElement.getLength(); - case H: // hard clip - case P: // padding - currentOperatorIndex++; - return stepForwardOnGenome(); - case D: // deletion w.r.t. the reference - case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - currentPositionOnOperator++; - break; + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } - case M: - case EQ: - case X: - sawMop = true; - currentReadOffset++; - currentPositionOnOperator++; - break; - default: - throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; } - final boolean isFirstOp = currentOperatorIndex == 0; - final boolean isLastOp = currentOperatorIndex == numOperators - 1; - final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; - final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } - isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); - isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); - isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); - isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); - isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) - || (!sawMop && curElement.getOperator() == CigarOperator.I); - isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); - isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) - || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); - - return true; - } - - private static boolean isBeforeOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isLastOp, - final boolean isLastBaseOfOp) { - return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; - } - - private static boolean isAfterOp(final Cigar cigar, - final int currentOperatorIndex, - final CigarOperator op, - final boolean isFirstOp, - final boolean isFirstBaseOfOp) { - return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index bf1fc9e65..d9af2ea7e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; +import org.broadinstitute.sting.gatk.datasources.reads.LegacyReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; @@ -114,7 +114,7 @@ public class TraverseReadsUnitTest extends BaseTest { @Test public void testUnmappedReadCount() { SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); - Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new LegacyReadShardBalancer()); countReadWalker.initialize(); Object accumulator = countReadWalker.reduceInit(); diff --git a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 5b052454a..47096c73a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -23,14 +23,14 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testEmptyIterator() { - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); Assert.assertTrue(downsampler.isEmpty(),"Downsampler is not empty but should be."); } @Test public void testOneElementWithPoolSizeOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -42,7 +42,7 @@ public class LegacyReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -58,7 +58,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -78,7 +78,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -99,7 +99,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(0); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(0); downsampler.addAll(reads); Assert.assertTrue(downsampler.isEmpty(),"Downsampler isn't empty but should be"); @@ -115,7 +115,7 @@ public class LegacyReservoirDownsamplerUnitTest { reads.add(ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,76)); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(1); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(1); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); @@ -128,7 +128,7 @@ public class LegacyReservoirDownsamplerUnitTest { public void testFillingAcrossLoci() { List reads = new ArrayList(); reads.add(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); - ReservoirDownsampler downsampler = new ReservoirDownsampler(5); + LegacyReservoirDownsampler downsampler = new LegacyReservoirDownsampler(5); downsampler.addAll(reads); Assert.assertFalse(downsampler.isEmpty(),"Downsampler is empty but shouldn't be"); From 573ace4403e688504c714d7608ad00e199ca1909 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Mon, 10 Dec 2012 10:28:56 -0500 Subject: [PATCH 21/31] restore the right version of VariantContextUtils.java in my unstable dir --- .../variantcontext/VariantContextUtils.java | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 81959c998..b3e3cf8df 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -979,6 +979,40 @@ public class VariantContextUtils { private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + calculateChromosomeCounts(builder, true); + biallelics.add(reverseTrimAlleles(builder.make())); + } + + return biallelics; + } + } + /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * @@ -1233,7 +1267,7 @@ public class VariantContextUtils { * @param testString String to test * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's */ - protected static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { + public static int findNumberofRepetitions(byte[] repeatUnit, byte[] testString) { int numRepeats = 0; for (int start = 0; start < testString.length; start += repeatUnit.length) { int end = start + repeatUnit.length; From bdda63d9730182e5b0860ab00fb4ef5f0857b223 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 10 Dec 2012 14:47:04 -0500 Subject: [PATCH 23/31] Related bug fixes to GGA mode in the HC: some variants (especially MNPs) were causing problems because they don't have to start at the current location to match the allele being genotyped. Fixed. --- .../gatk/walkers/haplotypecaller/GenotypingEngine.java | 7 +++---- .../haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index fee6c86f8..4d81d0010 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -33,7 +33,6 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -449,10 +448,10 @@ public class GenotypingEngine { final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - alleleMapper.get(refAllele).add(h); - } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { + if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { alleleMapper.get(h.getArtificialAllele()).add(h); + } else if( h.getEventMap().get(loc) == null ) { // no event at this location so let's investigate later + undeterminedHaplotypes.add(h); } else { boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index e9c1ec605..a80137c27 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -33,7 +33,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "e2b3bf420c45c677956a2e4a56d75ea2"); + "fe84caa79f59ecbd98fcbcd5b30ab164"); } private void HCTestComplexVariants(String bam, String args, String md5) { From 8a115edbaf17b0bd798c1d55a126d6be6b7ff7b1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 6 Dec 2012 17:06:32 -0500 Subject: [PATCH 24/31] ReduceReads is now scattered by contig It's no longer safe to scatter/gather by interval because now we don't hard-clip to the intervals anymore. --- .../compression/reducereads/ReduceReads.java | 2 +- .../walkers/qc/AssessReducedCoverage.java | 149 -------------- .../gatk/walkers/qc/AssessReducedQuals.java | 183 ------------------ 3 files changed, 1 insertion(+), 333 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 629a27c48..2061c5364 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -85,7 +85,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.CONTIG) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) public class ReduceReads extends ReadWalker, ReduceReadsStash> { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java deleted file mode 100755 index d38c11594..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.*; - -/** - * Emits intervals present in either the original or reduced bam but not the other. - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals present in one bam but not the other. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedCoverage \
- *   -o output.intervals
- * 
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Hidden -public class AssessReducedCoverage extends LocusWalker implements TreeReducible { - - private static final String original = "original"; - private static final String reduced = "reduced"; - - @Output - protected PrintStream out; - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) - public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; - - public void initialize() {} - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( tracker == null ) - return null; - - Set tags = getAllTags(context.getBasePileup()); - return (tags.contains(original) && !tags.contains(reduced)) || - (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null; - } - - private Set getAllTags(final ReadBackedPileup pileup) { - - final Set tags = new HashSet(10); - - for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) - tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); - } - - return tags; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java deleted file mode 100644 index 2c70d44e2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ /dev/null @@ -1,183 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.List; - -/** - * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of - * the reduced bam are above sufficient threshold) - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedQuals \
- *   -o output.intervals
- * 
- * - * @author ami - */ - -public class AssessReducedQuals extends LocusWalker implements TreeReducible { - - private static final String reduced = "reduced"; - private static final int originalQualsIndex = 0; - private static final int reducedQualsIndex = 1; - - @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) - public int sufficientQualSum = 600; - - @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false) - public int qual_epsilon = 0; - - @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug level: NO_DEBUG, PRINT_LOCI,PRINT_PILEUPS", required = false) - public DebugLevel debugLevel = DebugLevel.NO_DEBUG; - - @Output - protected PrintStream out; - - public void initialize() { - if (debugLevel != DebugLevel.NO_DEBUG) - out.println(" Debug mode" + - "Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " + - "Debug:\tqual_epsilon: "+qual_epsilon); - } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - boolean reportLocus; - final int[] quals = getPileupQuals(context.getBasePileup()); - if (debugLevel != DebugLevel.NO_DEBUG) - out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]); - final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon); - final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum); - final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum); - final int OriginalReducedQualDiff = calcOriginalQuals - calcReducedQuals; - reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon; - if(debugLevel != DebugLevel.NO_DEBUG && reportLocus) - out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff); - - return reportLocus ? ref.getLocus() : null; - } - - private int[] getPileupQuals(final ReadBackedPileup readPileup) { - - final int[] quals = new int[2]; - String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n", - "Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"}; - - for( PileupElement p : readPileup ){ - final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p) ){ - final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); - final int tagIndex = getTagIndex(tags); - quals[tagIndex] += tempQual; - if(debugLevel == DebugLevel.PRINT_PILEUPS) - printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n"; - } - } - if(debugLevel == DebugLevel.PRINT_PILEUPS){ - out.println(printPileup[originalQualsIndex]); - out.println(printPileup[reducedQualsIndex]); - } - return quals; - } - - - private boolean isGoodRead(PileupElement p){ - // TODO -- You want to check whether the read itself is a reduced read and only - // TODO -- for them you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). - return !p.isDeletion() && ((p.getRead().isReducedRead()) || (!p.getRead().isReducedRead() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); - } - - private int getTagIndex(List tags){ - return tags.contains(reduced) ? 1 : 0; - } - - @Override - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - @Override - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - @Override - public GenomeLoc reduceInit() { - return null; - } - - @Override - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } - - public enum DebugLevel { - NO_DEBUG, - /** - * Print locus level information (such as locus quals) and loci with unmatch quals - */ - PRINT_LOCI, - /** - * Print the pileup infomarion of the reduced bam files and the original bam files - */ - PRINT_PILEUPS - } -} From 2e3284f30669c2b28e14dffa9ac0cf49f1f1f7e6 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Mon, 10 Dec 2012 22:23:58 -0500 Subject: [PATCH 25/31] Continue to fix the case where PRIORITIZE is used but no priority list is given. While fixing that case I also removed unnecessary sorting, when the prioeity list is not provied. When the priority list is not provided, it will continue to be null. Thus, the number of original Variant Contexts should be given as a new parameter to simpleMerge (since priority might be null). This new parameter is used for checking if there are filtered VC, when annotationOrigin is true. --- .../walkers/variantutils/CombineVariants.java | 7 ++-- .../variantcontext/VariantContextUtils.java | 42 +++++++++++++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 1d4913769..68fac7631 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -228,7 +228,7 @@ public class CombineVariants extends RodWalker implements Tree if ( genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE && PRIORITY_STRING == null ) throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); - if ( PRIORITY_STRING != null || genotypeMergeOption == VariantContextUtils.GenotypeMergeType.PRIORITIZE ){ + if ( PRIORITY_STRING != null){ priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); if ( rodNames.size() != priority.size() ) throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); @@ -243,6 +243,7 @@ public class CombineVariants extends RodWalker implements Tree if ( tracker == null ) // RodWalkers can make funky map calls return 0; + Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); @@ -289,13 +290,13 @@ public class CombineVariants extends RodWalker implements Tree for (VariantContext.Type type : VariantContext.Type.values()) { if (VCsByType.containsKey(type)) mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + priority, rodNames.size() , filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } else if (multipleAllelesMergeType == VariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcs, - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index b3e3cf8df..8b360eb5e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -448,11 +448,47 @@ public class VariantContextUtils { final String setKey, final boolean filteredAreUncalled, final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(genomeLocParser,unsortedVCs,priorityListOfVCs,originalNumOfVCs,filteredRecordMergeType,genotypeMergeOptions,annotateOrigin,printMessages,setKey,filteredAreUncalled,mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param genomeLocParser loc parser + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, + final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; - if ( annotateOrigin && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts"); + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) verifyUniqueSampleNames(unsortedVCs); @@ -597,7 +633,7 @@ public class VariantContextUtils { if ( annotateOrigin ) { // we care about where the call came from String setValue; - if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = MERGE_FILTER_IN_ALL; From 2f99569dda32da2c0d7124194e4e1c4371d29ee7 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Mon, 10 Dec 2012 22:48:15 -0500 Subject: [PATCH 26/31] change the md5 in one of the CV intergration tests, since it wasn't use the priority list when printing the origin of the annotation (the setValue field) --- .../walkers/variantutils/CombineVariantsIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index c32d77f82..28b176d4b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -137,7 +137,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("e5f0e7a80cd392172ebf5ddb06b91a00")); + Arrays.asList("58e6281df108c361e99673a501ee4749")); cvExecuteTest("threeWayWithRefs", spec, true); } From dd52a70d459004d32ee9edc5490bde7fb269c53a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 11 Dec 2012 10:39:58 -0500 Subject: [PATCH 27/31] Fix AFCalcResult unit test -- I was simply passing in the wrong values into the function. Fixed the calls, and expanded the docs on what needs to be passed in. --- .../walkers/genotyper/afcalc/AFCalcResultUnitTest.java | 10 +++++----- .../gatk/walkers/genotyper/afcalc/AFCalcResult.java | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 016926e12..ee5436264 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -111,11 +111,11 @@ public class AFCalcResultUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "TestIsPolymorphic") private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { - final AFCalcResult result = makePolymorphicTestData(pNonRef); - final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold)); - Assert.assertEquals(actualIsPoly, shouldBePoly, - "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " - + actualIsPoly + " but the expected result is " + shouldBePoly); + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(1 - pThreshold)); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); } @Test(enabled = true, dataProvider = "TestIsPolymorphic") diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index d6a5cb16d..142469077 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -230,6 +230,9 @@ public class AFCalcResult { * And that log10minPNonRef is -3. * We are considered polymorphic since 10^-5 < 10^-3 => -5 < -3 * + * Note that log10minPNonRef is really the minimum confidence, scaled as an error rate, so + * if you want to be 99% confidence, then log10PNonRef should be log10(0.01) = -2. + * * @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 From 5632c13bf247826920018fb9c881470504567d4e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 12 Dec 2012 10:27:07 -0500 Subject: [PATCH 28/31] Resolves GSA-681 / Compressed VCF.gz output is too big because of unnecessary call to flush(). -- Now compressed output VCFs are properly blocked compressed (i.e., they are actually smaller than the uncompressed VCF) --- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- .../sting/utils/variantcontext/writer/VCFWriter.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f2b2dfb7d..c768f95ad 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d"; + private final static String COMPRESSED_OUTPUT_MD5 = "3eba6c309514d1e9ee06a20a112b68e6"; @Test public void testCompressedOutput() { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 9a987f161..974e50ced 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -248,7 +248,8 @@ class VCFWriter extends IndexingVariantContextWriter { } mWriter.write("\n"); - mWriter.flush(); // necessary so that writing to an output stream will work + // note that we cannot call flush here if we want block gzipping to work properly + // calling flush results in all gzipped blocks for each variant } catch (IOException e) { throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e); } From 211a6e78ea2404f714ecaeec6460fa52233689ba Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 12 Dec 2012 14:53:02 -0500 Subject: [PATCH 29/31] Further related bug fixes to GGA mode in the HC: some variants (especially MNPs) were causing problems because they don't have to start at the current location to match the allele being genotyped. Fixed. --- .../haplotypecaller/SimpleDeBruijnAssembler.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 4f072d720..3c5a1f79c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -282,10 +282,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - return returnHaplotypes; - //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); - } + addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, true ); } } @@ -293,7 +290,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); - if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { + if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ) ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { @@ -308,7 +305,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop, false ); } } } @@ -332,7 +329,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return returnHaplotypes; } - private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop ) { + private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); @@ -387,7 +384,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return false; } - if( !haplotypeList.contains(h) ) { + if( FORCE_INCLUSION_FOR_GGA_MODE || !haplotypeList.contains(h) ) { haplotypeList.add(h); return true; } else { From bba63a3b0ed94bf4d604d5a7f15e33f0f52fa930 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Dec 2012 20:25:45 +0000 Subject: [PATCH 30/31] Fix for GSA-615: UnifiedGenotyperEngine.getGLModelsToUse takes 5% of the runtime of UG, should be optimized away. --- .../GenotypeLikelihoodsCalculationModel.java | 4 +- .../genotyper/UnifiedGenotyperEngine.java | 74 ++++++++++--------- 2 files changed, 42 insertions(+), 36 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index ae9b01f2d..a8ee4afde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -59,8 +59,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { public enum Model { SNP, INDEL, - GeneralPloidySNP, - GeneralPloidyINDEL, + GENERALPLOIDYSNP, + GENERALPLOIDYINDEL, BOTH } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index cc086b148..8f2588679 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -52,7 +52,7 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - private static final String GPSTRING = "GeneralPloidy"; + private static final String GPSTRING = "GENERALPLOIDY"; public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; @@ -79,6 +79,7 @@ public class UnifiedGenotyperEngine { // the model used for calculating genotypes private ThreadLocal> glcm = new ThreadLocal>(); + private final List modelsToUse = new ArrayList(2); // the model used for calculating p(non-ref) private ThreadLocal afcm = new ThreadLocal(); @@ -134,6 +135,8 @@ public class UnifiedGenotyperEngine { computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); filter.add(LOW_QUAL_FILTER_NAME); + + determineGLModelsToUse(); } /** @@ -286,7 +289,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); + return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -634,48 +637,51 @@ public class UnifiedGenotyperEngine { (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && QualityUtils.phredScaleErrorRate(PofF) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); } + private void determineGLModelsToUse() { + + String modelPrefix = ""; + if ( !UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ) + modelPrefix = GPSTRING; + + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { + modelPrefix += UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); + } + else { + modelsToUse.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); + } + } + // decide whether we are currently processing SNPs, indels, neither, or both private List getGLModelsToUse(final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext) { - final List models = new ArrayList(2); - String modelPrefix = ""; - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) - modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); + if ( UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) + return modelsToUse; - if (!UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) - modelPrefix = GPSTRING + modelPrefix; + // if we're genotyping given alleles then we need to choose the model corresponding to the variant type requested + final List GGAmodel = new ArrayList(1); + final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); + if ( vcInput == null ) + return GGAmodel; // no work to be done - // if we're genotyping given alleles and we have a requested SNP at this position, do SNP - if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcInput = getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); - if ( vcInput == null ) - return models; - - if ( vcInput.isSNP() ) { - // ignore SNPs if the user chose INDEL mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("SNP") ) - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - } - else if ( vcInput.isIndel() || vcInput.isMixed() ) { - // ignore INDELs if the user chose SNP mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("INDEL") ) - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - // No support for other types yet + if ( vcInput.isSNP() ) { + // use the SNP model unless the user chose INDEL mode only + if ( modelsToUse.size() == 2 || modelsToUse.get(0).name().endsWith("SNP") ) + GGAmodel.add(modelsToUse.get(0)); } - else { - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) { - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - } - else { - models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); - } + else if ( vcInput.isIndel() || vcInput.isMixed() ) { + // use the INDEL model unless the user chose SNP mode only + if ( modelsToUse.size() == 2 ) + GGAmodel.add(modelsToUse.get(1)); + else if ( modelsToUse.get(0).name().endsWith("INDEL") ) + GGAmodel.add(modelsToUse.get(0)); } + // No support for other types yet - return models; + return GGAmodel; } public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { From 62eaffdf0a475888bec87771a3c1499b0bbce524 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 12 Dec 2012 20:28:04 +0000 Subject: [PATCH 31/31] Fix docs for ReadBackedPhasing --- .../sting/gatk/walkers/phasing/ReadBackedPhasing.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index eda43e6a5..7d848d0d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -135,6 +135,9 @@ public class ReadBackedPhasing extends RodWalker