From e29469eeebc1c1c6cec097615aed7e04043a3ea9 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 22 Aug 2012 15:53:33 -0400 Subject: [PATCH 003/161] Forgot to update 2 integration test md5's (in this cases, changes are legit because of the code revamp of AD, it's simpler if AD is not output when a site is not variant, as genotype DP conveys the same information) --- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7390ec206..02e1bdf12 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -174,12 +174,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8084a847f4a3c53a030e8c52eec35cea"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "bec7bcc50b42782e20a970db11201399"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "931e396f2a6903a291e813c64c18f8b5"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "09494afd12cef97293ed35d1a972f623"); } private void testOutputParameters(final String args, final String md5) { From 18060f237b21b53e2442526c4b06ea955cd8baac Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 22 Aug 2012 09:10:09 -0400 Subject: [PATCH 006/161] Add thread efficiency monitoring to GATK HMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -- See https://jira.broadinstitute.org/browse/GSA-502 -- New command line argument -mt 
enables thread monitoring -- If enabled, HMS uses StateMonitoringThreadFactory to create monitored threads, and prints out an efficiency report when HMS exits, telling the user information like: for BQSR – known to be inefficient locking INFO 17:10:33,195 StateMonitoringThreadFactory - Number of activeThreads used: 8 INFO 17:10:33,196 StateMonitoringThreadFactory - Total runtime 90.3 m INFO 17:10:33,196 StateMonitoringThreadFactory - Fraction of time spent blocked is 0.72 ( 64.8 m) INFO 17:10:33,197 StateMonitoringThreadFactory - Fraction of time spent running is 0.26 ( 23.7 m) INFO 17:10:33,197 StateMonitoringThreadFactory - Fraction of time spent waiting is 0.02 ( 112.8 s) INFO 17:10:33,197 StateMonitoringThreadFactory - Efficiency of multi-threading: 26.19% of time spent doing productive work for CountLoci INFO 17:06:12,777 StateMonitoringThreadFactory - Number of activeThreads used: 8 INFO 17:06:12,777 StateMonitoringThreadFactory - Total runtime 43.5 m INFO 17:06:12,778 StateMonitoringThreadFactory - Fraction of time spent blocked is 0.00 ( 4.2 s) INFO 17:06:12,778 StateMonitoringThreadFactory - Fraction of time spent running is 1.00 ( 43.3 m) INFO 17:06:12,779 StateMonitoringThreadFactory - Fraction of time spent waiting is 0.00 ( 6.0 s) INFO 17:06:12,779 StateMonitoringThreadFactory - Efficiency of multi-threading: 99.61% of time spent doing productive work --- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../arguments/GATKArgumentCollection.java | 4 +++ .../executive/HierarchicalMicroScheduler.java | 30 +++++++++++++++++-- .../sting/gatk/executive/MicroScheduler.java | 2 +- .../resourcemanagement/ThreadAllocation.java | 16 ++++++++-- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index e76cde43a..9a9febb78 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -372,7 +372,7 @@ public class GenomeAnalysisEngine { else if(argCollection.numberOfIOThreads != null) numIOThreads = argCollection.numberOfIOThreads; - this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads); + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, argCollection.monitorThreads); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index f66e229bc..6a14373f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -283,6 +283,10 @@ public class GATKArgumentCollection { @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public Integer numberOfThreads = 1; + /** Should we monitor threading efficiency? . */ + @Argument(fullName = "monitorThreads", shortName = "mt", doc = "Should we monitor the threading efficiency when running in multi-threaded mode?", required = false) + public Boolean monitorThreads = false; + /** * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types. * TODO: Kill this when I can do a tagged integer in Queue. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 70b1be0e1..017eeb55a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.threading.StateMonitoringThreadFactory; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; @@ -72,6 +73,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** What is the total time spent merging output? */ private long totalOutputMergeTime = 0; + /** may be null */ + final StateMonitoringThreadFactory monitoringThreadFactory; + /** * Create a new hierarchical microscheduler to process the given reads and reference. * @@ -80,9 +84,22 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar * @param reference Reference for driving the traversal. 
* @param nThreadsToUse maximum number of threads to use to do the work */ - protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) { + protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final int nThreadsToUse, + final boolean monitorThreadPerformance ) { super(engine, walker, reads, reference, rods); - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); + + if ( monitorThreadPerformance ) { + this.monitoringThreadFactory = new StateMonitoringThreadFactory(nThreadsToUse); + this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); + } else { + this.monitoringThreadFactory = null; + this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); + } } public Object execute( Walker walker, Iterable shardStrategy ) { @@ -140,10 +157,19 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar // do final cleanup operations outputTracker.close(); cleanup(); + printThreadingEfficiency(); return result; } + /** + * Print out the threading efficiency of this HMS, if state monitoring is enabled + */ + private void printThreadingEfficiency() { + if ( monitoringThreadFactory != null ) + monitoringThreadFactory.printUsageInformation(logger); + } + /** * Run the initialize method of the walker. 
Ensure that any calls * to the output stream will bypass thread local storage and write diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 95e39b7c6..c845bbce0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -98,7 +98,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads()); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.shouldMonitorThreads()); } else { if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index 0c81af07b..07a45c0f9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -40,6 +40,11 @@ public class ThreadAllocation { */ private final int numIOThreads; + /** + * Should we monitor thread efficiency? + */ + private final boolean monitorThreads; + public int getNumCPUThreads() { return numCPUThreads; } @@ -48,11 +53,15 @@ public class ThreadAllocation { return numIOThreads; } + public boolean shouldMonitorThreads() { + return monitorThreads; + } + /** * Construct the default thread allocation. */ public ThreadAllocation() { - this(1,null,null); + this(1, null, null, false); } /** @@ -62,7 +71,7 @@ public class ThreadAllocation { * @param numCPUThreads Total number of threads allocated to the traversal. * @param numIOThreads Total number of threads allocated exclusively to IO. 
*/ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorThreads) { // If no allocation information is present, allocate all threads to CPU if(numCPUThreads == null && numIOThreads == null) { this.numCPUThreads = totalThreads; @@ -88,6 +97,7 @@ public class ThreadAllocation { this.numCPUThreads = numCPUThreads; this.numIOThreads = numIOThreads; } - } + this.monitorThreads = monitorThreads; + } } From f876c5127742646854511e6b223145e68323aa34 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 22 Aug 2012 10:28:27 -0400 Subject: [PATCH 007/161] Separately track time spent doing user and system CPU work -- Allows us to ID (by proxy) time spent doing IO -- Refactor StateMonitoryingThreadFactory to use it's own enum, not Thread.State -- Reliable unit tests across mac and unix --- .../StateMonitoringThreadFactory.java | 122 +++++++++++------- .../StateMonitoringThreadFactoryUnitTest.java | 43 +++--- 2 files changed, 99 insertions(+), 66 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java index 39d5c1497..a62501f08 100644 --- a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java +++ b/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.threading; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.AutoFormattingTime; @@ -33,11 +34,11 @@ import java.lang.management.ManagementFactory; import java.lang.management.ThreadInfo; 
import java.lang.management.ThreadMXBean; import java.util.ArrayList; -import java.util.Arrays; import java.util.EnumMap; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; /** * Create activeThreads, collecting statistics about their running state over time @@ -51,20 +52,36 @@ import java.util.concurrent.ThreadFactory; @Invariant({ "activeThreads.size() <= nThreadsToCreate", "countDownLatch.getCount() <= nThreadsToCreate", - "nThreadsToCreated <= nThreadsToCreate" + "nThreadsCreated <= nThreadsToCreate" }) public class StateMonitoringThreadFactory implements ThreadFactory { - protected static final boolean DEBUG = false; + protected static final boolean DEBUG = true; private static Logger logger = Logger.getLogger(StateMonitoringThreadFactory.class); - public static final List TRACKED_STATES = Arrays.asList(Thread.State.BLOCKED, Thread.State.RUNNABLE, Thread.State.WAITING); + + public enum State { + BLOCKING("blocking on synchronized data structure"), + WAITING("waiting on some other thread"), + USER_CPU("doing productive CPU work"), + WAITING_FOR_IO("waiting for I/O"); + + private final String userFriendlyName; + + private State(String userFriendlyName) { + this.userFriendlyName = userFriendlyName; + } + + public String getUserFriendlyName() { + return userFriendlyName; + } + } // todo -- it would be nice to not have to specify upfront the number of threads. // todo -- can we dynamically increment countDownLatch? It seems not... 
final int nThreadsToCreate; final List activeThreads; - final EnumMap times = new EnumMap(Thread.State.class); + final EnumMap times = new EnumMap(State.class); - int nThreadsToCreated = 0; + int nThreadsCreated = 0; /** * The bean used to get the thread info about blocked and waiting times @@ -78,16 +95,6 @@ public class StateMonitoringThreadFactory implements ThreadFactory { */ final CountDownLatch countDownLatch; - /** - * Instead of RUNNABLE we want to print running. This map goes from Thread.State names to human readable ones - */ - final static EnumMap PRETTY_NAMES = new EnumMap(Thread.State.class); - static { - PRETTY_NAMES.put(Thread.State.RUNNABLE, "running"); - PRETTY_NAMES.put(Thread.State.BLOCKED, "blocked"); - PRETTY_NAMES.put(Thread.State.WAITING, "waiting"); - } - /** * Create a new factory generating threads whose runtime and contention * behavior is tracked in this factory. @@ -102,7 +109,7 @@ public class StateMonitoringThreadFactory implements ThreadFactory { activeThreads = new ArrayList(nThreadsToCreate); // initialize times to 0 - for ( final Thread.State state : Thread.State.values() ) + for ( final State state : State.values() ) times.put(state, 0l); // get the bean, and start tracking @@ -113,17 +120,22 @@ public class StateMonitoringThreadFactory implements ThreadFactory { logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); //bean.setThreadCpuTimeEnabled(true); + if ( bean.isThreadCpuTimeSupported() ) + bean.setThreadCpuTimeEnabled(true); + else + logger.warn("Thread CPU monitoring not supported, we cannot track GATK multi-threaded efficiency"); + countDownLatch = new CountDownLatch(nThreadsToCreate); } /** * Get the time spent in state across all threads created by this factory * - * @param state on of the TRACKED_STATES + * @param state to get information about * @return the time in milliseconds */ - @Ensures({"result >= 0", "TRACKED_STATES.contains(state)"}) - public synchronized 
long getStateTime(final Thread.State state) { + @Ensures({"result >= 0"}) + public synchronized long getStateTime(final State state) { return times.get(state); } @@ -145,8 +157,8 @@ public class StateMonitoringThreadFactory implements ThreadFactory { * * @return the fraction (0.0-1.0) of time spent in state over all state times of all threads */ - @Ensures({"result >= 0.0", "result <= 1.0", "TRACKED_STATES.contains(state)"}) - public synchronized double getStateFraction(final Thread.State state) { + @Ensures({"result >= 0.0", "result <= 1.0"}) + public synchronized double getStateFraction(final State state) { return getStateTime(state) / (1.0 * Math.max(getTotalTime(), 1)); } @@ -156,10 +168,15 @@ public class StateMonitoringThreadFactory implements ThreadFactory { */ @Ensures("result >= 0") public int getNThreadsCreated() { - return nThreadsToCreated; + return nThreadsCreated; } - public void waitForAllThreadsToComplete() throws InterruptedException { + /** + * Only useful for testing, so that we can wait for all of the threads in the factory to complete running + * + * @throws InterruptedException + */ + protected void waitForAllThreadsToComplete() throws InterruptedException { countDownLatch.await(); } @@ -168,7 +185,7 @@ public class StateMonitoringThreadFactory implements ThreadFactory { final StringBuilder b = new StringBuilder(); b.append("total ").append(getTotalTime()).append(" "); - for ( final Thread.State state : TRACKED_STATES ) { + for ( final State state : State.values() ) { b.append(state).append(" ").append(getStateTime(state)).append(" "); } @@ -193,17 +210,17 @@ public class StateMonitoringThreadFactory implements ThreadFactory { */ public synchronized void printUsageInformation(final Logger logger, final Priority priority) { logger.log(priority, "Number of activeThreads used: " + getNThreadsCreated()); - logger.log(priority, "Total runtime " + new AutoFormattingTime(getTotalTime() / 1000.0)); - for ( final Thread.State state : TRACKED_STATES ) { 
+ logger.log(priority, "Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + for ( final State state : State.values() ) { logger.log(priority, String.format(" Fraction of time spent %s is %.2f (%s)", - prettyName(state), getStateFraction(state), new AutoFormattingTime(getStateTime(state) / 1000.0))); + state.getUserFriendlyName(), + getStateFraction(state), + new AutoFormattingTime(getStateTime(state) / 1000.0))); } - logger.log(priority, String.format("Efficiency of multi-threading: %.2f%% of time spent doing productive work", - getStateFraction(Thread.State.RUNNABLE) * 100)); - } - - private String prettyName(final Thread.State state) { - return PRETTY_NAMES.get(state); + logger.log(priority, String.format("CPU efficiency : %.2f%% of time spent doing productive work", + getStateFraction(State.USER_CPU) * 100)); + logger.log(priority, String.format("I/O inefficiency: %.2f%% of time spent waiting on I/O", + getStateFraction(State.WAITING_FOR_IO) * 100)); } /** @@ -216,13 +233,13 @@ public class StateMonitoringThreadFactory implements ThreadFactory { @Ensures({ "activeThreads.size() > old(activeThreads.size())", "activeThreads.contains(result)", - "nThreadsToCreated == old(nThreadsToCreated) + 1" + "nThreadsCreated == old(nThreadsCreated) + 1" }) public synchronized Thread newThread(final Runnable runnable) { if ( activeThreads.size() >= nThreadsToCreate) throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); - nThreadsToCreated++; + nThreadsCreated++; final Thread myThread = new TrackingThread(runnable); activeThreads.add(myThread); return myThread; @@ -234,8 +251,7 @@ public class StateMonitoringThreadFactory implements ThreadFactory { * This method updates all of the key timing and tracking information in the factory so that * thread can be retired. 
After this call the factory shouldn't have a pointer to the thread any longer * - * @param thread - * @param runtimeInMilliseconds + * @param thread the thread whose information we are updating */ @Ensures({ "activeThreads.size() < old(activeThreads.size())", @@ -243,16 +259,24 @@ public class StateMonitoringThreadFactory implements ThreadFactory { "getTotalTime() >= old(getTotalTime())", "countDownLatch.getCount() < old(countDownLatch.getCount())" }) - private synchronized void threadIsDone(final Thread thread, final long runtimeInMilliseconds) { + private synchronized void threadIsDone(final Thread thread) { if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); if ( DEBUG ) logger.warn("UpdateThreadInfo called"); + final long threadID = thread.getId(); final ThreadInfo info = bean.getThreadInfo(thread.getId()); + final long totalTimeNano = bean.getThreadCpuTime(threadID); + final long userTimeNano = bean.getThreadUserTime(threadID); + final long systemTimeNano = totalTimeNano - userTimeNano; + final long userTimeInMilliseconds = nanoToMilli(userTimeNano); + final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); + if ( info != null ) { - if ( DEBUG ) logger.warn("Updating thread total runtime " + runtimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); - incTimes(Thread.State.BLOCKED, info.getBlockedTime()); - incTimes(Thread.State.WAITING, info.getWaitedTime()); - incTimes(Thread.State.RUNNABLE, runtimeInMilliseconds - info.getWaitedTime() - info.getBlockedTime()); + if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); + incTimes(State.BLOCKING, info.getBlockedTime()); + incTimes(State.WAITING, info.getWaitedTime()); + incTimes(State.USER_CPU, 
userTimeInMilliseconds); + incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); } // remove the thread from the list of active activeThreads @@ -270,10 +294,16 @@ public class StateMonitoringThreadFactory implements ThreadFactory { * @param state * @param by */ - private synchronized void incTimes(final Thread.State state, final long by) { + @Requires({"state != null", "by >= 0"}) + @Ensures("getTotalTime() == old(getTotalTime()) + by") + private synchronized void incTimes(final State state, final long by) { times.put(state, times.get(state) + by); } + private static long nanoToMilli(final long timeInNano) { + return TimeUnit.NANOSECONDS.toMillis(timeInNano); + } + /** * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete */ @@ -284,10 +314,8 @@ public class StateMonitoringThreadFactory implements ThreadFactory { @Override public void run() { - final long startTime = System.currentTimeMillis(); super.run(); - final long endTime = System.currentTimeMillis(); - threadIsDone(this, endTime - startTime); + threadIsDone(this); } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java index 5a606c50e..b41070a14 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java @@ -41,30 +41,30 @@ import java.util.concurrent.*; */ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; final static Object GLOBAL_LOCK = new Object(); private class StateTest extends 
TestDataProvider { private final double TOLERANCE = 0.1; // willing to tolerate a 10% error - final List statesForThreads; + final List statesForThreads; - public StateTest(final List statesForThreads) { + public StateTest(final List statesForThreads) { super(StateTest.class); this.statesForThreads = statesForThreads; setName("StateTest " + Utils.join(",", statesForThreads)); } - public List getStatesForThreads() { + public List getStatesForThreads() { return statesForThreads; } public int getNStates() { return statesForThreads.size(); } - public double maxStateFraction(final Thread.State state) { return fraction(state) + TOLERANCE; } - public double minStateFraction(final Thread.State state) { return fraction(state) - TOLERANCE; } + public double maxStateFraction(final StateMonitoringThreadFactory.State state) { return fraction(state) + TOLERANCE; } + public double minStateFraction(final StateMonitoringThreadFactory.State state) { return fraction(state) - TOLERANCE; } - private double fraction(final Thread.State state) { + private double fraction(final StateMonitoringThreadFactory.State state) { return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); } } @@ -74,18 +74,16 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { * requested for input argument */ private static class StateTestThread implements Callable { - private final Thread.State stateToImplement; + private final StateMonitoringThreadFactory.State stateToImplement; - private StateTestThread(final Thread.State stateToImplement) { - if ( ! 
StateMonitoringThreadFactory.TRACKED_STATES.contains(stateToImplement) ) - throw new IllegalArgumentException("Unexpected state " + stateToImplement); + private StateTestThread(final StateMonitoringThreadFactory.State stateToImplement) { this.stateToImplement = stateToImplement; } @Override public Double call() throws Exception { switch ( stateToImplement ) { - case RUNNABLE: + case USER_CPU: // do some work until we get to THREAD_TARGET_DURATION_IN_MILLISECOND double sum = 0.0; final long startTime = System.currentTimeMillis(); @@ -96,13 +94,17 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { case WAITING: Thread.currentThread().sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); return 0.0; - case BLOCKED: + case BLOCKING: if ( StateMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); synchronized (GLOBAL_LOCK) { // the GLOBAL_LOCK must be held by the unit test itself for this to properly block if ( StateMonitoringThreadFactory.DEBUG ) logger.warn(" ... done blocking"); } return 0.0; + case WAITING_FOR_IO: + // TODO -- implement me + // shouldn't ever get here, throw an exception + throw new ReviewedStingException("WAITING_FOR_IO testing currently not implemented, until we figure out how to force a system call block"); default: throw new ReviewedStingException("Unexpected thread test state " + stateToImplement); } @@ -111,8 +113,11 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { @DataProvider(name = "StateTest") public Object[][] createStateTest() { - for ( final int nThreads : Arrays.asList(1, 2, 3, 4) ) { - for (final List states : Utils.makePermutations(StateMonitoringThreadFactory.TRACKED_STATES, nThreads, true) ) { + for ( final int nThreads : Arrays.asList(3) ) { + //final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.WAITING_FOR_IO); + final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.USER_CPU, StateMonitoringThreadFactory.State.WAITING, 
StateMonitoringThreadFactory.State.BLOCKING); + //final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.values()); + for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { //if ( Collections.frequency(states, Thread.State.BLOCKED) > 0) new StateTest(states); } @@ -121,7 +126,7 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = false, dataProvider = "StateTest") + @Test(enabled = true, dataProvider = "StateTest") public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final StateMonitoringThreadFactory factory = new StateMonitoringThreadFactory(test.getNStates()); @@ -130,7 +135,7 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { logger.warn("Running " + test); synchronized (GLOBAL_LOCK) { //logger.warn(" Have lock"); - for ( final Thread.State threadToRunState : test.getStatesForThreads() ) + for ( final StateMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) threadPool.submit(new StateTestThread(threadToRunState)); // lock has to be here for the whole running of the activeThreads but end before the sleep so the blocked activeThreads @@ -153,7 +158,7 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertTrue(totalTime >= minTime, "Factory results not properly accumulated: totalTime = " + totalTime + " < minTime = " + minTime); Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); - for (final Thread.State state : StateMonitoringThreadFactory.TRACKED_STATES ) { + for (final StateMonitoringThreadFactory.State state : StateMonitoringThreadFactory.State.values() ) { final double min = test.minStateFraction(state); final double max = test.maxStateFraction(state); final double obs = factory.getStateFraction(state); @@ -170,6 +175,6 
@@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertEquals(factory.getNThreadsCreated(), test.getNStates()); // should be called to ensure we don't format / NPE on output - factory.printUsageInformation(logger, Priority.INFO); + factory.printUsageInformation(logger, Priority.WARN); } } \ No newline at end of file From e1293f0ef27f33cb5c32ff2ec61c1a6b9bf831f4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 22 Aug 2012 11:31:14 -0400 Subject: [PATCH 008/161] GSA-507: Thread monitoring refactored so it can work without a thread factory -- Old version StateMonitoringThreadFactory refactored into base class ThreadEfficiencyMonitor and subclass EfficiencyMonitoringThreadFactory. -- Base class is used by LinearMicroScheduler to monitor performance of GATK in single threaded mode -- MicroScheduler now handles management of the efficiency monitor. Includes master thread in monitor, meaning that reduce is now included for both schedulers --- .../executive/HierarchicalMicroScheduler.java | 19 +- .../gatk/executive/LinearMicroScheduler.java | 13 +- .../sting/gatk/executive/MicroScheduler.java | 33 +- .../EfficiencyMonitoringThreadFactory.java | 159 +++++++++ .../StateMonitoringThreadFactory.java | 321 ------------------ .../threading/ThreadEfficiencyMonitor.java | 206 +++++++++++ .../StateMonitoringThreadFactoryUnitTest.java | 36 +- 7 files changed, 431 insertions(+), 356 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 017eeb55a..70cdaab22 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.threading.StateMonitoringThreadFactory; +import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory; import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor; import java.util.Collection; @@ -73,9 +73,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** What is the total time spent merging output? */ private long totalOutputMergeTime = 0; - /** may be null */ - final StateMonitoringThreadFactory monitoringThreadFactory; - /** * Create a new hierarchical microscheduler to process the given reads and reference. 
* @@ -94,10 +91,10 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar super(engine, walker, reads, reference, rods); if ( monitorThreadPerformance ) { - this.monitoringThreadFactory = new StateMonitoringThreadFactory(nThreadsToUse); + final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); + setThreadEfficiencyMonitor(monitoringThreadFactory); this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); } else { - this.monitoringThreadFactory = null; this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); } } @@ -157,19 +154,11 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar // do final cleanup operations outputTracker.close(); cleanup(); - printThreadingEfficiency(); + executionIsDone(); return result; } - /** - * Print out the threading efficiency of this HMS, if state monitoring is enabled - */ - private void printThreadingEfficiency() { - if ( monitoringThreadFactory != null ) - monitoringThreadFactory.printUsageInformation(logger); - } - /** * Run the initialize method of the walker. 
Ensure that any calls * to the output stream will bypass thread local storage and write diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index b35abb775..7a6902fff 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.util.Collection; @@ -33,8 +34,17 @@ public class LinearMicroScheduler extends MicroScheduler { * @param reference Reference for driving the traversal. * @param rods Reference-ordered data. */ - protected LinearMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods ) { + protected LinearMicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final boolean monitorThreadPerformance ) { super(engine, walker, reads, reference, rods); + + if ( monitorThreadPerformance ) + setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); + } /** @@ -88,6 +98,7 @@ public class LinearMicroScheduler extends MicroScheduler { outputTracker.close(); cleanup(); + executionIsDone(); return accumulator; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c845bbce0..0abd75b65 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -39,6 +39,8 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import javax.management.JMException; import javax.management.MBeanServer; @@ -79,6 +81,13 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { private final MBeanServer mBeanServer; private final ObjectName mBeanName; + /** + * Threading efficiency monitor for tracking the resource utilization of the GATK + * + * may be null + */ + ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the * selected walker. @@ -102,7 +111,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else { if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - return new LinearMicroScheduler(engine, walker, reads, reference, rods); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.shouldMonitorThreads()); } } @@ -150,6 +159,16 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } + + /** + * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses + * + * @param threadEfficiencyMonitor + */ + public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { + this.threadEfficiencyMonitor = threadEfficiencyMonitor; + } + /** * Walks a walker over the given list of intervals. * @@ -183,6 +202,18 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { traversalEngine.printOnTraversalDone(); } + /** + * Must be called by subclasses when execute is done + */ + protected void executionIsDone() { + // Print out the threading efficiency of this HMS, if state monitoring is enabled + if ( threadEfficiencyMonitor != null ) { + // include the master thread information + threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); + threadEfficiencyMonitor.printUsageInformation(logger); + } + } + /** * Gets the engine that created this microscheduler. * @return The engine owning this microscheduler. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java new file mode 100644 index 000000000..51af08681 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java @@ -0,0 +1,159 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; + +/** + * Creates threads that automatically monitor their efficiency via the parent ThreadEfficiencyMonitor + * + * User: depristo + * Date: 8/14/12 + * Time: 8:47 AM + */ +@Invariant({ + "activeThreads.size() <= nThreadsToCreate", + "countDownLatch.getCount() <= nThreadsToCreate", + "nThreadsCreated <= nThreadsToCreate" +}) +public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor implements ThreadFactory { + final int nThreadsToCreate; + final List activeThreads; + + int nThreadsCreated = 0; + + /** + * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into + * times. Counts down from nThreadsToCreate to 0, at which point any code waiting + * on the final times is freed to run. + */ + final CountDownLatch countDownLatch; + + /** + * Create a new factory generating threads whose runtime and contention + * behavior is tracked in this factory. 
+ * + * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete + */ + public EfficiencyMonitoringThreadFactory(final int nThreadsToCreate) { + super(); + if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); + + this.nThreadsToCreate = nThreadsToCreate; + activeThreads = new ArrayList(nThreadsToCreate); + countDownLatch = new CountDownLatch(nThreadsToCreate); + } + + /** + * How many threads have been created by this factory so far? + * @return + */ + @Ensures("result >= 0") + public int getNThreadsCreated() { + return nThreadsCreated; + } + + /** + * Only useful for testing, so that we can wait for all of the threads in the factory to complete running + * + * @throws InterruptedException + */ + protected void waitForAllThreadsToComplete() throws InterruptedException { + countDownLatch.await(); + } + + @Ensures({ + "activeThreads.size() < old(activeThreads.size())", + "! activeThreads.contains(thread)", + "countDownLatch.getCount() < old(countDownLatch.getCount())" + }) + @Override + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + + super.threadIsDone(thread); + + // remove the thread from the list of active activeThreads + if ( ! 
activeThreads.remove(thread) ) + throw new IllegalStateException("Thread " + thread + " not in list of active activeThreads"); + + // one less thread is live for those blocking on all activeThreads to be complete + countDownLatch.countDown(); + if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + } + + /** + * Create a new thread from this factory + * + * @param runnable + * @return + */ + @Override + @Ensures({ + "activeThreads.size() > old(activeThreads.size())", + "activeThreads.contains(result)", + "nThreadsCreated == old(nThreadsCreated) + 1" + }) + public synchronized Thread newThread(final Runnable runnable) { + if ( activeThreads.size() >= nThreadsToCreate) + throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); + + nThreadsCreated++; + final Thread myThread = new TrackingThread(runnable); + activeThreads.add(myThread); + return myThread; + } + + /** + * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete + */ + private class TrackingThread extends Thread { + private TrackingThread(Runnable runnable) { + super(runnable); + } + + @Override + public void run() { + super.run(); + threadIsDone(this); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java deleted file mode 100644 index a62501f08..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without 
restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package org.broadinstitute.sting.utils.threading; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.apache.log4j.Priority; -import org.broadinstitute.sting.utils.AutoFormattingTime; - -import java.lang.management.ManagementFactory; -import java.lang.management.ThreadInfo; -import java.lang.management.ThreadMXBean; -import java.util.ArrayList; -import java.util.EnumMap; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.TimeUnit; - -/** - * Create activeThreads, collecting statistics about their running state over time - * - * Uses a ThreadMXBean to capture info via ThreadInfo - * - * User: depristo - * Date: 8/14/12 - * Time: 8:47 AM - */ -@Invariant({ - "activeThreads.size() <= nThreadsToCreate", - "countDownLatch.getCount() <= nThreadsToCreate", - "nThreadsCreated <= nThreadsToCreate" -}) -public class StateMonitoringThreadFactory implements ThreadFactory { - 
protected static final boolean DEBUG = true; - private static Logger logger = Logger.getLogger(StateMonitoringThreadFactory.class); - - public enum State { - BLOCKING("blocking on synchronized data structure"), - WAITING("waiting on some other thread"), - USER_CPU("doing productive CPU work"), - WAITING_FOR_IO("waiting for I/O"); - - private final String userFriendlyName; - - private State(String userFriendlyName) { - this.userFriendlyName = userFriendlyName; - } - - public String getUserFriendlyName() { - return userFriendlyName; - } - } - - // todo -- it would be nice to not have to specify upfront the number of threads. - // todo -- can we dynamically increment countDownLatch? It seems not... - final int nThreadsToCreate; - final List activeThreads; - final EnumMap times = new EnumMap(State.class); - - int nThreadsCreated = 0; - - /** - * The bean used to get the thread info about blocked and waiting times - */ - final ThreadMXBean bean; - - /** - * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into - * times. Counts down from nThreadsToCreate to 0, at which point any code waiting - * on the final times is freed to run. - */ - final CountDownLatch countDownLatch; - - /** - * Create a new factory generating threads whose runtime and contention - * behavior is tracked in this factory. 
- * - * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete - * // TODO -- remove argument when we figure out how to implement this capability - */ - public StateMonitoringThreadFactory(final int nThreadsToCreate) { - if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); - - this.nThreadsToCreate = nThreadsToCreate; - activeThreads = new ArrayList(nThreadsToCreate); - - // initialize times to 0 - for ( final State state : State.values() ) - times.put(state, 0l); - - // get the bean, and start tracking - bean = ManagementFactory.getThreadMXBean(); - if ( bean.isThreadContentionMonitoringSupported() ) - bean.setThreadContentionMonitoringEnabled(true); - else - logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); - //bean.setThreadCpuTimeEnabled(true); - - if ( bean.isThreadCpuTimeSupported() ) - bean.setThreadCpuTimeEnabled(true); - else - logger.warn("Thread CPU monitoring not supported, we cannot track GATK multi-threaded efficiency"); - - countDownLatch = new CountDownLatch(nThreadsToCreate); - } - - /** - * Get the time spent in state across all threads created by this factory - * - * @param state to get information about - * @return the time in milliseconds - */ - @Ensures({"result >= 0"}) - public synchronized long getStateTime(final State state) { - return times.get(state); - } - - /** - * Get the total time spent in all states across all threads created by this factory - * - * @return the time in milliseconds - */ - @Ensures({"result >= 0"}) - public synchronized long getTotalTime() { - long total = 0; - for ( final long time : times.values() ) - total += time; - return total; - } - - /** - * Get the fraction of time spent in state across all threads created by this factory - * - * @return the fraction (0.0-1.0) of time spent in state over all state times of all threads - */ - @Ensures({"result 
>= 0.0", "result <= 1.0"}) - public synchronized double getStateFraction(final State state) { - return getStateTime(state) / (1.0 * Math.max(getTotalTime(), 1)); - } - - /** - * How many threads have been created by this factory so far? - * @return - */ - @Ensures("result >= 0") - public int getNThreadsCreated() { - return nThreadsCreated; - } - - /** - * Only useful for testing, so that we can wait for all of the threads in the factory to complete running - * - * @throws InterruptedException - */ - protected void waitForAllThreadsToComplete() throws InterruptedException { - countDownLatch.await(); - } - - @Override - public synchronized String toString() { - final StringBuilder b = new StringBuilder(); - - b.append("total ").append(getTotalTime()).append(" "); - for ( final State state : State.values() ) { - b.append(state).append(" ").append(getStateTime(state)).append(" "); - } - - return b.toString(); - } - - /** - * Print usage information about threads from this factory to logger - * with the INFO priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger) { - printUsageInformation(logger, Priority.INFO); - } - - /** - * Print usage information about threads from this factory to logger - * with the provided priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger, final Priority priority) { - logger.log(priority, "Number of activeThreads used: " + getNThreadsCreated()); - logger.log(priority, "Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); - for ( final State state : State.values() ) { - logger.log(priority, String.format(" Fraction of time spent %s is %.2f (%s)", - state.getUserFriendlyName(), - getStateFraction(state), - new AutoFormattingTime(getStateTime(state) / 1000.0))); - } - logger.log(priority, String.format("CPU efficiency : %.2f%% of time spent doing productive work", - getStateFraction(State.USER_CPU) * 100)); 
- logger.log(priority, String.format("I/O inefficiency: %.2f%% of time spent waiting on I/O", - getStateFraction(State.WAITING_FOR_IO) * 100)); - } - - /** - * Create a new thread from this factory - * - * @param runnable - * @return - */ - @Override - @Ensures({ - "activeThreads.size() > old(activeThreads.size())", - "activeThreads.contains(result)", - "nThreadsCreated == old(nThreadsCreated) + 1" - }) - public synchronized Thread newThread(final Runnable runnable) { - if ( activeThreads.size() >= nThreadsToCreate) - throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); - - nThreadsCreated++; - final Thread myThread = new TrackingThread(runnable); - activeThreads.add(myThread); - return myThread; - } - - /** - * Update the information about completed thread that ran for runtime in milliseconds - * - * This method updates all of the key timing and tracking information in the factory so that - * thread can be retired. After this call the factory shouldn't have a pointer to the thread any longer - * - * @param thread the thread whose information we are updating - */ - @Ensures({ - "activeThreads.size() < old(activeThreads.size())", - "! 
activeThreads.contains(thread)", - "getTotalTime() >= old(getTotalTime())", - "countDownLatch.getCount() < old(countDownLatch.getCount())" - }) - private synchronized void threadIsDone(final Thread thread) { - if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - if ( DEBUG ) logger.warn("UpdateThreadInfo called"); - - final long threadID = thread.getId(); - final ThreadInfo info = bean.getThreadInfo(thread.getId()); - final long totalTimeNano = bean.getThreadCpuTime(threadID); - final long userTimeNano = bean.getThreadUserTime(threadID); - final long systemTimeNano = totalTimeNano - userTimeNano; - final long userTimeInMilliseconds = nanoToMilli(userTimeNano); - final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); - - if ( info != null ) { - if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); - incTimes(State.BLOCKING, info.getBlockedTime()); - incTimes(State.WAITING, info.getWaitedTime()); - incTimes(State.USER_CPU, userTimeInMilliseconds); - incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); - } - - // remove the thread from the list of active activeThreads - if ( ! 
activeThreads.remove(thread) ) - throw new IllegalStateException("Thread " + thread + " not in list of active activeThreads"); - - // one less thread is live for those blocking on all activeThreads to be complete - countDownLatch.countDown(); - if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - } - - /** - * Helper function that increments the times counter by by for state - * - * @param state - * @param by - */ - @Requires({"state != null", "by >= 0"}) - @Ensures("getTotalTime() == old(getTotalTime()) + by") - private synchronized void incTimes(final State state, final long by) { - times.put(state, times.get(state) + by); - } - - private static long nanoToMilli(final long timeInNano) { - return TimeUnit.NANOSECONDS.toMillis(timeInNano); - } - - /** - * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete - */ - private class TrackingThread extends Thread { - private TrackingThread(Runnable runnable) { - super(runnable); - } - - @Override - public void run() { - super.run(); - threadIsDone(this); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java new file mode 100644 index 000000000..ef836a06d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -0,0 +1,206 @@ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.EnumMap; +import 
java.util.concurrent.TimeUnit; + +/** + * Uses an MXBean to monitor thread efficiency + * + * Once the monitor is created, calls to threadIsDone() can be used to add information + * about the efficiency of the provided thread to this monitor. + * + * Provides simple print() for displaying efficiency information to a logger + * + * User: depristo + * Date: 8/22/12 + * Time: 10:48 AM + */ +@Invariant({"nThreadsAnalyzed >= 0"}) +public class ThreadEfficiencyMonitor { + protected static final boolean DEBUG = false; + protected static Logger logger = Logger.getLogger(EfficiencyMonitoringThreadFactory.class); + final EnumMap times = new EnumMap(State.class); + + /** + * The number of threads we've included in our efficiency monitoring + */ + int nThreadsAnalyzed = 0; + + /** + * The bean used to get the thread info about blocked and waiting times + */ + final ThreadMXBean bean; + + public ThreadEfficiencyMonitor() { + bean = ManagementFactory.getThreadMXBean(); + + // get the bean, and start tracking + if ( bean.isThreadContentionMonitoringSupported() ) + bean.setThreadContentionMonitoringEnabled(true); + else + logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); + //bean.setThreadCpuTimeEnabled(true); + + if ( bean.isThreadCpuTimeSupported() ) + bean.setThreadCpuTimeEnabled(true); + else + logger.warn("Thread CPU monitoring not supported, we cannot track GATK multi-threaded efficiency"); + + // initialize times to 0 + for ( final State state : State.values() ) + times.put(state, 0l); + } + + private static long nanoToMilli(final long timeInNano) { + return TimeUnit.NANOSECONDS.toMillis(timeInNano); + } + + /** + * Get the time spent in state across all threads created by this factory + * + * @param state to get information about + * @return the time in milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getStateTime(final State state) { + return times.get(state); + } + + /** + * Get the total 
time spent in all states across all threads created by this factory + * + * @return the time in milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getTotalTime() { + long total = 0; + for ( final long time : times.values() ) + total += time; + return total; + } + + /** + * Get the fraction of time spent in state across all threads created by this factory + * + * @return the percentage (0.0-100.0) of time spent in state over all state times of all threads + */ + @Ensures({"result >= 0.0", "result <= 100.0"}) + public synchronized double getStatePercent(final State state) { + return (100.0 * getStateTime(state)) / Math.max(getTotalTime(), 1); + } + + public int getnThreadsAnalyzed() { + return nThreadsAnalyzed; + } + + @Override + public synchronized String toString() { + final StringBuilder b = new StringBuilder(); + + b.append("total ").append(getTotalTime()).append(" "); + for ( final State state : State.values() ) { + b.append(state).append(" ").append(getStateTime(state)).append(" "); + } + + return b.toString(); + } + + /** + * Print usage information about threads from this factory to logger + * with the INFO priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger) { + printUsageInformation(logger, Priority.INFO); + } + + /** + * Print usage information about threads from this factory to logger + * with the provided priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger, final Priority priority) { + logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); + logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + for ( final State state : State.values() ) { + logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); + } + logger.log(priority, String.format("CPU efficiency : %6.2f%% of time spent %s", 
getStatePercent(State.USER_CPU), State.USER_CPU.getUserFriendlyName())); + logger.log(priority, String.format("Walker inefficiency : %6.2f%% of time spent %s", getStatePercent(State.BLOCKING), State.BLOCKING.getUserFriendlyName())); + logger.log(priority, String.format("I/O inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING_FOR_IO), State.WAITING_FOR_IO.getUserFriendlyName())); + } + + /** + * Update the information about completed thread that ran for runtime in milliseconds + * + * This method updates all of the key timing and tracking information in the factory so that + * thread can be retired. After this call the factory shouldn't have a pointer to the thread any longer + * + * @param thread the thread whose information we are updating + */ + @Ensures({ + "getTotalTime() >= old(getTotalTime())" + }) + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn("UpdateThreadInfo called"); + + final long threadID = thread.getId(); + final ThreadInfo info = bean.getThreadInfo(thread.getId()); + final long totalTimeNano = bean.getThreadCpuTime(threadID); + final long userTimeNano = bean.getThreadUserTime(threadID); + final long systemTimeNano = totalTimeNano - userTimeNano; + final long userTimeInMilliseconds = nanoToMilli(userTimeNano); + final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); + + if ( info != null ) { + if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); + incTimes(State.BLOCKING, info.getBlockedTime()); + incTimes(State.WAITING, info.getWaitedTime()); + incTimes(State.USER_CPU, userTimeInMilliseconds); + incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); + } + } + + /** + * Helper function that increments the times counter by by for state + * + * @param state + * @param by + */ + 
@Requires({"state != null", "by >= 0"}) + @Ensures("getTotalTime() == old(getTotalTime()) + by") + private synchronized void incTimes(final State state, final long by) { + times.put(state, times.get(state) + by); + } + + public enum State { + BLOCKING("blocking on synchronized data structures"), + WAITING("waiting on some other thread"), + USER_CPU("doing productive CPU work"), + WAITING_FOR_IO("waiting for I/O"); + + private final String userFriendlyName; + + private State(String userFriendlyName) { + this.userFriendlyName = userFriendlyName; + } + + public String getUserFriendlyName() { + return userFriendlyName; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java index b41070a14..0b655873d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java @@ -47,24 +47,24 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { private class StateTest extends TestDataProvider { private final double TOLERANCE = 0.1; // willing to tolerate a 10% error - final List statesForThreads; + final List statesForThreads; - public StateTest(final List statesForThreads) { + public StateTest(final List statesForThreads) { super(StateTest.class); this.statesForThreads = statesForThreads; setName("StateTest " + Utils.join(",", statesForThreads)); } - public List getStatesForThreads() { + public List getStatesForThreads() { return statesForThreads; } public int getNStates() { return statesForThreads.size(); } - public double maxStateFraction(final StateMonitoringThreadFactory.State state) { return fraction(state) + TOLERANCE; } - public double minStateFraction(final StateMonitoringThreadFactory.State state) { return fraction(state) - TOLERANCE; 
} + public double maxStateFraction(final EfficiencyMonitoringThreadFactory.State state) { return fraction(state) + TOLERANCE; } + public double minStateFraction(final EfficiencyMonitoringThreadFactory.State state) { return fraction(state) - TOLERANCE; } - private double fraction(final StateMonitoringThreadFactory.State state) { + private double fraction(final EfficiencyMonitoringThreadFactory.State state) { return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); } } @@ -74,9 +74,9 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { * requested for input argument */ private static class StateTestThread implements Callable { - private final StateMonitoringThreadFactory.State stateToImplement; + private final EfficiencyMonitoringThreadFactory.State stateToImplement; - private StateTestThread(final StateMonitoringThreadFactory.State stateToImplement) { + private StateTestThread(final EfficiencyMonitoringThreadFactory.State stateToImplement) { this.stateToImplement = stateToImplement; } @@ -95,10 +95,10 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Thread.currentThread().sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); return 0.0; case BLOCKING: - if ( StateMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); synchronized (GLOBAL_LOCK) { // the GLOBAL_LOCK must be held by the unit test itself for this to properly block - if ( StateMonitoringThreadFactory.DEBUG ) logger.warn(" ... done blocking"); + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn(" ... 
done blocking"); } return 0.0; case WAITING_FOR_IO: @@ -114,10 +114,10 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { @DataProvider(name = "StateTest") public Object[][] createStateTest() { for ( final int nThreads : Arrays.asList(3) ) { - //final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.WAITING_FOR_IO); - final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.USER_CPU, StateMonitoringThreadFactory.State.WAITING, StateMonitoringThreadFactory.State.BLOCKING); - //final List allStates = Arrays.asList(StateMonitoringThreadFactory.State.values()); - for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.WAITING_FOR_IO); + final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.USER_CPU, EfficiencyMonitoringThreadFactory.State.WAITING, EfficiencyMonitoringThreadFactory.State.BLOCKING); + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.values()); + for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { //if ( Collections.frequency(states, Thread.State.BLOCKED) > 0) new StateTest(states); } @@ -129,13 +129,13 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "StateTest") public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking - final StateMonitoringThreadFactory factory = new StateMonitoringThreadFactory(test.getNStates()); + final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); final ExecutorService threadPool = Executors.newFixedThreadPool(test.getNStates(), factory); logger.warn("Running " + test); synchronized (GLOBAL_LOCK) { //logger.warn(" Have lock"); - for ( final StateMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) + for ( final 
EfficiencyMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) threadPool.submit(new StateTestThread(threadToRunState)); // lock has to be here for the whole running of the activeThreads but end before the sleep so the blocked activeThreads @@ -158,10 +158,10 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertTrue(totalTime >= minTime, "Factory results not properly accumulated: totalTime = " + totalTime + " < minTime = " + minTime); Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); - for (final StateMonitoringThreadFactory.State state : StateMonitoringThreadFactory.State.values() ) { + for (final EfficiencyMonitoringThreadFactory.State state : EfficiencyMonitoringThreadFactory.State.values() ) { final double min = test.minStateFraction(state); final double max = test.maxStateFraction(state); - final double obs = factory.getStateFraction(state); + final double obs = factory.getStatePercent(state); // logger.warn(" Checking " + state // + " min " + String.format("%.2f", min) // + " max " + String.format("%.2f", max) From 63af0cbcbab0560ac273ea1512beceb655d4c469 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 22 Aug 2012 16:45:53 -0400 Subject: [PATCH 010/161] Cleanup GATK efficiency monitor classes -- Invert logic in GATKArgumentCollection to disable monitoring, not enable. 
That means monitoring is on by default -- Fix testing error in unit tests -- Rename variables in ThreadAllocation to be clearer --- .../sting/gatk/GenomeAnalysisEngine.java | 2 +- .../gatk/arguments/GATKArgumentCollection.java | 10 +++++++--- .../sting/gatk/executive/MicroScheduler.java | 5 ++--- .../resourcemanagement/ThreadAllocation.java | 10 +++++----- .../EfficiencyMonitoringThreadFactory.java | 17 ++++++++--------- ...iciencyMonitoringThreadFactoryUnitTest.java} | 10 +++++----- 6 files changed, 28 insertions(+), 26 deletions(-) rename public/java/test/org/broadinstitute/sting/utils/threading/{StateMonitoringThreadFactoryUnitTest.java => EfficiencyMonitoringThreadFactoryUnitTest.java} (94%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9a9febb78..0d1c34ced 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -372,7 +372,7 @@ public class GenomeAnalysisEngine { else if(argCollection.numberOfIOThreads != null) numIOThreads = argCollection.numberOfIOThreads; - this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, argCollection.monitorThreads); + this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, ! 
argCollection.disableEfficiencyMonitor); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 6a14373f3..72cb5e02f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -283,9 +283,13 @@ public class GATKArgumentCollection { @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) public Integer numberOfThreads = 1; - /** Should we monitor threading efficiency? . */ - @Argument(fullName = "monitorThreads", shortName = "mt", doc = "Should we monitor the threading efficiency when running in multi-threaded mode?", required = false) - public Boolean monitorThreads = false; + /** + * By default the GATK monitors its own efficiency, but this can have an itsy-bitsy tiny + * cost (< 0.1%) in runtime because of turning on the JavaBean. This argument allows you + * to disable the monitor. + */ + @Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false) + public Boolean disableEfficiencyMonitor = false; /** * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 0abd75b65..b755cdd77 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.threading.EfficiencyMonitoringThreadFactory; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import javax.management.JMException; @@ -107,11 +106,11 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if(walker instanceof ReadWalker) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.shouldMonitorThreads()); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } else { if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.shouldMonitorThreads()); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.monitorThreadEfficiency()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index 07a45c0f9..caae55ac5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -43,7 +43,7 @@ public class ThreadAllocation { /** * Should we monitor thread efficiency? */ - private final boolean monitorThreads; + private final boolean monitorEfficiency; public int getNumCPUThreads() { return numCPUThreads; @@ -53,8 +53,8 @@ public class ThreadAllocation { return numIOThreads; } - public boolean shouldMonitorThreads() { - return monitorThreads; + public boolean monitorThreadEfficiency() { + return monitorEfficiency; } /** @@ -71,7 +71,7 @@ public class ThreadAllocation { * @param numCPUThreads Total number of threads allocated to the traversal. * @param numIOThreads Total number of threads allocated exclusively to IO. 
*/ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorThreads) { + public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorEfficiency) { // If no allocation information is present, allocate all threads to CPU if(numCPUThreads == null && numIOThreads == null) { this.numCPUThreads = totalThreads; @@ -98,6 +98,6 @@ public class ThreadAllocation { this.numIOThreads = numIOThreads; } - this.monitorThreads = monitorThreads; + this.monitorEfficiency = monitorEfficiency; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java index 51af08681..b30198608 100644 --- a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java +++ b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java @@ -99,9 +99,9 @@ public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor i } @Ensures({ - "activeThreads.size() < old(activeThreads.size())", + "activeThreads.size() <= old(activeThreads.size())", "! activeThreads.contains(thread)", - "countDownLatch.getCount() < old(countDownLatch.getCount())" + "countDownLatch.getCount() <= old(countDownLatch.getCount())" }) @Override public synchronized void threadIsDone(final Thread thread) { @@ -111,13 +111,12 @@ public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor i super.threadIsDone(thread); - // remove the thread from the list of active activeThreads - if ( ! 
activeThreads.remove(thread) ) - throw new IllegalStateException("Thread " + thread + " not in list of active activeThreads"); - - // one less thread is live for those blocking on all activeThreads to be complete - countDownLatch.countDown(); - if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + // remove the thread from the list of active activeThreads, if it's in there, and decrement the countdown latch + if ( activeThreads.remove(thread) ) { + // one less thread is live for those blocking on all activeThreads to be complete + countDownLatch.countDown(); + if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + } } /** diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java similarity index 94% rename from public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 0b655873d..35dc9754c 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -39,7 +39,7 @@ import java.util.concurrent.*; /** * Tests for the state monitoring thread factory. 
*/ -public class StateMonitoringThreadFactoryUnitTest extends BaseTest { +public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; final static Object GLOBAL_LOCK = new Object(); @@ -61,8 +61,8 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { public int getNStates() { return statesForThreads.size(); } - public double maxStateFraction(final EfficiencyMonitoringThreadFactory.State state) { return fraction(state) + TOLERANCE; } - public double minStateFraction(final EfficiencyMonitoringThreadFactory.State state) { return fraction(state) - TOLERANCE; } + public double maxStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) + TOLERANCE); } + public double minStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) - TOLERANCE); } private double fraction(final EfficiencyMonitoringThreadFactory.State state) { return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); @@ -159,8 +159,8 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); for (final EfficiencyMonitoringThreadFactory.State state : EfficiencyMonitoringThreadFactory.State.values() ) { - final double min = test.minStateFraction(state); - final double max = test.maxStateFraction(state); + final double min = test.minStatePercent(state); + final double max = test.maxStatePercent(state); final double obs = factory.getStatePercent(state); // logger.warn(" Checking " + state // + " min " + String.format("%.2f", min) From e5df91aa23c9a9cf92fc8a573f914fdd3439c979 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 22 Aug 2012 20:17:39 -0400 Subject: 
[PATCH 011/161] Looks like the @WalkerName annotation doesn't work with the GATK docs, so I'm renaming the walkers. --- ...rnateReference.java => FastaAlternateReferenceMaker.java} | 3 +-- .../fasta/{FastaReference.java => FastaReferenceMaker.java} | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/{FastaAlternateReference.java => FastaAlternateReferenceMaker.java} (98%) rename public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/{FastaReference.java => FastaReferenceMaker.java} (96%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 8fbd37e30..2b9744b89 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -76,10 +76,9 @@ import java.util.List; * */ @DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -@WalkerName("FastaAlternateReferenceMaker") @Reference(window=@Window(start=-1,stop=50)) @Requires(value={DataSource.REFERENCE}) -public class FastaAlternateReference extends FastaReference { +public class FastaAlternateReferenceMaker extends FastaReferenceMaker { /** * Variants from these input files are used by this tool to construct an alternate reference. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java index a835560d4..362867318 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java @@ -62,15 +62,14 @@ import java.io.PrintStream; *
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T FastaReference \
+ *   -T FastaReferenceMaker \
  *   -o output.fasta \
  *   -L input.intervals
  * 
* */ @DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -@WalkerName("FastaReferenceMaker") -public class FastaReference extends RefWalker, GenomeLoc> { +public class FastaReferenceMaker extends RefWalker, GenomeLoc> { @Output PrintStream out; From 0b735884dbd1150fc8c21a144ed7d160de9712cc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 23 Aug 2012 08:55:27 -0400 Subject: [PATCH 013/161] Cleanup code in VariantContext --- .../sting/utils/variantcontext/VariantContext.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 8015889f5..8da6d452e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.utils.variantcontext; -import org.apache.commons.math.stat.descriptive.rank.Max; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; @@ -179,7 +178,7 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final static boolean WARN_ABOUT_BAD_END = true; - private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150; + private final static int MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; From 857b11b26f9f2d5e78ee7565430b502962b564dc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 23 Aug 2012 09:59:37 -0400 Subject: [PATCH 014/161] Done with GSA-506: Add nt and efficiency information to GATKRunReport -- GATKRunReports contain itemized information about the numThreads used to execute the GATK, as well 
as the efficiency of the use of those threads to get real work done, including time spent running, waiting, blocking, and waiting for IO -- See https://jira.broadinstitute.org/browse/GSA-506 for more details --- .../sting/gatk/GenomeAnalysisEngine.java | 24 +++++++++---- .../sting/gatk/executive/MicroScheduler.java | 8 +++++ .../sting/gatk/phonehome/GATKRunReport.java | 34 +++++++++++++++++++ 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0d1c34ced..c8dbb090d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,7 +30,6 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; -import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -52,18 +51,14 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; -import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; +import 
org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; import java.util.*; /** @@ -175,6 +170,13 @@ public class GenomeAnalysisEngine { */ private Collection referenceMetaDataFiles; + /** + * The threading efficiency monitor we use in the GATK to monitor our efficiency. + * + * May be null if one isn't active, or hasn't be initialized yet + */ + private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + /** * Set the reference metadata files to use for this traversal. * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. @@ -252,6 +254,7 @@ public class GenomeAnalysisEngine { // our microscheduler, which is in charge of running everything MicroScheduler microScheduler = createMicroscheduler(); + threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); // create temp directories as necessary initializeTempDirectory(); @@ -1003,6 +1006,15 @@ public class GenomeAnalysisEngine { return readsDataSource == null ? 
null : readsDataSource.getCumulativeReadMetrics(); } + /** + * Return the global ThreadEfficiencyMonitor, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + // ------------------------------------------------------------------------------------- // // code for working with Samples database diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index b755cdd77..4becc5a78 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -158,6 +158,14 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } + /** + * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } /** * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 035252c14..6f3f175a2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import org.jets3t.service.S3Service; import 
org.jets3t.service.S3ServiceException; import org.jets3t.service.impl.rest.httpclient.RestS3Service; @@ -141,6 +142,21 @@ public class GATKRunReport { @Element(required = true, name = "tag") private String tag; + // ----------------------------------------------------------------- + // elements related to multi-threading and efficiency + // ----------------------------------------------------------------- + + @Element(required = true, name = "numThreads") + private int numThreads; + @Element(required = true, name = "percent_time_running") + private String percentTimeRunning; + @Element(required = true, name = "percent_time_waiting") + private String percentTimeWaiting; + @Element(required = true, name = "percent_time_blocking") + private String percentTimeBlocking; + @Element(required = true, name = "percent_time_waiting_for_io") + private String percentTimeWaitingForIO; + public enum PhoneHomeOption { /** Disable phone home */ NO_ET, @@ -201,12 +217,30 @@ public class GATKRunReport { // if there was an exception, capture it this.mException = e == null ? null : new ExceptionToXML(e); + + numThreads = engine.getArguments().numberOfThreads; + percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); + percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); + percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); + percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO); } public String getID() { return id; } + /** + * Return a string representing the percent of time the GATK spent in state, if possible. 
Otherwise return NA + * + * @param engine the GATK engine whose threading efficiency info we will use + * @param state the state whose occupancy we wish to know + * @return a string representation of the percent occupancy of state, or NA if not possible + */ + private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) { + final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor(); + return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state)); + } + + public void postReport(PhoneHomeOption type) { logger.debug("Posting report of type " + type); From f1166d6d0096a95e636da17be0a20c7245436cca Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 23 Aug 2012 11:43:19 -0700 Subject: [PATCH 016/161] Spotted a potential bug where sample IDs passed in from the meta data were only checked against the sample IDs in the VCF header if the input file happened to be a meta data file rather than a fam file. Added a check for fam files as well, and added an integration test to cover each case. --- .../variantutils/VariantsToBinaryPed.java | 6 +++++ .../VariantsToBinaryPedIntegrationTest.java | 25 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 7111bac46..2e6a80462 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -104,6 +104,12 @@ public class VariantsToBinaryPed extends RodWalker { String sex = mVals.containsKey("sex") ?
mVals.get("sex") : "3"; String pheno = mVals.get("phenotype"); outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); + } else { + // even if a fam file is input, we can't diverge the bed file from the fam file, which + // could lead to a malformed plink trio. Fail fast if there's any extra sample in the VCF. + if ( ! sampleMetaValues.containsKey(sample) ) { + throw new UserException("No metadata provided for sample "+sample); + } } try { File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index 07e82b869..a75da6cf9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -87,6 +87,31 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + + @Test + public void testFailFast() { + String testName = "testFailFast"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void testFailFastMeta() { + String testName = "testFailFastMeta"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.metadata.txt",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + + } } From 1999b95754a80e43d30ea1110f2c4acfb438cbe7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 23 Aug 2012 18:14:10 -0400 Subject: [PATCH 017/161] Work around for GSA-513: ClassCastException in VariantEval --- .../stratifications/AlleleCount.java | 22 ++++++++++++++----- 1 file changed, 16 
insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 2b1bd9c62..fbd6371f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -45,12 +45,22 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = 0; // by default, the site is considered monomorphic - if ( eval.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && eval.isBiallelic() ) { - // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) - AC = Math.min(eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0), nchrom); - } else if ( eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && eval.isBiallelic() ) { - AC = eval.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); - } else if ( eval.isVariant() ) { + try { + if ( eval.isBiallelic() ) { + if ( eval.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) ) { + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. 
but the exact model may arbitrarily choose an AC>1) + AC = Math.min(eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0), nchrom); + } else if ( eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + AC = eval.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); + } + } + } catch ( ClassCastException e ) { + // protect ourselves from bad inputs + // TODO -- fully decode VC + } + + if ( AC == 0 && eval.isVariant() ) { + // fall back to the direct calculation for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getCalledChrCount(allele)); } From 5f8574bd1594771cffe11f8fb5b2cc8a62a05b92 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 24 Aug 2012 10:48:41 -0400 Subject: [PATCH 021/161] Fixing typo in error message. --- public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 41ca58157..f8faa101b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -141,7 +141,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } if (!(this.contiguousP(that))) { - throw new ReviewedStingException("The two genome loc's need to be contigous"); + throw new ReviewedStingException("The two genome loc's need to be contiguous"); } return new GenomeLoc(getContig(), this.contigIndex, From 740520c23beba300c7894321e5391a9d0420dbba Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 24 Aug 2012 13:20:10 -0400 Subject: [PATCH 022/161] Fix BQSR docs --- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e45cad971..ea9d0976a 
100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -56,7 +56,7 @@ import java.lang.reflect.Constructor; import java.util.ArrayList; /** - * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). + * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). * *

* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating From b3fd74f0c4b02c13bdf9777ece3ac325960f7267 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 24 Aug 2012 13:25:05 -0400 Subject: [PATCH 023/161] HaplotypeCaller forbids BAQ --- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index acb5c9ebe..845fc68a6 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -28,8 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -40,10 +42,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.PartitionBy; -import 
org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.utils.*; @@ -103,6 +101,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) +@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @ActiveRegionExtension(extension=65, maxRegion=300) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { From 0545664f9173b823c30dbee1e1f3057d3c6c98ce Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 24 Aug 2012 13:45:48 -0400 Subject: [PATCH 024/161] Fix ClassCastException seen in Tableau errors --- .../gatk/walkers/annotator/VariantAnnotatorEngine.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index a1bd8dcbd..22ec5468f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -34,7 +34,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -218,7 +217,10 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map 
annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(perReadAlleleLikelihoodMap, vc); + if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) + continue; + + Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } From d6e6b30caf15d2f7f64fcc1f2b710b458507f7be Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 24 Aug 2012 14:07:44 -0400 Subject: [PATCH 025/161] Initial implementation of GSA-515: Nanoscheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit – Write general NanoScheduler framework in utils.threading. Test with reading via iterator from list of integers, map is int * 2, reduce is sum. Should be efficiency using resources to do sum of 2 * (sum(1 - X)). Done! CPU parallelism is nano threads. Pfor across read / map / reduce. Use work queue to implement. Create general read map reduce framework in utils. Test parallelism independently before hooking up to Locus iterator Represent explicitly the dependency graph. Scheduler should choose the work units that are ready for computation, that are marked as "completing a computation", and then finally that maximize the number of sequent available work units. May be worth measuring expected cost for read read / map / reduce unit and use it to balance the compute As input is single threaded just need one thread to populate inputs, which runs as fast as possible on parallel pushing data to fixed size queue. Each push creates map job and links to upcoming reduce job. 
Note that there's at most one thread for IO tasks, and all of the threads can contribute to CPU tasks --- .../utils/nanoScheduler/MapFunction.java | 12 ++ .../sting/utils/nanoScheduler/MapResult.java | 31 ++++ .../utils/nanoScheduler/NanoScheduler.java | 165 ++++++++++++++++++ .../utils/nanoScheduler/ReduceFunction.java | 13 ++ .../nanoScheduler/NanoSchedulerUnitTest.java | 93 ++++++++++ 5 files changed, 314 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java new file mode 100644 index 000000000..dd18e09a9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that maps from InputType -> ResultType + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface MapFunction { + public ResultType apply(final InputType input); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..90e7c5908 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,31 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. 
+ * User: depristo + * Date: 8/24/12 + * Time: 9:57 AM + * To change this template use File | Settings | File Templates. + */ +public class MapResult implements Comparable> { + final Integer id; + final MapType value; + + public MapResult(final int id, final MapType value) { + this.id = id; + this.value = value; + } + + public Integer getId() { + return id; + } + + public MapType getValue() { + return value; + } + + @Override + public int compareTo(MapResult o) { + return getId().compareTo(o.getId()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java new file mode 100644 index 000000000..48a941515 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -0,0 +1,165 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.*; + +/** + * Framework for very fine grained MapReduce parallelism + * + * User: depristo + * Date: 8/24/12 + * Time: 9:47 AM + */ +public class NanoScheduler { + final int bufferSize; + final int nThreads; + final Iterator inputReader; + final MapFunction map; + final ReduceFunction reduce; + + public NanoScheduler(final int bufferSize, + final int nThreads, + final Iterator inputReader, + final MapFunction map, + final ReduceFunction reduce) { + if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); + if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); + + this.bufferSize = bufferSize; + this.inputReader = inputReader; + this.map = map; + this.reduce = reduce; + this.nThreads = 
nThreads; + } + + public int getnThreads() { + return nThreads; + } + + private int getBufferSize() { + return bufferSize; + } + + public ReduceType execute() { + if ( getnThreads() == 1 ) { + return executeSingleThreaded(); + } else { + return executeMultiThreaded(); + } + } + + /** + * Simple efficient reference implementation for single threaded execution + * @return the reduce result of this map/reduce job + */ + private ReduceType executeSingleThreaded() { + ReduceType sum = reduce.init(); + while ( inputReader.hasNext() ) { + final InputType input = inputReader.next(); + final MapType mapValue = map.apply(input); + sum = reduce.apply(mapValue, sum); + } + return sum; + } + + /** + * Efficient parallel version of Map/Reduce + * + * @return the reduce result of this map/reduce job + */ + private ReduceType executeMultiThreaded() { + final ExecutorService executor = Executors.newFixedThreadPool(getnThreads() - 1); + + ReduceType sum = reduce.init(); + while ( inputReader.hasNext() ) { + try { + // read in our input values + final Queue inputs = readInputs(); + + // send jobs for map + final Queue> mapQueue = submitMapJobs(executor, inputs); + + // send off the reduce job, and block until we get at least one reduce result + sum = reduceParallel(mapQueue, sum); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + final List remaining = executor.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new ReviewedStingException("Remaining tasks found in the executor, unexpected behavior!"); + + return sum; + } + + @Requires("! 
mapQueue.isEmpty()") + private ReduceType reduceParallel(final Queue> mapQueue, final ReduceType initSum) + throws InterruptedException, ExecutionException { + ReduceType sum = initSum; + + // while mapQueue has something in it to reduce + for ( final Future future : mapQueue ) { + // block until we get the value for this task + final MapType value = future.get(); + sum = reduce.apply(value, sum); + } + + return sum; + } + + /** + * Read up to inputBufferSize elements from inputReader + * + * @return a queue of inputs read in, containing one or more values of InputType read in + */ + @Requires("inputReader.hasNext()") + @Ensures("!result.isEmpty()") + private Queue readInputs() { + int n = 0; + final Queue inputs = new LinkedList(); + while ( inputReader.hasNext() && n < getBufferSize() ) { + final InputType input = inputReader.next(); + inputs.add(input); + n++; + } + return inputs; + } + + @Ensures("result.size() == inputs.size()") + private Queue> submitMapJobs(final ExecutorService executor, final Queue inputs) { + final Queue> mapQueue = new LinkedList>(); + + for ( final InputType input : inputs ) { + final CallableMap doMap = new CallableMap(input); + final Future future = executor.submit(doMap); + mapQueue.add(future); + } + + return mapQueue; + } + + /** + * A simple callable version of the map function for use with the executor pool + */ + private class CallableMap implements Callable { + final InputType input; + + private CallableMap(final InputType input) { + this.input = input; + } + + @Override public MapType call() throws Exception { + return map.apply(input); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java new file mode 100644 index 000000000..274e22aff --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java @@ -0,0 +1,13 @@ +package 
org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that maps from InputType -> ResultType + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface ReduceFunction { + public ReduceType init(); + public ReduceType apply(MapType one, ReduceType sum); +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java new file mode 100644 index 000000000..18a9f3340 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -0,0 +1,93 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * UnitTests for the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class NanoSchedulerUnitTest extends BaseTest { + private class Map2x implements MapFunction { + @Override public Integer apply(Integer input) { return input * 2; } + } + + private class ReduceSum implements ReduceFunction { + @Override public Integer init() { return 0; } + @Override public Integer apply(Integer one, Integer sum) { return one + sum; } + } + + private static int sum2x(final int start, final int end) { + int sum = 0; + for ( int i = start; i < end; i++ ) + sum += 2 * i; + return sum; + } + + private class NanoSchedulerBasicTest extends TestDataProvider { + final int bufferSize, nThreads, start, end, expectedResult; + + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { + super(NanoSchedulerBasicTest.class); + this.bufferSize = bufferSize; + this.nThreads = nThreads; + this.start = start; + this.end = end; + this.expectedResult = sum2x(start, end); + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); + } + + public Iterator makeReader() { + final List ints = new ArrayList(); + for ( int i = start; i < end; i++ ) + ints.add(i); + return ints.iterator(); + } + + public Map2x makeMap() { return new Map2x(); } + public ReduceSum makeReduce() { return new ReduceSum(); } + } + + @DataProvider(name = "NanoSchedulerBasicTest") + public Object[][] createNanoSchedulerBasicTest() { + for ( final int bufferSize : Arrays.asList(1, 10, 10000, 1000000) ) { + for ( final int nt : Arrays.asList(1, 2, 4, 8, 16, 32) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(1, 2, 11, 1000000) ) { + new NanoSchedulerBasicTest(bufferSize, nt, start, end); + } + } + } + } + + return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 2000) + public void testNanoSchedulerBasicTest(final 
NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.nThreads, + test.makeReader(), test.makeMap(), test.makeReduce()); + final Integer sum = nanoScheduler.execute(); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 10000, dependsOnMethods = "testNanoSchedulerBasicTest") + public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + for ( int i = 0; i < 10; i++ ) { + testNanoSchedulerBasicTest(test); + } + } +} From 752f44c332a5f76f512b5190a6529a9ee973dae3 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Fri, 24 Aug 2012 12:25:11 -0700 Subject: [PATCH 026/161] Code cleanup in MVLR and SelectVariants. Should fix JIRA GSA-509 and GSA-510 --- .../walkers/annotator/MVLikelihoodRatio.java | 106 ++++++++++++------ .../walkers/variantutils/SelectVariants.java | 9 +- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index 8aa961c75..a2a39da1f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; -import 
org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -21,21 +20,17 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 9/14/11 - * Time: 12:24 PM + * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation + * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is + * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than + * the strict 1-∏(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. */ public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; + public static final String MVLR_KEY = "MVLR"; private Set trios; - private class Trio { - String motherId; - String fatherId; - String childId; - } public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -44,7 +39,8 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { if ( mendelianViolation == null ) { - if (checkAndSetSamples(((Walker) walker).getSampleDB())) { + trios = checkAndSetSamples(((Walker) walker).getSampleDB()); + if ( trios.size() > 0 ) { mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { @@ -52,15 +48,12 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment } } - Map toRet = new HashMap(1); + Map attributeMap = new HashMap(1); //double pNoMV = 1.0; double maxMVLR = 
Double.MIN_VALUE; for ( Trio trio : trios ) { - boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() && - vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() && - vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) { - Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId); + if ( contextHasTrioLikelihoods(vc,trio) ) { + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.childId); maxMVLR = likR > maxMVLR ? likR : maxMVLR; //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); } @@ -68,34 +61,79 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment //double pSomeMV = 1.0-pNoMV; //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); - toRet.put("MVLR",maxMVLR); - return toRet; + if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 ) + attributeMap.put(MVLR_KEY,maxMVLR); + return attributeMap; } // return the descriptions used for the VCF INFO meta field - public List getKeyNames() { return Arrays.asList("MVLR"); } + public List getKeyNames() { return Arrays.asList(MVLR_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } - private boolean checkAndSetSamples(SampleDB db){ - trios = new HashSet(); - Set families = db.getFamilyIDs(); - for ( String familyString : families ) { + private Set checkAndSetSamples(SampleDB db){ + Set trioSet = new HashSet(); + for ( String familyString : db.getFamilyIDs() ) { Set family = db.getFamily(familyString); - Iterator sampleIterator = 
family.iterator(); - Sample sample; - for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) { + for ( Sample sample : family) { if ( sample.getParents().size() == 2 ) { - Trio trio = new Trio(); - trio.childId = sample.getID(); - trio.fatherId = sample.getFather().getID(); - trio.motherId = sample.getMother().getID(); - trios.add(trio); + Trio trio = new Trio(sample.getMaternalID(),sample.getPaternalID(),sample.getID()); + trioSet.add(trio); } } } - return trios.size() > 0; + return trioSet; } + private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) { + for ( String sample : trio ) { + if ( ! context.hasGenotype(sample) ) + return false; + if ( ! context.getGenotype(sample).hasLikelihoods() ) + return false; + } + + return true; + } + + private class Trio implements Iterable { + private String maternalID; + private String paternalID; + private String childId; + + public Trio(String mom, String dad, String child) { + this.maternalID = mom; + this.paternalID = dad; + this.childId = child; + } + + public String getMaternalID() { + return this.maternalID; + } + + public String getPaternalID() { + return this.paternalID; + } + + public String getChildId() { + return this.childId; + } + + public void setMaternalID(String id) { + this.maternalID = id; + } + + public void setPaternalID(String id) { + this.paternalID = id; + } + + public void setChildId(String id) { + this.childId = id; + } + + public Iterator iterator() { + return Arrays.asList(maternalID,paternalID,childId).iterator(); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index fc29a7f02..567262756 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -579,14 
+579,9 @@ public class SelectVariants extends RodWalker implements TreeR } private boolean badIndelSize(final VariantContext vc) { - if ( vc.getReference().length() > maxIndelSize ) { - return true; - } - - for ( Allele a : vc.getAlternateAlleles() ) { - if ( a.length() > maxIndelSize ) { + for ( Integer indelLength : vc.getIndelLengths() ) { + if ( indelLength > maxIndelSize ) return true; - } } return false; From 9de8077eebe9f1ceef2caa8da8170db35acc6692 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 24 Aug 2012 15:34:23 -0400 Subject: [PATCH 027/161] Working (efficient?) implementation of NanoScheduler -- Groups inputs for each thread so that we don't have one thread execution per map() call -- Added shutdown function -- Documentation everywhere -- Code cleanup -- Extensive unittests -- At this point I'm ready to integrate it into the engine for CPU parallel read walkers --- .../org/broadinstitute/sting/utils/Utils.java | 21 ++ .../utils/nanoScheduler/MapFunction.java | 7 + .../sting/utils/nanoScheduler/MapResult.java | 31 --- .../utils/nanoScheduler/NanoScheduler.java | 206 ++++++++++++++---- .../utils/nanoScheduler/ReduceFunction.java | 9 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 93 ++++++-- 6 files changed, 265 insertions(+), 102 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index a5b5eca6a..74b038032 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -810,4 +810,25 @@ public class Utils { return Collections.unmodifiableMap(map); } + /** + * Divides the input list into a list of sublists, which contains group size elements (except potentially the last one) + * + * list = [A, B, C, D, E] + * groupSize = 2 + * result = [[A, B], [C, D], [E]] + * + * @param list + * @param 
groupSize + * @return + */ + public static List> groupList(final List list, final int groupSize) { + if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); + + final List> subLists = new LinkedList>(); + int n = list.size(); + for ( int i = 0; i < n; i += groupSize ) { + subLists.add(list.subList(i, Math.min(i + groupSize, n))); + } + return subLists; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java index dd18e09a9..440c263b7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java @@ -3,10 +3,17 @@ package org.broadinstitute.sting.utils.nanoScheduler; /** * A function that maps from InputType -> ResultType * + * For use with the NanoScheduler + * * User: depristo * Date: 8/24/12 * Time: 9:49 AM */ public interface MapFunction { + /** + * Return function on input, returning a value of ResultType + * @param input + * @return + */ public ResultType apply(final InputType input); } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java deleted file mode 100644 index 90e7c5908..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.utils.nanoScheduler; - -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 8/24/12 - * Time: 9:57 AM - * To change this template use File | Settings | File Templates. 
- */ -public class MapResult implements Comparable> { - final Integer id; - final MapType value; - - public MapResult(final int id, final MapType value) { - this.id = id; - this.value = value; - } - - public Integer getId() { - return id; - } - - public MapType getValue() { - return value; - } - - @Override - public int compareTo(MapResult o) { - return getId().compareTo(o.getId()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 48a941515..fcc6a5723 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -13,45 +15,147 @@ import java.util.concurrent.*; /** * Framework for very fine grained MapReduce parallelism * + * The overall framework works like this + * + * nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads) + * List[Input] outerData : outerDataLoop ) + * result = nano.execute(outerData.iterator(), map, reduce) + * + * bufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to bufferSize in memory at one time, as well + * as up to inputBufferSize map results as well. + * + * numberOfMapElementsToProcessTogether determines how many input elements are processed + * together each thread cycle. For example, if this value is 10, then the input data + * is grouped together in units of 10 elements each, and map called on each in term. 
The more + * heavy-weight the map function is, in terms of CPU costs, the more it makes sense to + * have this number be small. The lighter the CPU cost per element, though, the more this + * parameter introduces overhead due to need to context switch among threads to process + * each input element. A value of -1 lets the nanoscheduler guess at a reasonable trade-off value. + * + * nThreads is a bit obvious yes? Note though that the nanoscheduler assumes that it gets 1 thread + * from its client during the execute call, as this call blocks until all work is done. The caller + * thread is put to work by execute to help with the processing of the data. So in reality the + * nanoScheduler only spawn nThreads - 1 additional workers (if this is > 1). + * * User: depristo * Date: 8/24/12 * Time: 9:47 AM */ public class NanoScheduler { - final int bufferSize; - final int nThreads; - final Iterator inputReader; - final MapFunction map; - final ReduceFunction reduce; + private static Logger logger = Logger.getLogger(NanoScheduler.class); + final int bufferSize; + final int mapGroupSize; + final int nThreads; + final ExecutorService executor; + boolean shutdown = false; + + /** + * Create a new nanoschedule with the desire characteristics requested by the argument + * + * @param bufferSize the number of input elements to read in each scheduling cycle. + * @param mapGroupSize How many inputs should be grouped together per map? 
If -1 we make a reasonable guess + * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute + */ public NanoScheduler(final int bufferSize, - final int nThreads, - final Iterator inputReader, - final MapFunction map, - final ReduceFunction reduce) { + final int mapGroupSize, + final int nThreads) { if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); + if ( mapGroupSize > bufferSize ) throw new IllegalArgumentException("mapGroupSize " + mapGroupSize + " must be <= bufferSize " + bufferSize); + if ( mapGroupSize == 0 || mapGroupSize < -1 ) throw new IllegalArgumentException("mapGroupSize cannot be <= 0" + mapGroupSize); + this.bufferSize = bufferSize; - this.inputReader = inputReader; - this.map = map; - this.reduce = reduce; this.nThreads = nThreads; + + if ( mapGroupSize == -1 ) { + this.mapGroupSize = (int)Math.ceil(this.bufferSize / (10.0*this.nThreads)); + logger.info(String.format("Dynamically setting grouping size to %d based on buffer size %d and n threads %d", + this.mapGroupSize, this.bufferSize, this.nThreads)); + } else { + this.mapGroupSize = mapGroupSize; + } + + this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads - 1); } + /** + * The number of parallel map threads in use with this NanoScheduler + * @return + */ public int getnThreads() { return nThreads; } - private int getBufferSize() { + /** + * The input buffer size used by this NanoScheduler + * @return + */ + public int getBufferSize() { return bufferSize; } - public ReduceType execute() { + /** + * The grouping size used by this NanoScheduler + * @return + */ + public int getMapGroupSize() { + return mapGroupSize; + } + + /** + * Tells this nanoScheduler to shutdown immediately, releasing all its resources. 
+ * + * After this call, execute cannot be invoked without throwing an error + */ + public void shutdown() { + if ( executor != null ) { + final List remaining = executor.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!"); + } + shutdown = true; + } + + /** + * @return true if this nanoScheduler is shutdown, or false if its still open for business + */ + public boolean isShutdown() { + return shutdown; + } + + /** + * Execute a map/reduce job with this nanoScheduler + * + * Data comes from inputReader. Will be read until hasNext() == false. + * map is called on each element provided by inputReader. No order of operations is guarenteed + * reduce is called in order of the input data provided by inputReader on the result of map() applied + * to each element. + * + * Note that the caller thread is put to work with this function call. The call doesn't return + * until all elements have been processes. + * + * It is safe to call this function repeatedly on a single nanoScheduler, at least until the + * shutdown method is called. 
+ * + * @param inputReader + * @param map + * @param reduce + * @return + */ + public ReduceType execute(final Iterator inputReader, + final MapFunction map, + final ReduceType initialValue, + final ReduceFunction reduce) { + if ( isShutdown() ) + throw new IllegalStateException("execute called on already shutdown NanoScheduler"); + if ( getnThreads() == 1 ) { - return executeSingleThreaded(); + return executeSingleThreaded(inputReader, map, initialValue, reduce); } else { - return executeMultiThreaded(); + return executeMultiThreaded(inputReader, map, initialValue, reduce); } } @@ -59,8 +163,11 @@ public class NanoScheduler { * Simple efficient reference implementation for single threaded execution * @return the reduce result of this map/reduce job */ - private ReduceType executeSingleThreaded() { - ReduceType sum = reduce.init(); + private ReduceType executeSingleThreaded(final Iterator inputReader, + final MapFunction map, + final ReduceType initialValue, + final ReduceFunction reduce) { + ReduceType sum = initialValue; while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); final MapType mapValue = map.apply(input); @@ -74,20 +181,21 @@ public class NanoScheduler { * * @return the reduce result of this map/reduce job */ - private ReduceType executeMultiThreaded() { - final ExecutorService executor = Executors.newFixedThreadPool(getnThreads() - 1); - - ReduceType sum = reduce.init(); + private ReduceType executeMultiThreaded(final Iterator inputReader, + final MapFunction map, + final ReduceType initialValue, + final ReduceFunction reduce) { + ReduceType sum = initialValue; while ( inputReader.hasNext() ) { try { // read in our input values - final Queue inputs = readInputs(); + final List inputs = readInputs(inputReader); // send jobs for map - final Queue> mapQueue = submitMapJobs(executor, inputs); + final Queue>> mapQueue = submitMapJobs(map, executor, inputs); // send off the reduce job, and block until we get at least one reduce 
result - sum = reduceParallel(mapQueue, sum); + sum = reduceParallel(reduce, mapQueue, sum); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -95,23 +203,20 @@ public class NanoScheduler { } } - final List remaining = executor.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new ReviewedStingException("Remaining tasks found in the executor, unexpected behavior!"); - return sum; } @Requires("! mapQueue.isEmpty()") - private ReduceType reduceParallel(final Queue> mapQueue, final ReduceType initSum) + private ReduceType reduceParallel(final ReduceFunction reduce, + final Queue>> mapQueue, + final ReduceType initSum) throws InterruptedException, ExecutionException { ReduceType sum = initSum; // while mapQueue has something in it to reduce - for ( final Future future : mapQueue ) { - // block until we get the value for this task - final MapType value = future.get(); - sum = reduce.apply(value, sum); + for ( final Future> future : mapQueue ) { + for ( final MapType value : future.get() ) // block until we get the values for this task + sum = reduce.apply(value, sum); } return sum; @@ -124,9 +229,9 @@ public class NanoScheduler { */ @Requires("inputReader.hasNext()") @Ensures("!result.isEmpty()") - private Queue readInputs() { + private List readInputs(final Iterator inputReader) { int n = 0; - final Queue inputs = new LinkedList(); + final List inputs = new LinkedList(); while ( inputReader.hasNext() && n < getBufferSize() ) { final InputType input = inputReader.next(); inputs.add(input); @@ -136,12 +241,14 @@ public class NanoScheduler { } @Ensures("result.size() == inputs.size()") - private Queue> submitMapJobs(final ExecutorService executor, final Queue inputs) { - final Queue> mapQueue = new LinkedList>(); + private Queue>> submitMapJobs(final MapFunction map, + final ExecutorService executor, + final List inputs) { + final Queue>> mapQueue = new LinkedList>>(); - for ( 
final InputType input : inputs ) { - final CallableMap doMap = new CallableMap(input); - final Future future = executor.submit(doMap); + for ( final List subinputs : Utils.groupList(inputs, getMapGroupSize()) ) { + final CallableMap doMap = new CallableMap(map, subinputs); + final Future> future = executor.submit(doMap); mapQueue.add(future); } @@ -151,15 +258,20 @@ public class NanoScheduler { /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable { - final InputType input; + private class CallableMap implements Callable> { + final List inputs; + final MapFunction map; - private CallableMap(final InputType input) { - this.input = input; + private CallableMap(final MapFunction map, final List inputs) { + this.inputs = inputs; + this.map = map; } - @Override public MapType call() throws Exception { - return map.apply(input); + @Override public List call() throws Exception { + final List outputs = new LinkedList(); + for ( final InputType input : inputs ) + outputs.add(map.apply(input)); + return outputs; } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java index 274e22aff..8f1b0eddd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java @@ -1,13 +1,18 @@ package org.broadinstitute.sting.utils.nanoScheduler; /** - * A function that maps from InputType -> ResultType + * A function that combines a value of MapType with an existing ReduceValue into a new ResultType * * User: depristo * Date: 8/24/12 * Time: 9:49 AM */ public interface ReduceFunction { - public ReduceType init(); + /** + * Combine one with sum into a new ReduceType + * @param one the result of a map call on an input element + * @param sum the cumulative reduce result over all 
previous map calls + * @return + */ public ReduceType apply(MapType one, ReduceType sum); } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 18a9f3340..211e43dc1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -21,7 +21,6 @@ public class NanoSchedulerUnitTest extends BaseTest { } private class ReduceSum implements ReduceFunction { - @Override public Integer init() { return 0; } @Override public Integer apply(Integer one, Integer sum) { return one + sum; } } @@ -33,17 +32,18 @@ public class NanoSchedulerUnitTest extends BaseTest { } private class NanoSchedulerBasicTest extends TestDataProvider { - final int bufferSize, nThreads, start, end, expectedResult; + final int bufferSize, mapGroupSize, nThreads, start, end, expectedResult; - public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { + public NanoSchedulerBasicTest(final int bufferSize, final int mapGroupSize, final int nThreads, final int start, final int end) { super(NanoSchedulerBasicTest.class); this.bufferSize = bufferSize; + this.mapGroupSize = mapGroupSize; this.nThreads = nThreads; this.start = start; this.end = end; this.expectedResult = sum2x(start, end); - setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", - getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); + setName(String.format("%s nt=%d buf=%d mapGroupSize=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, mapGroupSize, start, end, expectedResult)); } public Iterator makeReader() { @@ -54,16 +54,22 @@ public class NanoSchedulerUnitTest extends BaseTest { } public Map2x makeMap() { return new Map2x(); } + public Integer initReduce() 
{ return 0; } public ReduceSum makeReduce() { return new ReduceSum(); } } + static NanoSchedulerBasicTest exampleTest = null; @DataProvider(name = "NanoSchedulerBasicTest") public Object[][] createNanoSchedulerBasicTest() { - for ( final int bufferSize : Arrays.asList(1, 10, 10000, 1000000) ) { - for ( final int nt : Arrays.asList(1, 2, 4, 8, 16, 32) ) { - for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 1000000) ) { - new NanoSchedulerBasicTest(bufferSize, nt, start, end); + for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { + for ( final int mapGroupSize : Arrays.asList(-1, 1, 10, 100, 1000) ) { + if ( mapGroupSize <= bufferSize ) { + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { + exampleTest = new NanoSchedulerBasicTest(bufferSize, mapGroupSize, nt, start, end); + } + } } } } @@ -72,22 +78,65 @@ public class NanoSchedulerUnitTest extends BaseTest { return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 2000) - public void testNanoSchedulerBasicTest(final NanoSchedulerBasicTest test) throws InterruptedException { + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest") + public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); - final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.nThreads, - test.makeReader(), test.makeMap(), test.makeReduce()); - final Integer sum = nanoScheduler.execute(); - Assert.assertNotNull(sum); - Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + if ( test.nThreads == 1 ) + testNanoScheduler(test); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 10000, dependsOnMethods 
= "testNanoSchedulerBasicTest") - public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 10000, dependsOnMethods = "testSingleThreadedNanoScheduler") + public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); - for ( int i = 0; i < 10; i++ ) { - testNanoSchedulerBasicTest(test); + if ( test.nThreads >= 1 ) + testNanoScheduler(test); + } + + private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + + Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); + Assert.assertTrue(nanoScheduler.getMapGroupSize() >= test.mapGroupSize, "mapGroupSize argument"); + Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + nanoScheduler.shutdown(); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler") + public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { + if ( test.bufferSize > 1 && (test.mapGroupSize > 1 || test.mapGroupSize == -1)) { + logger.warn("Running " + test); + + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + + // test reusing the scheduler + for ( int i = 0; i < 10; i++ ) { + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + 
Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + } + + nanoScheduler.shutdown(); } } + + @Test() + public void testShutdown() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); + nanoScheduler.shutdown(); + Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testShutdownExecuteFailure() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + nanoScheduler.shutdown(); + nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); + } } From 0996bbd5485493e6211c1806bea1e597f3278962 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 24 Aug 2012 16:04:58 -0400 Subject: [PATCH 032/161] Comments for Chris on cleanup --- .../sting/gatk/walkers/annotator/MVLikelihoodRatio.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index a2a39da1f..d6cf50522 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -71,6 +71,7 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + // todo - this entire function should be in samples DB private Set checkAndSetSamples(SampleDB db){ Set trioSet = new HashSet(); for ( String familyString : db.getFamilyIDs() ) { @@ -97,6 +98,10 @@ 
public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment return true; } + // TODO -- this class is too much. + // TODO -- Why iterable? + // TODO -- shuoldn't this be in samplesDB() so you can just called samplesDB().getTrios() + // TODO -- should just have final string IDs, and getters, no setters private class Trio implements Iterable { private String maternalID; private String paternalID; From b59948709f176722228bd7c4e4ed3920189a6982 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Sat, 25 Aug 2012 08:48:27 -0700 Subject: [PATCH 033/161] Code improvements re: JIRA GSA-510. Trio class migrated into the Samples package - because the trio structure is so ubiquitously used, it makes sense, I think, to have a class which imposes the structure on the samples. Existing functions which slightly duplicated the getTrios() method look like they have bugs. These functions are now deprecated. A number of functions int he sampleDB looked to be assuming that samples could not share IDs (e.g. sample IDs are unique, so a sample present in two families could not be represented by multiple Sample objects). Added an assertion in the SampleDBBuilder to document/test this assumption. MVLikelihoodRatio now uses the trio methods from SampleDB. 
--- .../sting/gatk/samples/SampleDB.java | 65 ++++++++++++++++++ .../sting/gatk/samples/SampleDBBuilder.java | 13 +++- .../sting/gatk/samples/Trio.java | 45 +++++++++++++ .../walkers/annotator/MVLikelihoodRatio.java | 67 ++----------------- 4 files changed, 124 insertions(+), 66 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 31149cd8a..3de85028f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -168,13 +168,70 @@ public class SampleDB { return families; } + /** + * Returns all the trios present in the sample database. The strictOneChild parameter determines + * whether multiple children of the same parents resolve to multiple trios, or are excluded + * @param strictOneChild - exclude pedigrees with >1 child for parental pair + * @return - all of the mother+father=child triplets, subject to strictOneChild + */ + public final Set getTrios(boolean strictOneChild) { + Set trioSet = new HashSet(); + for ( String familyString : getFamilyIDs() ) { + Set family = getFamily(familyString); + for ( Sample sample : family) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(sample.getMother(),sample.getFather(),sample); + trioSet.add(trio); + } + } + } + + if ( strictOneChild ) + trioSet = removeTriosWithSameParents(trioSet); + + return trioSet; + } + + /** + * Returns all the trios present in the db. See getTrios(boolean strictOneChild) + * @return all the trios present in the samples db. + */ + public final Set getTrios() { + return getTrios(false); + } + + /** + * Subsets a set of trios to only those with nonmatching founders. If two (or more) trio objects have + * the same mother and father, then both (all) are removed from the returned set. 
+ * @param trios - a set of Trio objects + * @return those subset of Trio objects in the input set with nonmatching founders + */ + private Set removeTriosWithSameParents(final Set trios) { + Set filteredTrios = new HashSet(); + filteredTrios.addAll(trios); + Set triosWithSameParents = new HashSet(); + for ( Trio referenceTrio : filteredTrios ) { + for ( Trio compareTrio : filteredTrios ) { + if ( referenceTrio != compareTrio && + referenceTrio.getFather().equals(compareTrio.getFather()) && + referenceTrio.getMother().equals(compareTrio.getMother()) ) { + triosWithSameParents.add(referenceTrio); + triosWithSameParents.add(compareTrio); + } + } + } + filteredTrios.removeAll(triosWithSameParents); + return filteredTrios; + } /** * Returns the set of all children that have both of their parents. * Note that if a family is composed of more than 1 child, each child is * returned. * @return - all the children that have both of their parents + * @deprecated - getTrios() replaces this function */ + @Deprecated public final Set getChildrenWithParents(){ return getChildrenWithParents(false); } @@ -188,7 +245,15 @@ public class SampleDB { * * @param triosOnly - if set to true, only strict trios are returned * @return - all the children that have both of their parents + * @deprecated - getTrios(boolean strict) replaces this function + * @bug -- does not work for extracting multiple generations of trios, e.g. 
+ * ..........Mom1------Dad1 + * ................| + * ..............Child1--------Mom2 + * .......................| + * .....................Child2 */ + @Deprecated public final Set getChildrenWithParents(boolean triosOnly) { Map> families = getFamilies(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index 44a8600b0..612e342db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -135,9 +135,8 @@ public class SampleDBBuilder { // -------------------------------------------------------------------------------- protected final void validate() { - if ( validationStrictness == PedigreeValidationType.SILENT ) - return; - else { + validatePedigreeIDUniqueness(); + if ( validationStrictness != PedigreeValidationType.SILENT ) { // check that samples in data sources are all annotated, if anything is annotated if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { final Set sampleNamesFromPedigrees = new HashSet(); @@ -150,4 +149,12 @@ public class SampleDBBuilder { } } } + + private void validatePedigreeIDUniqueness() { + Set pedigreeIDs = new HashSet(); + for ( Sample sample : samplesFromPedigrees ) { + pedigreeIDs.add(sample.getID()); + } + assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. 
Is a sample associated with multiple families?"; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java new file mode 100644 index 000000000..314baad3d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.samples; + +/** + * A class for imposing a trio structure on three samples; a common paradigm + * + * todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of + * -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not + * -- clear the best way to do this. + */ +public class Trio { + private Sample mother; + private Sample father; + private Sample child; + + public Trio(Sample mom, Sample dad, Sample spawn) { + assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio"; + mother = mom; + father = dad; + child = spawn; + } + + public Sample getMother() { + return mother; + } + + public String getMaternalID() { + return mother.getID(); + } + + public Sample getFather() { + return father; + } + + public String getPaternalID() { + return father.getID(); + } + + public Sample getChild() { + return child; + } + + public String getChildID() { + return child.getID(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index d6cf50522..f644c4c6d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -3,8 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDB; +import org.broadinstitute.sting.gatk.samples.Trio; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; @@ -39,7 +38,7 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { if ( mendelianViolation == null ) { - trios = checkAndSetSamples(((Walker) walker).getSampleDB()); + trios = ((Walker) walker).getSampleDB().getTrios(); if ( trios.size() > 0 ) { mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } @@ -53,7 +52,7 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment double maxMVLR = Double.MIN_VALUE; for ( Trio trio : trios ) { if ( contextHasTrioLikelihoods(vc,trio) ) { - Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.childId); + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()); maxMVLR = likR > maxMVLR ? 
likR : maxMVLR; //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); } @@ -71,24 +70,9 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } - // todo - this entire function should be in samples DB - private Set checkAndSetSamples(SampleDB db){ - Set trioSet = new HashSet(); - for ( String familyString : db.getFamilyIDs() ) { - Set family = db.getFamily(familyString); - for ( Sample sample : family) { - if ( sample.getParents().size() == 2 ) { - Trio trio = new Trio(sample.getMaternalID(),sample.getPaternalID(),sample.getID()); - trioSet.add(trio); - } - } - } - - return trioSet; - } private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) { - for ( String sample : trio ) { + for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) { if ( ! context.hasGenotype(sample) ) return false; if ( ! context.getGenotype(sample).hasLikelihoods() ) @@ -98,47 +82,4 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment return true; } - // TODO -- this class is too much. - // TODO -- Why iterable? 
- // TODO -- shuoldn't this be in samplesDB() so you can just called samplesDB().getTrios() - // TODO -- should just have final string IDs, and getters, no setters - private class Trio implements Iterable { - private String maternalID; - private String paternalID; - private String childId; - - public Trio(String mom, String dad, String child) { - this.maternalID = mom; - this.paternalID = dad; - this.childId = child; - } - - public String getMaternalID() { - return this.maternalID; - } - - public String getPaternalID() { - return this.paternalID; - } - - public String getChildId() { - return this.childId; - } - - public void setMaternalID(String id) { - this.maternalID = id; - } - - public void setPaternalID(String id) { - this.paternalID = id; - } - - public void setChildId(String id) { - this.childId = id; - } - - public Iterator iterator() { - return Arrays.asList(maternalID,paternalID,childId).iterator(); - } - } } From dcc972a55759b9c16362e6f4f3501fda55f3a6c2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 14:53:00 -0400 Subject: [PATCH 036/161] Usability cleanup for BQSR -- I'm seeing a lot of people trying to use BinaryTagCovariate in the community. They really shouldn't do this, so I moved it to private. 
-- Throw an exception if its required bintag argument is missing -- Check explicitly if user is requesting DinucCovariate and tell them that its been retired in favor of ContextCovariate -- Show the type (Required, Experimental, Standard) of the covariates when running --list --- .../sting/utils/classloader/JVMUtils.java | 15 +++++ .../sting/utils/recalibration/RecalUtils.java | 25 +++++--- .../covariates/BinaryTagCovariate.java | 63 ------------------- 3 files changed, 31 insertions(+), 72 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java index fa154fca3..dd12ce761 100755 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java @@ -25,12 +25,14 @@ package org.broadinstitute.sting.utils.classloader; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.reflections.util.ClasspathHelper; import java.io.File; import java.io.IOException; +import java.lang.annotation.Annotation; import java.lang.reflect.*; import java.net.URL; import java.util.*; @@ -234,4 +236,17 @@ public class JVMUtils { } else throw new ReviewedStingException("BUG: could not find generic type on class " + t); } + + /** + * Returns a comma-separated list of the names of the interfaces implemented by this class + * + * @param covClass + * @return + */ + public static String classInterfaces(final Class covClass) { + final List interfaces = new ArrayList(); + for ( final Class interfaceClass : covClass.getInterfaces() ) + interfaces.add(interfaceClass.getSimpleName()); + return Utils.join(", ", interfaces); + } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 2d05877af..8a9143c89 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -29,6 +29,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; @@ -117,6 +118,12 @@ public class RecalUtils { if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified for (String requestedCovariateString : argumentCollection.COVARIATES) { + // help the transition from BQSR v1 to BQSR v2 + if ( requestedCovariateString.equals("DinucCovariate") ) + throw new UserException.CommandLineException("DinucCovariate has been retired. 
Please use its successor covariate " + + "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + + "as well as an indel context to model the indel error rates"); + boolean foundClass = false; for (Class covClass : covariateClasses) { if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class @@ -178,18 +185,18 @@ public class RecalUtils { return dest; } - public static void listAvailableCovariates(Logger logger) { - // Get a list of all available covariates - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - - // Print and exit if that's what was requested + /** + * Print a list of all available covariates to logger as info + * + * @param logger + */ + public static void listAvailableCovariates(final Logger logger) { logger.info("Available covariates:"); - for (Class covClass : covariateClasses) - logger.info(covClass.getSimpleName()); - logger.info(""); + for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { + logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); + } } - public enum SOLID_RECAL_MODE { /** * Treat reference inserted bases as reference matching bases. Very unsafe! 
diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java deleted file mode 100644 index cebdebf9d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java +++ /dev/null @@ -1,63 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration.covariates; - -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * Binary covariate allows BQSR to recalibrate based on a binary covariate in the BAM file. This covariate should assume values of 1 and 0. - * - * @author Mauricio Carneiro - * @since 7/6/12 - */ -public class BinaryTagCovariate implements ExperimentalCovariate { - - private String tag; - - @Override - public void initialize(RecalibrationArgumentCollection RAC) { - tag = RAC.BINARY_TAG_NAME; - } - - @Override - public void recordValues(GATKSAMRecord read, ReadCovariates values) { - final Object tagObject = read.getAttribute(tag); - - byte[] binaryTag; - if (tagObject instanceof byte[]) - binaryTag = (byte[]) tagObject; - else if (tagObject instanceof String) { - int readLength = ((String) tagObject).length(); - binaryTag = new byte[readLength]; - for (int i = 0; i Date: Sat, 25 Aug 2012 12:38:23 -0700 Subject: [PATCH 037/161] Fix for badIndelLength() throwing NPE at non-indel sites. Added integration test. 
--- .../gatk/walkers/variantutils/SelectVariants.java | 3 +++ .../sting/utils/variantcontext/VariantContext.java | 2 +- .../variantutils/SelectVariantsIntegrationTest.java | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 567262756..3d14308b6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -579,6 +579,9 @@ public class SelectVariants extends RodWalker implements TreeR } private boolean badIndelSize(final VariantContext vc) { + List lengths = vc.getIndelLengths(); + if ( lengths == null ) + return false; // VC does not harbor indel for ( Integer indelLength : vc.getIndelLengths() ) { if ( indelLength > maxIndelSize ) return true; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 8da6d452e..929e53ce7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -716,7 +716,7 @@ public class VariantContext implements Feature { // to enable tribble integratio * @return a list of indel lengths ( null if not of type indel or mixed ) */ public List getIndelLengths() { - if ( getType() != Type.INDEL && getType() != Type.MIXED ) { + if ( getType() != Type.INDEL && getType() != Type.MIXED && getType() != Type.STRUCTURAL_INDEL ) { return null; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 
bde597fbe..77e29f87b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -128,6 +128,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testVariantTypeSelection--" + testFile, spec); } + @Test + public void testIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + 1, + Arrays.asList("004589868ca5dc887e2dff876b4cc797") + ); + + executeTest("testIndelLengthSelection--" + testFile, spec); + } + @Test public void testUsingDbsnpName() { String testFile = privateTestDir + "combine.3.vcf"; From 275a5e5439403104b06e596be5366901d4a1bae2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 15:33:01 -0400 Subject: [PATCH 038/161] More tests for NanoScheduler -- Add more contracts -- Test in the UnitTest that the reduce is being called in the correct order --- .../sting/utils/nanoScheduler/NanoScheduler.java | 15 +++++++++++---- .../nanoScheduler/NanoSchedulerUnitTest.java | 7 ++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index fcc6a5723..63ae1958c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -85,6 +85,7 @@ public class NanoScheduler { * The number of parallel map threads in use with this NanoScheduler * @return */ + @Ensures("result > 0") public int getnThreads() { return nThreads; } @@ -93,6 +94,7 @@ public class 
NanoScheduler { * The input buffer size used by this NanoScheduler * @return */ + @Ensures("result > 0") public int getBufferSize() { return bufferSize; } @@ -101,6 +103,7 @@ public class NanoScheduler { * The grouping size used by this NanoScheduler * @return */ + @Ensures("result > 0") public int getMapGroupSize() { return mapGroupSize; } @@ -149,8 +152,10 @@ public class NanoScheduler { final MapFunction map, final ReduceType initialValue, final ReduceFunction reduce) { - if ( isShutdown() ) - throw new IllegalStateException("execute called on already shutdown NanoScheduler"); + if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); + if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); if ( getnThreads() == 1 ) { return executeSingleThreaded(inputReader, map, initialValue, reduce); @@ -206,7 +211,7 @@ public class NanoScheduler { return sum; } - @Requires("! mapQueue.isEmpty()") + @Requires({"reduce != null", "! mapQueue.isEmpty()"}) private ReduceType reduceParallel(final ReduceFunction reduce, final Queue>> mapQueue, final ReduceType initSum) @@ -240,7 +245,7 @@ public class NanoScheduler { return inputs; } - @Ensures("result.size() == inputs.size()") + @Requires({"map != null", "! 
inputs.isEmpty()"}) private Queue>> submitMapJobs(final MapFunction map, final ExecutorService executor, final List inputs) { @@ -262,11 +267,13 @@ public class NanoScheduler { final List inputs; final MapFunction map; + @Requires({"map != null", "inputs.size() <= getMapGroupSize()"}) private CallableMap(final MapFunction map, final List inputs) { this.inputs = inputs; this.map = map; } + @Ensures("result.size() == inputs.size()") @Override public List call() throws Exception { final List outputs = new LinkedList(); for ( final InputType input : inputs ) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 211e43dc1..454441240 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -21,7 +21,12 @@ public class NanoSchedulerUnitTest extends BaseTest { } private class ReduceSum implements ReduceFunction { - @Override public Integer apply(Integer one, Integer sum) { return one + sum; } + int prevOne = Integer.MIN_VALUE; + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(prevOne < one, "Reduce came in out of order. 
Prev " + prevOne + " cur " + one); + return one + sum; + } } private static int sum2x(final int start, final int end) { From e060b148e2c5c2cc5a2b1a33a563915b1df66e7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 15:36:04 -0400 Subject: [PATCH 039/161] Minor cleanup of TraverseReads --- .../sting/gatk/traversals/TraverseReads.java | 52 ++++++++----------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index d29e9a5f2..2dc0444b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -1,20 +1,3 @@ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.ReadMetrics; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - /* * Copyright (c) 2009 The Broad Institute * @@ -39,6 +22,19 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * @author aaron @@ -75,29 +71,27 @@ public class TraverseReads extends TraversalEngine,Read if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - ReadView reads = new ReadView(dataProvider); - ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); // get the reference ordered data - ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); boolean done = walker.isDone(); // while we still have more reads - for (SAMRecord read : reads) { + for (final SAMRecord read : reads) { if ( done ) break; - // ReferenceContext -- the reference bases covered by the read - ReferenceContext refContext = null; - // get the array of characters for the reference sequence, since we're a mapped read - if (!read.getReadUnmappedFlag() && dataProvider.hasReference()) - refContext = reference.getReferenceContext(read); + // ReferenceContext -- the reference bases covered by the read + final ReferenceContext refContext = ! 
read.getReadUnmappedFlag() && dataProvider.hasReference() + ? reference.getReferenceContext(read) + : null; // update the number of reads we've seen - ReadMetrics readMetrics = dataProvider.getShard().getReadMetrics(); - readMetrics.incrementNumIterations(); + dataProvider.getShard().getReadMetrics().incrementNumIterations(); // if the read is mapped, create a metadata tracker - ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? rodView.getReferenceOrderedDataForRead(read) : null; + final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { From af540888f198d863ad4ae38a8e8917062a77bc14 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 16:48:22 -0400 Subject: [PATCH 040/161] Limited version of parallel read walkers -- Currently doesn't support accessing reference or ROD data -- Parallel versions of PrintReads and CountReads --- .../providers/ShardDataProvider.java | 7 + .../executive/HierarchicalMicroScheduler.java | 2 +- .../gatk/executive/LinearMicroScheduler.java | 3 +- .../sting/gatk/executive/MicroScheduler.java | 22 ++- .../gatk/traversals/TraverseReadsNano.java | 167 ++++++++++++++++++ .../sting/gatk/walkers/PrintReads.java | 6 +- .../sting/gatk/walkers/qc/CountReads.java | 12 +- .../utils/nanoScheduler/NanoScheduler.java | 17 ++ .../threading/ThreadEfficiencyMonitor.java | 1 + 9 files changed, 221 insertions(+), 16 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java index 803bd885b..4279381d7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java @@ -94,6 +94,13 @@ public abstract class ShardDataProvider { return referenceOrderedData; } + /** + * @return true if reference ordered data will be provided by this shard + */ + public boolean hasReferenceOrderedData() { + return ! getReferenceOrderedData().isEmpty(); + } + /** * Create a data provider for the shard given the reads and reference. * @param shard The chunk of data over which traversals happen. diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 70cdaab22..9198d210d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -88,7 +88,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar final Collection rods, final int nThreadsToUse, final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods); + super(engine, walker, reads, reference, rods, nThreadsToUse); if ( monitorThreadPerformance ) { final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 7a6902fff..5bcb16c94 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -39,8 +39,9 @@ public class LinearMicroScheduler extends MicroScheduler { final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, + final int numThreads, // may be > 1 if are nanoScheduling final 
boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods); + super(engine, walker, reads, reference, rods, numThreads); if ( monitorThreadPerformance ) setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 4becc5a78..9b4fe53ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -103,14 +103,16 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - if(walker instanceof ReadWalker) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + + if ( walker instanceof ReadWalker ) + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + else + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } else { if(threadAllocation.getNumCPUThreads() > 1) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.monitorThreadEfficiency()); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } } @@ -121,15 +123,23 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads The reads. * @param reference The reference. 
* @param rods the rods to include in the traversal + * @param numThreads the number of threads we are using in the underlying traversal */ - protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods) { + protected MicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final int numThreads) { this.engine = engine; this.reads = reads; this.reference = reference; this.rods = rods; if (walker instanceof ReadWalker) { - traversalEngine = new TraverseReads(); + traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); + } else if ( numThreads > 1 ) { + throw new IllegalArgumentException("BUG: numThreads > 1 but this is only allowed for ReadWalkers"); } else if (walker instanceof LocusWalker) { traversalEngine = new TraverseLoci(); } else if (walker instanceof DuplicateWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java new file mode 100755 index 000000000..dc774230b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions 
of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * @author aaron + * @version 1.0 + * @date Apr 24, 2009 + *

+ * Class TraverseReads + *

+ * This class handles traversing by reads in the new shardable style + */ +public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); + private static final boolean DEBUG = false; + final int bufferSize = ReadShard.MAX_READS; + final int mapGroupSize = bufferSize / 10 + 1; + final int nThreads; + + public TraverseReadsNano(int nThreads) { + this.nThreads = nThreads; + } + + @Override + protected String getTraversalType() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * @return the reduce variable of the read walker + */ + public T traverse(ReadWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); + + if( !dataProvider.hasReads() ) + throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); + + if ( dataProvider.hasReferenceOrderedData() ) + throw new ReviewedStingException("Parallel read walkers currently don't support access to reference ordered data"); + + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new NotImplementedReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final NanoScheduler nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + nanoScheduler.setDebug(DEBUG); + final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); + final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); + + T result = 
nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); + nanoScheduler.shutdown(); + //printProgress(dataProvider.getShard(), ???); + + return result; + } + + private static class NotImplementedReadReferenceView extends ReadReferenceView { + private NotImplementedReadReferenceView(ShardDataProvider provider) { + super(provider); + } + + @Override + protected byte[] getReferenceBases(SAMRecord read) { + throw new ReviewedStingException("Parallel read walkers don't support accessing reference yet"); + } + + @Override + protected byte[] getReferenceBases(GenomeLoc genomeLoc) { + throw new ReviewedStingException("Parallel read walkers don't support accessing reference yet"); + } + } + + private class TraverseReadsReduce implements ReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(M one, T sum) { + return walker.reduce(one, sum); + } + } + + private class TraverseReadsMap implements MapFunction { + final ReadView reads; + final ReadReferenceView reference; + final ReadBasedReferenceOrderedView rodView; + final ReadWalker walker; + + private TraverseReadsMap(ReadView reads, ReadReferenceView reference, ReadBasedReferenceOrderedView rodView, ReadWalker walker) { + this.reads = reads; + this.reference = reference; + this.rodView = rodView; + this.walker = walker; + } + + @Override + public M apply(final SAMRecord read) { + if ( ! walker.isDone() ) { + // ReferenceContext -- the reference bases covered by the read + final ReferenceContext refContext = ! read.getReadUnmappedFlag() && reference != null + ? reference.getReferenceContext(read) + : null; + + // update the number of reads we've seen + //dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // if the read is mapped, create a metadata tracker + final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; + + final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); + if (keepMeP) { + return walker.map(refContext, (GATKSAMRecord) read, tracker); + } + } + + return null; // TODO -- what should we return in the case where the walker is done or the read is filtered? + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 8257794d7..2b05e4dc5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -93,7 +93,7 @@ import java.util.TreeSet; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReads extends ReadWalker { +public class PrintReads extends ReadWalker implements TreeReducible { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -246,4 +246,8 @@ public class PrintReads extends ReadWalker { return output; } + @Override + public SAMFileWriter treeReduce(SAMFileWriter lhs, SAMFileWriter rhs) { + return lhs; // nothing to do + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 5a9e5e7d2..d33db2925 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import 
org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -40,15 +41,12 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker { +public class CountReads extends ReadWalker implements TreeReducible { public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { - return 1; } - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } + @Override public Integer reduceInit() { return 0; } + @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } + @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 63ae1958c..c587e44c6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -50,6 +50,7 @@ public class NanoScheduler { final int nThreads; final ExecutorService executor; boolean shutdown = false; + boolean debug = false; /** * Create a new nanoschedule with the desire characteristics requested by the argument @@ -129,6 +130,20 @@ public class NanoScheduler { return shutdown; } + public boolean isDebug() { + return debug; + } + + private void debugPrint(final String format, Object ... 
args) { + if ( isDebug() ) + logger.info(String.format(format, args)); + } + + + public void setDebug(boolean debug) { + this.debug = debug; + } + /** * Execute a map/reduce job with this nanoScheduler * @@ -190,6 +205,7 @@ public class NanoScheduler { final MapFunction map, final ReduceType initialValue, final ReduceFunction reduce) { + debugPrint("Executing nanoScheduler with initial reduce value " + initialValue); ReduceType sum = initialValue; while ( inputReader.hasNext() ) { try { @@ -278,6 +294,7 @@ public class NanoScheduler { final List outputs = new LinkedList(); for ( final InputType input : inputs ) outputs.add(map.apply(input)); + debugPrint(" Processed %d elements with map", outputs.size()); return outputs; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java index ef836a06d..9159f5657 100644 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -140,6 +140,7 @@ public class ThreadEfficiencyMonitor { logger.log(priority, String.format("CPU efficiency : %6.2f%% of time spent %s", getStatePercent(State.USER_CPU), State.USER_CPU.getUserFriendlyName())); logger.log(priority, String.format("Walker inefficiency : %6.2f%% of time spent %s", getStatePercent(State.BLOCKING), State.BLOCKING.getUserFriendlyName())); logger.log(priority, String.format("I/O inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING_FOR_IO), State.WAITING_FOR_IO.getUserFriendlyName())); + logger.log(priority, String.format("Thread inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING), State.WAITING.getUserFriendlyName())); } /** From 5066b143355319dcb5fb0a4ae39a0c0d539e6d8a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 17:19:57 -0400 Subject: [PATCH 041/161] 
Parallel FlagStat --- .../sting/gatk/walkers/FlagStat.java | 111 +++++++++++------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index e881dcab7..b0cc3b12a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,12 +45,12 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker { +public class FlagStat extends ReadWalker implements TreeReducible { @Output PrintStream out; // what comes out of the flagstat - static class FlagStatus { + public final static class FlagStatus { long readCount = 0L; long QC_failure = 0L; long duplicates = 0L; @@ -117,62 +117,89 @@ public class FlagStat extends ReadWalker { return builder.toString(); } - } + public FlagStatus add(final FlagStatus other) { + readCount += other.readCount; + QC_failure += other.QC_failure; + duplicates += other.duplicates; + mapped += other.mapped; + paired_in_sequencing += other.paired_in_sequencing; + read1 += other.read1; + read2 += other.read2; + properly_paired += other.properly_paired; + with_itself_and_mate_mapped += other.with_itself_and_mate_mapped; + singletons += other.singletons; + with_mate_mapped_to_a_different_chr += other.with_mate_mapped_to_a_different_chr; + with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5 += other.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5; - - private FlagStatus myStat = new FlagStatus(); - - public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - myStat.readCount++; - if (read.getReadFailsVendorQualityCheckFlag()) { - myStat.QC_failure++; + return this; } - if 
(read.getDuplicateReadFlag()) { - myStat.duplicates++; - } - if (!read.getReadUnmappedFlag()) { - myStat.mapped++; - } - if (read.getReadPairedFlag()) { - myStat.paired_in_sequencing++; - if (read.getSecondOfPairFlag()) { - myStat.read2++; - } else if (read.getReadPairedFlag()) { - myStat.read1++; + public FlagStatus add(final GATKSAMRecord read) { + this.readCount++; + + if (read.getReadFailsVendorQualityCheckFlag()) { + this.QC_failure++; } - if (read.getProperPairFlag()) { - myStat.properly_paired++; + if (read.getDuplicateReadFlag()) { + this.duplicates++; } - if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { - myStat.with_itself_and_mate_mapped++; + if (!read.getReadUnmappedFlag()) { + this.mapped++; + } + if (read.getReadPairedFlag()) { + this.paired_in_sequencing++; - if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { - myStat.with_mate_mapped_to_a_different_chr++; + if (read.getSecondOfPairFlag()) { + this.read2++; + } else if (read.getReadPairedFlag()) { + this.read1++; + } + if (read.getProperPairFlag()) { + this.properly_paired++; + } + if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { + this.with_itself_and_mate_mapped++; - if (read.getMappingQuality() >= 5) { - myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { + this.with_mate_mapped_to_a_different_chr++; + + if (read.getMappingQuality() >= 5) { + this.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + } } } + if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { + this.singletons++; + } } - if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { - myStat.singletons++; - } + + return this; } - return 1; - } - public Integer reduceInit() { - return 0; + + @Override + public FlagStatus map( final ReferenceContext ref, final GATKSAMRecord read, final ReadMetaDataTracker metaDataTracker ) { + return new FlagStatus().add(read); + } + 
+ @Override + public FlagStatus reduceInit() { + return new FlagStatus(); } - public Integer reduce(Integer value, Integer sum) { - return value + sum; + @Override + public FlagStatus reduce(final FlagStatus value, final FlagStatus sum) { + return sum.add(value); } - public void onTraversalDone(Integer result) { - //out.println("[REDUCE RESULT] Traversal result is: " + result); - out.println(myStat.toString()); + @Override + public FlagStatus treeReduce(final FlagStatus value, final FlagStatus sum) { + return sum.add(value); + } + + @Override + public void onTraversalDone(final FlagStatus result) { + out.println(result.toString()); } } \ No newline at end of file From fde98247654bd744ba67ad9f40329d9d85ab44ea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 25 Aug 2012 17:21:02 -0400 Subject: [PATCH 042/161] Optimizations for parallel read walkers -- TraversalReadsNano only creates the NanoScheduler once, and shuts it down onTraversalDone -- Nicer debugging output in NanoScheduler -- ReadShard has a getBufferSize() method now --- .../sting/gatk/datasources/reads/ReadShard.java | 9 +++++++++ .../gatk/traversals/TraverseReadsNano.java | 17 +++++++++++------ .../utils/nanoScheduler/NanoScheduler.java | 4 ++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 96b55674a..f5a4cb4cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -58,6 +58,15 @@ public class ReadShard extends Shard { MAX_READS = bufferSize; } + /** + * What read buffer size are we using? + * + * @return + */ + public static int getReadBufferSize() { + return MAX_READS; + } + /** * Returns true if this shard is meant to buffer reads, rather * than just holding pointers to their locations. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index dc774230b..9d543c322 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -51,12 +51,12 @@ public class TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - final int bufferSize = ReadShard.MAX_READS; - final int mapGroupSize = bufferSize / 10 + 1; - final int nThreads; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { - this.nThreads = nThreads; + final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max + final int mapGroupSize = bufferSize / 10 + 1; + nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); } @Override @@ -87,18 +87,23 @@ public class TraverseReadsNano extends TraversalEngine, final ReadReferenceView reference = new NotImplementedReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final NanoScheduler nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); nanoScheduler.setDebug(DEBUG); final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); T result = nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); - nanoScheduler.shutdown(); + // TODO -- how do we print progress? 
//printProgress(dataProvider.getShard(), ???); return result; } + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); //To change body of overridden methods use File | Settings | File Templates. + } + private static class NotImplementedReadReferenceView extends ReadReferenceView { private NotImplementedReadReferenceView(ShardDataProvider provider) { super(provider); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index c587e44c6..4bca3728f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -136,7 +136,7 @@ public class NanoScheduler { private void debugPrint(final String format, Object ... args) { if ( isDebug() ) - logger.info(String.format(format, args)); + logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); } @@ -205,7 +205,7 @@ public class NanoScheduler { final MapFunction map, final ReduceType initialValue, final ReduceFunction reduce) { - debugPrint("Executing nanoScheduler with initial reduce value " + initialValue); + debugPrint("Executing nanoScheduler"); ReduceType sum = initialValue; while ( inputReader.hasNext() ) { try { From 846e0c11bc58a64723ff6cbebbae711f160e3c1b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 26 Aug 2012 08:18:43 -0400 Subject: [PATCH 043/161] Add TimeOuts to new threading tests, in case there's a underlying deadlock --- .../utils/nanoScheduler/NanoSchedulerUnitTest.java | 12 +++++++----- .../EfficiencyMonitoringThreadFactoryUnitTest.java | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 454441240..f2a34c46d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -16,6 +16,8 @@ import java.util.*; * To change this template use File | Settings | File Templates. */ public class NanoSchedulerUnitTest extends BaseTest { + public static final int NANO_SCHEDULE_MAX_RUNTIME = 10000; + private class Map2x implements MapFunction { @Override public Integer apply(Integer input) { return input * 2; } } @@ -83,14 +85,14 @@ public class NanoSchedulerUnitTest extends BaseTest { return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest") + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads == 1 ) testNanoScheduler(test); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = 10000, dependsOnMethods = "testSingleThreadedNanoScheduler") + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads >= 1 ) @@ -111,7 +113,7 @@ public class NanoSchedulerUnitTest extends BaseTest { nanoScheduler.shutdown(); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler") + @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public 
void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { if ( test.bufferSize > 1 && (test.mapGroupSize > 1 || test.mapGroupSize == -1)) { logger.warn("Running " + test); @@ -130,7 +132,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - @Test() + @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); @@ -138,7 +140,7 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); } - @Test(expectedExceptions = IllegalStateException.class) + @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); nanoScheduler.shutdown(); diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 35dc9754c..6544b9845 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -42,6 +42,7 @@ import java.util.concurrent.*; public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; + private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object(); private class StateTest extends TestDataProvider { @@ -126,7 +127,7 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends 
BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = true, dataProvider = "StateTest") + @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); From faacacd6c0a62d7e1113fd7a693ea7f774631fa1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 26 Aug 2012 08:42:58 -0400 Subject: [PATCH 044/161] Increase runtime of nano scheduler tests to 1 min --- .../sting/utils/nanoScheduler/NanoSchedulerUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index f2a34c46d..cf97d3e73 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -16,7 +16,7 @@ import java.util.*; * To change this template use File | Settings | File Templates. 
*/ public class NanoSchedulerUnitTest extends BaseTest { - public static final int NANO_SCHEDULE_MAX_RUNTIME = 10000; + public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; private class Map2x implements MapFunction { @Override public Integer apply(Integer input) { return input * 2; } From 68c5142d2d952a3abaeb0604ce60b47d8466e654 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 26 Aug 2012 14:36:13 -0400 Subject: [PATCH 045/161] numThreads > 1 any time you have -nt > 1 silly --- .../org/broadinstitute/sting/gatk/executive/MicroScheduler.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 9b4fe53ed..70201a6cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -138,8 +138,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if (walker instanceof ReadWalker) { traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); - } else if ( numThreads > 1 ) { - throw new IllegalArgumentException("BUG: numThreads > 1 but this is only allowed for ReadWalkers"); } else if (walker instanceof LocusWalker) { traversalEngine = new TraverseLoci(); } else if (walker instanceof DuplicateWalker) { From 2d1ea7124b576764137cad7ca3f458a71a8fd69b Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 27 Aug 2012 12:04:50 -0400 Subject: [PATCH 046/161] One less Queue command line requirement: -tempDir now defaults to .queue/tmp. Also moved queueScatterGather to .queue/scatterGather. 
--- .../broadinstitute/sting/utils/io/IOUtils.java | 17 +++++++++++++---- .../sting/queue/QCommandLine.scala | 8 +++++--- .../ScatterGatherableFunction.scala | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index 160df0e51..b79211e74 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -48,14 +48,23 @@ public class IOUtils { * @param tempDir Temporary directory. */ public static void checkTempDir(File tempDir) { + if (isDefaultTempDir(tempDir)) + throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set"); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()); + } + + /** + * Returns true if the directory is a default temporary directory. + * @param tempDir the directory to check. + * @return true if the directory is a default temporary directory. + */ + public static boolean isDefaultTempDir(File tempDir) { String tempDirPath = tempDir.getAbsolutePath(); // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses // in the path which can cause problems with the Google Reflections library. 
// see also: http://benjchristensen.com/2009/09/22/mac-osx-10-6-java-java-io-tmpdir/ - if (tempDirPath.startsWith("/var/folders/") || (tempDirPath.equals("/tmp")) || (tempDirPath.equals("/tmp/"))) - throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set"); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()); + return (tempDirPath.startsWith("/var/folders/") || (tempDirPath.equals("/tmp")) || (tempDirPath.equals("/tmp/"))); } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 775847ba9..0d0fab9d1 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -64,10 +64,10 @@ object QCommandLine extends Logging { Runtime.getRuntime.removeShutdownHook(shutdownHook) qCommandLine.shutdown() } catch { - case _ => /* ignore, example 'java.lang.IllegalStateException: Shutdown in progress' */ + case e: Exception => /* ignore, example 'java.lang.IllegalStateException: Shutdown in progress' */ } if (CommandLineProgram.result != 0) - System.exit(CommandLineProgram.result); + System.exit(CommandLineProgram.result) } catch { case e: Exception => CommandLineProgram.exitSystemWithError(e) } @@ -105,9 +105,11 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) + if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) + settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") qGraph.initializeWithSettings(settings) - val allQScripts = pluginManager.createAllTypes(); + val allQScripts = pluginManager.createAllTypes() for (script <- allQScripts) { logger.info("Scripting " + 
pluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala index 4578f0e82..5dd7d4c79 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala @@ -91,7 +91,7 @@ trait ScatterGatherableFunction extends CommandLineFunction { if (qSettings.jobScatterGatherDirectory != null) { this.scatterGatherDirectory = IOUtils.absolute(qSettings.jobScatterGatherDirectory) } else { - this.scatterGatherDirectory = IOUtils.absolute(this.commandDirectory, "queueScatterGather") + this.scatterGatherDirectory = IOUtils.absolute(this.commandDirectory, ".queue/scatterGather") } } } From e5b1f1c7f41622eacd3f88ff50ff14f38e9c7ecd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 27 Aug 2012 09:24:16 -0400 Subject: [PATCH 047/161] Add simple main function to unit test so we can run the nano scheduler test from the command line --- .../nanoScheduler/NanoSchedulerUnitTest.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index cf97d3e73..89506dcb1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -18,11 +18,11 @@ import java.util.*; public class NanoSchedulerUnitTest extends BaseTest { public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - private class Map2x implements MapFunction { + private static 
class Map2x implements MapFunction { @Override public Integer apply(Integer input) { return input * 2; } } - private class ReduceSum implements ReduceFunction { + private static class ReduceSum implements ReduceFunction { int prevOne = Integer.MIN_VALUE; @Override public Integer apply(Integer one, Integer sum) { @@ -38,7 +38,7 @@ public class NanoSchedulerUnitTest extends BaseTest { return sum; } - private class NanoSchedulerBasicTest extends TestDataProvider { + private static class NanoSchedulerBasicTest extends TestDataProvider { final int bufferSize, mapGroupSize, nThreads, start, end, expectedResult; public NanoSchedulerBasicTest(final int bufferSize, final int mapGroupSize, final int nThreads, final int start, final int end) { @@ -146,4 +146,13 @@ public class NanoSchedulerUnitTest extends BaseTest { nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } + + public static void main(String [ ] args) { + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, 100, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + } } From 63a9ae817a6490bc4b261aab246fcd568e41f1ab Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 27 Aug 2012 12:11:38 -0400 Subject: [PATCH 048/161] Ensure thread-safety of CachingIndexedFastaSequenceFile -- Cosmetic cleanup of ReadReferenceView -- TraverseReadsNano provides the reference context, since it's thread-safe -- Cleanup CachingIndexedFastaSequenceFile. Add docs, remove unnecessary setters -- Expand CachingIndexedFastaSequenceFileUnitTest to test explicitly multi-threaded safety. 
--- .../providers/ReadReferenceView.java | 12 +- .../gatk/traversals/TraverseReadsNano.java | 20 +-- .../CachingIndexedFastaSequenceFile.java | 123 +++++++++++------- ...chingIndexedFastaSequenceFileUnitTest.java | 121 ++++++++++++----- 4 files changed, 170 insertions(+), 106 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java index 3d62faf49..5cc8faa0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java @@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView { } public byte[] getBases() { -// System.out.printf("Getting bases for location %s%n", loc); -// throw new StingException("x"); return getReferenceBases(loc); } } - public ReferenceContext getReferenceContext( SAMRecord read ) { + /** + * Return a reference context appropriate for the span of read + * + * @param read the mapped read to test + * @return + */ + public ReferenceContext getReferenceContext( final SAMRecord read ) { GenomeLoc loc = genomeLocParser.createGenomeLoc(read); -// byte[] bases = super.getReferenceBases(loc); -// return new ReferenceContext( loc, loc, bases ); return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 9d543c322..4215230b8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -84,7 +84,7 @@ public class TraverseReadsNano extends TraversalEngine, throw new ReviewedStingException("Parallel read walkers currently don't 
support access to reference ordered data"); final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new NotImplementedReadReferenceView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); nanoScheduler.setDebug(DEBUG); @@ -101,23 +101,7 @@ public class TraverseReadsNano extends TraversalEngine, @Override public void printOnTraversalDone() { nanoScheduler.shutdown(); - super.printOnTraversalDone(); //To change body of overridden methods use File | Settings | File Templates. - } - - private static class NotImplementedReadReferenceView extends ReadReferenceView { - private NotImplementedReadReferenceView(ShardDataProvider provider) { - super(provider); - } - - @Override - protected byte[] getReferenceBases(SAMRecord read) { - throw new ReviewedStingException("Parallel read walkers don't support accessing reference yet"); - } - - @Override - protected byte[] getReferenceBases(GenomeLoc genomeLoc) { - throw new ReviewedStingException("Parallel read walkers don't support accessing reference yet"); - } + super.printOnTraversalDone(); } private class TraverseReadsReduce implements ReduceFunction { diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 48706543a..db54851dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Priority; import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; @@ -38,14 +39,11 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a lock object to protect write and access to the cache. + * Thread-safe! Uses a thread-local cache */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); - /** global enable flag */ - private static final boolean USE_CACHE = true; - /** do we want to print debugging information about cache efficiency? */ private static final boolean PRINT_EFFICIENCY = false; @@ -53,31 +51,29 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private static final int PRINT_FREQUENCY = 10000; /** The default cache size in bp */ - private static final long DEFAULT_CACHE_SIZE = 1000000; + public static final long DEFAULT_CACHE_SIZE = 1000000; + + /** The cache size of this CachingIndexedFastaSequenceFile */ + final long cacheSize; + + /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ + final long cacheMissBackup; // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; - /** The cache size of this CachingIndexedFastaSequenceFile */ - long cacheSize = DEFAULT_CACHE_SIZE; - - /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - long cacheMissBackup = 100; - /** Represents a specific cached sequence, with a specific start and stop, as well as the bases */ private static class Cache { long start = -1, stop = -1; ReferenceSequence seq = null; } + /** + * Thread local cache to allow multi-threaded use of this class + */ private ThreadLocal cache; - { - resetThreadLocalCache(); - } - - protected void resetThreadLocalCache() { cache = new 
ThreadLocal () { @Override protected Cache initialValue() { return new Cache(); @@ -87,76 +83,107 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { /** * Same as general constructor but allows one to override the default cacheSize - * @param file + * + * @param fasta * @param index * @param cacheSize */ - public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) { - super(file, index); - setCacheSize(cacheSize); - } - - private void setCacheSize(long cacheSize) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + super(fasta, index); + if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); } /** * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. + * + * @param fasta The file to open. * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. */ - public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) { - this(file, index, DEFAULT_CACHE_SIZE); + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { + this(fasta, index, DEFAULT_CACHE_SIZE); } /** * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. + * + * Looks for a index file for fasta on disk + * + * @param fasta The file to open. 
*/ - public CachingIndexedFastaSequenceFile(final File file) throws FileNotFoundException { - this(file, DEFAULT_CACHE_SIZE); + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE); } - public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) throws FileNotFoundException { - super(file); - setCacheSize(cacheSize); + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + super(fasta); + if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 1000, 1); } - public void printEfficiency() { - // comment out to disable tracking - if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) { - logger.info(String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency())); - } + /** + * Print the efficiency (hits / queries) to logger with priority + */ + public void printEfficiency(final Priority priority) { + logger.log(priority, String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%", cacheHits, cacheMisses, calcEfficiency())); } + /** + * Returns the efficiency (% of hits of all queries) of this object + * @return + */ public double calcEfficiency() { return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0); } + /** + * @return the number of cache hits that have occurred + */ public long getCacheHits() { return cacheHits; } + /** + * @return the number of cache misses that have occurred + */ public long getCacheMisses() { return cacheMisses; } + /** + * @return the size of the cache we are using + */ + 
public long getCacheSize() { + return cacheSize; + } /** * Gets the subsequence of the contig in the range [start,stop] + * + * Uses the sequence cache if possible, or updates the cache to handle the request. If the range + * is larger than the cache itself, just loads the sequence directly, not changing the cache at all + * * @param contig Contig whose subsequence to retrieve. * @param start inclusive, 1-based start of region. * @param stop inclusive, 1-based stop of region. * @return The partial reference sequence associated with this range. */ - public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { - ReferenceSequence result; - Cache myCache = cache.get(); - //System.out.printf("getSubsequentAt cache=%s%n", myCache); + public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { + final ReferenceSequence result; + final Cache myCache = cache.get(); - if ( ! USE_CACHE || (stop - start) >= cacheSize ) { + if ( (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); } else { @@ -177,8 +204,8 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } // at this point we determine where in the cache we want to extract the requested subsequence - int cacheOffsetStart = (int)(start - myCache.start); - int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); + final int cacheOffsetStart = (int)(start - myCache.start); + final int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); try { result = new ReferenceSequence(myCache.seq.getName(), myCache.seq.getContigIndex(), Arrays.copyOfRange(myCache.seq.getBases(), cacheOffsetStart, cacheOffsetStop)); @@ -188,12 +215,8 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } } -// // comment out to disable testing -// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop); -// if ( ! 
Arrays.equals(verify.getBases(), result.getBases()) ) -// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop)); - - if ( PRINT_EFFICIENCY ) printEfficiency(); + if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) + printEfficiency(Priority.INFO); return result; } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index c05b11cf7..736162300 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -5,21 +5,24 @@ package org.broadinstitute.sting.utils.fasta; // the imports for unit testing. -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; -import org.broadinstitute.sting.BaseTest; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; - import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; /** * Basic unit test 
for GenomeLoc @@ -30,7 +33,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { //private static final List QUERY_SIZES = Arrays.asList(1); private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); - private static final List CACHE_SIZES = Arrays.asList(-1, 1000); + private static final List CACHE_SIZES = Arrays.asList(-1, 100, 1000); @DataProvider(name = "fastas") public Object[][] createData1() { @@ -46,20 +49,24 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { return params.toArray(new Object[][]{}); } - @Test(dataProvider = "fastas", enabled = true) - public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) { - IndexedFastaSequenceFile caching, uncached; - try { - caching = cacheSize == -1 ? new CachingIndexedFastaSequenceFile(fasta) : new CachingIndexedFastaSequenceFile(fasta, cacheSize); - uncached = new IndexedFastaSequenceFile(fasta); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(fasta,ex); - } + private static long getCacheSize(final long cacheSizeRequested) { + return cacheSizeRequested == -1 ? 
CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested; + } - SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); + @Test(dataProvider = "fastas", enabled = true) + public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + + SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); + testSequential(caching, fasta, querySize); + } + + private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + + SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { int start = i; int stop = start + querySize; @@ -72,19 +79,23 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases()); } } + + // asserts for efficiency. We are going to make contig.length / STEP_SIZE queries + // at each of range: start -> start + querySize against a cache with size of X. + // we expect to hit the cache each time range falls within X. We expect a hit + // on the cache if range is within X. Which should happen at least (X - query_size * 2) / STEP_SIZE + // times. 
+ final int minExpectedHits = (int)Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE); + caching.printEfficiency(Priority.WARN); + Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits()); + } // Tests grabbing sequences around a middle cached value. @Test(dataProvider = "fastas", enabled = true) - public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) { - IndexedFastaSequenceFile caching, uncached; - try { - uncached = new IndexedFastaSequenceFile(fasta); - caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(fasta,ex); - } + public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -108,4 +119,48 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } } } + + @DataProvider(name = "ParallelFastaTest") + public Object[][] createParallelFastaTest() { + List params = new ArrayList(); +// for ( int nt : Arrays.asList(1, 2, 3) ) { +// for ( int cacheSize : CACHE_SIZES ) { +// params.add(new Object[]{simpleFasta, cacheSize, 10, nt}); +// } +// } + + for ( File fasta : Arrays.asList(simpleFasta) ) { + for ( int cacheSize : CACHE_SIZES ) { + for ( int querySize : QUERY_SIZES ) { + for ( int nt : Arrays.asList(1, 2, 3, 4) ) { + params.add(new Object[]{fasta, cacheSize, querySize, nt}); + } + } + } + } + + return params.toArray(new Object[][]{}); + } + + + @Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000) + 
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + + logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); + for ( int iterations = 0; iterations < 1; iterations++ ) { + final ExecutorService executor = Executors.newFixedThreadPool(nt); + final Collection> tasks = new ArrayList>(nt); + for ( int i = 0; i < nt; i++ ) + tasks.add(new Callable() { + @Override + public Object call() throws Exception { + testSequential(caching, fasta, querySize); + return null; + } + }); + executor.invokeAll(tasks); + executor.shutdownNow(); + } + } } From 3d476487c664774409c254a9d877edcd8685325c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 27 Aug 2012 12:13:12 -0400 Subject: [PATCH 049/161] LIBS is totally busted for deletions. Putting a check in AD for bad pileup event bases so that we don't produce busted alleles. We must fix LIBS ASAP. 
--- .../sting/gatk/iterators/LocusIteratorByState.java | 1 + .../gatk/walkers/annotator/DepthPerAlleleBySample.java | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 75af7976f..64f914064 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -318,6 +318,7 @@ public class LocusIteratorByState extends LocusIterator { continue; if (op == CigarOperator.D) { + // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? 
nextElementLength : -1)); size++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 5d83ddd51..61e30f3b9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -88,7 +88,11 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa for ( PileupElement p : pileup ) { if ( p.isBeforeInsertion() ) { - final Allele insertion = Allele.create((char)refBase + p.getEventBases(), false); + final String eventBases = p.getEventBases(); + if ( eventBases == null ) + continue; + + final Allele insertion = Allele.create((char)refBase + eventBases, false); if ( alleleCounts.containsKey(insertion) ) { alleleCounts.put(insertion, alleleCounts.get(insertion)+1); } From 2996693c9f6cc211b50e646539881e90d5b69f30 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 27 Aug 2012 20:57:44 -0400 Subject: [PATCH 050/161] FisherStrand now computed with and without filtering low-qual bases, and least significant pvalue is kept -- Old way (filtering for Q > 17 bases) resulted in biased FS when the site was good but there was a systematic shift in the QUAL of REF and ALT between strands of the reads (sometimes happens) -- New way (taking all bases) was consistent with BaseQualRankSum and other tests, but there can be a lot of low qual reference bases on one strand in some techs (ION/PROTON/PACBIO) because of the preference for introducing an indel vs. a mismatch. -- This implementation allows us to have our cake and eat it to by computing both p-values, and taking the maximum one (i.e., least significant). 
-- No integration tests updated yet -- still exploring the consequences of this change --- .../gatk/walkers/annotator/FisherStrand.java | 67 ++++++++++++++----- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index ad0ad50b0..dee470cb3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -55,6 +55,8 @@ import java.util.*; public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; + private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; + public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -64,30 +66,53 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( !vc.isVariant() ) return null; - int[][] table; - if (vc.isSNP() && stratifiedContexts != null) { - table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); + final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + return pValueForBestTable(tableFiltering, tableNoFiltering); } else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed - table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + final int[][] 
table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + return pValueForBestTable(table, null); } else - // for non-snp variants, we need per-read likelihoods. - // for snps, we can get same result from simple pileup + // for non-snp variants, we need per-read likelihoods. + // for snps, we can get same result from simple pileup return null; + } - if (table == null) - return null; + /** + * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 + * + * @param table1 a contingency table, may be null + * @param table2 a contingency table, may be null + * @return annotation result for FS given tables + */ + private Map pValueForBestTable(final int[][] table1, final int[][] table2) { + if ( table2 == null ) + return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1)); + else if (table1 == null) + return annotationForOneTable(pValueForContingencyTable(table2)); + else { // take the one with the best (i.e., least significant pvalue) + double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE); + double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE); + return annotationForOneTable(Math.max(pvalue1, pvalue2)); + } + } - Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); - if ( pvalue == null ) - return null; - - Map map = new HashMap(); - map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); - return map; + /** + * Returns an annotation result given a pValue + * + * @param pValue + * @return a hash map from FS -> phred-scaled pValue + */ + private Map annotationForOneTable(final double pValue) { + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)); + return Collections.singletonMap(FS, value); +// Map map = new HashMap(); +// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); +// return map; } public 
List getKeyNames() { @@ -244,7 +269,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getSNPContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getSNPContingencyTable(final Map stratifiedContexts, + final Allele ref, + final Allele alt, + final int minQScoreToConsider ) { int[][] table = new int[2][2]; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { @@ -252,8 +280,11 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads continue; - Allele base = Allele.create(p.getBase(), false); - boolean isFW = !p.getRead().getReadNegativeStrandFlag(); + if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) + continue; + + final Allele base = Allele.create(p.getBase(), false); + final boolean isFW = !p.getRead().getReadNegativeStrandFlag(); final boolean matchesRef = ref.equals(base, true); final boolean matchesAlt = alt.equals(base, true); From 4b8d9c39150cf5e0fd10e55172b0c5024c02f6cd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 28 Aug 2012 08:05:05 -0400 Subject: [PATCH 051/161] Actually load the library necessary to compactPDF -- Old version was buggy in that if you didn't load "tools" package in your script it wouldn't compact the resulting PDF! 
Fixed --- .../broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R index 45dacd835..748f00e28 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R @@ -1,5 +1,6 @@ library(gplots) library(ggplot2) +library(tools) # ------------------------------------------------------- # Utilities for displaying multiple plots per page @@ -59,6 +60,7 @@ closePDF <- function(outputPDF) { if ( ! is.na(outputPDF) ) { dev.off() if (exists("compactPDF")) { + print("compacting PDF") compactPDF(outputPDF) } } From 0f4acaae1b5d39a6e8388411c7146f67ac510a92 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 28 Aug 2012 08:05:16 -0400 Subject: [PATCH 052/161] Update MD5s with new FS score --- ...GenotyperGeneralPloidyIntegrationTest.java | 8 ++-- .../VariantAnnotatorIntegrationTest.java | 10 ++--- .../UnifiedGenotyperIntegrationTest.java | 38 +++++++++---------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index b5b0abc6e..e0bf07809 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -47,12 +47,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode 
GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","077db83cf7dc5490f670c85856b408b2"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0ff90fa3882a3fb5089a7bba50dd8ae3"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","e460a17377b731ff4eab36fb56042ecd"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","90af837f372e3d5143af30bf5c8c2b75"); } @Test(enabled = true) @@ -67,11 +67,11 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da359fe7dd6dce045193198c264301ee"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "ad0eef3a9deaa098d79df62af7e5448a"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "4d16d3c9475637bad70e9dc2eafe2da2"); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index aa4fd7a75..01dff0089 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("4a0318d0452d2dccde48ef081c431bf8")); + Arrays.asList("fbfbd4d13b7ba3d76e8e186902e81378")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("da19c8e3c58340ba8bcc88e95ece4ac1")); + Arrays.asList("19aef8914efc497192f89a9038310ca5")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("cdefe79f46482a3d050ca2132604663a")); + Arrays.asList("4f0b8033da18e6cf6e9b8d5d36c21ba2")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " 
-G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("5ec4c07b6801fca7013e3b0beb8b5418")); + Arrays.asList("64ca176d587dfa2b3b9dec9f7999305c")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("28c07151f5c5fae87c691d8f7d1a3929")); + Arrays.asList("0c810f6c4abef9d9dc5513ca872d3d22")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 02e1bdf12..2f0bfb507 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("9a7fa3e9ec8350e3e9cfdce0c00ddcc3")); + Arrays.asList("cafd404f1b4f53586f7aa7a7084b91da")); executeTest("test MultiSample Pilot1", spec); } @@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " 
--genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("78693f3bf5d588e250507a596aa400da")); + Arrays.asList("9a760dffbb299bda4934bcb4f7aad42a")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("babf24ec8e5b5708d4a049629f7ea073")); + Arrays.asList("8391146877aa7801ffdb3aa954bf2965")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("754187e70c1d117087e2270950a1c230")); + Arrays.asList("85b79ff7910f218dd59595d03ffe6ccc")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("f9a2f882d050a90e6d8e6a1fba00f858")); + Arrays.asList("8472b1ad2fe1060e732da9e29d10cf99")); 
executeTest("test Multiple SNP alleles", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "ebb42960e115fb8dacd3edff5541b4da"; + private final static String COMPRESSED_OUTPUT_MD5 = "712e87db5e278e92bd36e96d377303c6"; @Test public void testCompressedOutput() { @@ -139,7 +139,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("91f7e112200ed2c3b0a5d0d9e16e9369")); + Arrays.asList("f73dec2e77f14c170f7b6a8eee5793ff")); executeTest("test min_base_quality_score 26", spec); } @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("b86e52b18496ab43a6b9a1bda632b5e6")); + Arrays.asList("da7a5a3aa1c9f401896c34199c535954")); executeTest("test SLOD", spec); } @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("79b3e4f8b4476ce3c3acbc271d6ddcdc")); + Arrays.asList("07f5962f790673a1299f3a0f56579b65")); executeTest("test NDA", spec); } @@ -163,23 +163,23 @@ public 
class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("bf7f21a600956eda0a357b97b21e3069")); + Arrays.asList("22037eac40a3b1df3086c2d7b27f0d5f")); executeTest("test using comp track", spec); } @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "976109543d8d97d94e0fe0521ff326e8"); + testOutputParameters("-sites_only", "92db524b334f1416e595c711abc2d798"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "bec7bcc50b42782e20a970db11201399"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "09494afd12cef97293ed35d1a972f623"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f"); } private void testOutputParameters(final String args, final String md5) { @@ -193,7 +193,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("e94be02fc5484c20b512840884e3d463")); + Arrays.asList("7326eb84d8418546a408b68839a0a47e")); executeTest("test confidence 1", spec1); } @@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + 
validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("e94be02fc5484c20b512840884e3d463")); + Arrays.asList("7326eb84d8418546a408b68839a0a47e")); executeTest("test confidence 2", spec2); } @@ -212,12 +212,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "0dca2699f709793026b853c6f339bf08" ); + testHeterozosity( 0.01, "7aed8361e692eff559e6bca88752db0d" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "35f14e436927e64712a8e28080e90c91" ); + testHeterozosity( 1.0 / 1850, "989e65bb7337117d31cd615163a8ac84" ); } private void testHeterozosity(final double arg, final String md5) { @@ -241,7 +241,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("0360b79163aa28ae66d0dde4c26b3d76")); + Arrays.asList("c155587aa0410f43d7ccc57e1ae09a68")); executeTest(String.format("test multiple technologies"), spec); } @@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("59892388916bdfa544750ab76e43eabb")); + Arrays.asList("0748a711c6154f8d85847afb79aead94")); executeTest(String.format("test calling with BAQ"), spec); } From 67d348a31d6ead966e207eec81fc8701c9b05181 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 28 Aug 2012 10:16:49 -0400 Subject: [PATCH 053/161] Retiring the alignment walkers and related integration test since we don't want to support them anymore. 
--- .../sting/alignment/AlignmentWalker.java | 139 ------------------ .../sting/alignment/CountBestAlignments.java | 132 ----------------- .../alignment/AlignerIntegrationTest.java | 27 ---- 3 files changed, 298 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java delete mode 100644 public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java delete mode 100644 public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java deleted file mode 100644 index 6206fc2ce..000000000 --- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.alignment; - -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.File; - -/** - * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format. - * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation. - * - * @author mhanna - * @version 0.1 - */ -@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@WalkerName("Align") -public class AlignmentWalker extends ReadWalker { - @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " + - "generated by bwa index -d bwtsw. If unspecified, will default " + - "to the reference specified via the -R argument.",required=false) - private File targetReferenceFile = null; - - @Output - private StingSAMFileWriter out = null; - - /** - * The actual aligner. 
- */ - private BWACAligner aligner = null; - - /** - * New header to use, if desired. - */ - private SAMFileHeader header; - - /** - * Create an aligner object. The aligner object will load and hold the BWT until close() is called. - */ - @Override - public void initialize() { - if(targetReferenceFile == null) - targetReferenceFile = getToolkit().getArguments().referenceFile; - BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath()); - BWAConfiguration configuration = new BWAConfiguration(); - aligner = new BWACAligner(bwtFiles,configuration); - - // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted. - header = getToolkit().getSAMFileHeader().clone(); - SAMSequenceDictionary referenceDictionary = - ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary(); - header.setSequenceDictionary(referenceDictionary); - header.setSortOrder(SAMFileHeader.SortOrder.unsorted); - - out.writeHeader(header); - } - - /** - * Aligns a read to the given reference. - * - * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. - * @param read Read to align. - * @return Number of alignments found for this read. - */ - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { - SAMRecord alignedRead = aligner.align(read,header); - out.addAlignment(alignedRead); - return 1; - } - - /** - * Initial value for reduce. In this case, alignments will be counted. - * @return 0, indicating no alignments yet found. - */ - @Override - public Integer reduceInit() { return 0; } - - /** - * Calculates the number of alignments found. - * @param value Number of alignments found by this map. - * @param sum Number of alignments found before this map. - * @return Number of alignments found up to and including this map. 
- */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - /** - * Cleanup. - * @param result Number of reads processed. - */ - @Override - public void onTraversalDone(Integer result) { - aligner.close(); - super.onTraversalDone(result); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java deleted file mode 100644 index 336c95d42..000000000 --- a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.alignment; - -import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; -import org.broadinstitute.sting.alignment.bwa.BWTFiles; -import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.Iterator; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the - * frequency of that number of placements. - * - * @author mhanna - * @version 0.1 - */ -@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountBestAlignments extends ReadWalker { - /** - * The supporting BWT index generated using BWT. - */ - @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) - private String prefix = null; - - @Output - private PrintStream out = null; - - /** - * The actual aligner. - */ - private Aligner aligner = null; - - private SortedMap alignmentFrequencies = new TreeMap(); - - /** - * Create an aligner object. The aligner object will load and hold the BWT until close() is called. 
- */ - @Override - public void initialize() { - if(prefix == null) - prefix = getToolkit().getArguments().referenceFile.getAbsolutePath(); - BWTFiles bwtFiles = new BWTFiles(prefix); - BWAConfiguration configuration = new BWAConfiguration(); - aligner = new BWACAligner(bwtFiles,configuration); - } - - /** - * Aligns a read to the given reference. - * - * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. - * @param read Read to align. - * @return Number of alignments found for this read. - */ - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { - Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator(); - if(alignmentIterator.hasNext()) { - int numAlignments = alignmentIterator.next().length; - if(alignmentFrequencies.containsKey(numAlignments)) - alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1); - else - alignmentFrequencies.put(numAlignments,1); - } - return 1; - } - - /** - * Initial value for reduce. In this case, validated reads will be counted. - * @return 0, indicating no reads yet validated. - */ - @Override - public Integer reduceInit() { return 0; } - - /** - * Calculates the number of reads processed. - * @param value Number of reads processed by this map. - * @param sum Number of reads processed before this map. - * @return Number of reads processed up to and including this map. - */ - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - /** - * Cleanup. - * @param result Number of reads processed. 
- */ - @Override - public void onTraversalDone(Integer result) { - aligner.close(); - for(Map.Entry alignmentFrequency: alignmentFrequencies.entrySet()) - out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue()); - super.onTraversalDone(result); - } -} diff --git a/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java deleted file mode 100644 index a6af034cb..000000000 --- a/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.alignment; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.WalkerTest; - -import java.util.Arrays; - -/** - * Integration tests for the aligner. - * - * @author mhanna - * @version 0.1 - */ -public class AlignerIntegrationTest extends WalkerTest { - @Test - public void testBasicAlignment() { - String md5 = "a2bdf907b18114a86ca47f9fc23791bf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + GATKDataLocation + "bwa/human_b36_both.fasta" + - " -T Align" + - " -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testBasicAlignment", spec); - } -} From e74c527d47410e7a3a240366783996878ab1f820 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 28 Aug 2012 10:19:18 -0400 Subject: [PATCH 054/161] Register the depricated walkers as depricated starting in v2.2 so that users get a helpful error message --- .../src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index c8dbb090d..00614b9aa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -283,6 +283,8 @@ public class GenomeAnalysisEngine { static { deprecatedGATKWalkers.put("CountCovariates", "2.0"); deprecatedGATKWalkers.put("TableRecalibration", "2.0"); + deprecatedGATKWalkers.put("AlignmentWalker", "2.2"); + deprecatedGATKWalkers.put("CountBestAlignments", "2.2"); } /** From 18eca3544e123373e8b7b54e1ec2252f072c4dcf Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 28 Aug 2012 15:24:20 -0400 Subject: [PATCH 055/161] Initial commit of the delocalized BQSR written as a read walker. --- .../bqsr/AdvancedRecalibrationEngine.java | 51 +++++++++++++++++++ .../walkers/bqsr/RecalibrationEngine.java | 2 + .../bqsr/StandardRecalibrationEngine.java | 10 ++++ .../utils/recalibration/QuantizationInfo.java | 2 +- .../sting/utils/recalibration/RecalDatum.java | 29 +++++------ .../utils/recalibration/RecalDatumNode.java | 6 +-- .../sting/utils/recalibration/RecalUtils.java | 4 +- 7 files changed, 83 insertions(+), 21 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index d714ca185..e6be01b82 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -34,17 +34,20 @@ import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { // optimizations: don't reallocate an array each time private byte[] 
tempQualArray; private boolean[] tempErrorArray; + private double[] tempFractionalErrorArray; public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { super.initialize(covariates, recalibrationTables); tempQualArray = new byte[EventType.values().length]; tempErrorArray = new boolean[EventType.values().length]; + tempFractionalErrorArray = new double[EventType.values().length]; } /** @@ -56,6 +59,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp * @param pileupElement The pileup element to update * @param refBase The reference base at this locus */ + @Override public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); @@ -100,4 +104,51 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp } } } + + @Override + public synchronized void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); + + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; + tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; + tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; + + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = 
eventType.index; + final byte qual = tempQualArray[eventIndex]; + final double isError = tempFractionalErrorArray[eventIndex]; + + final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); + final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); + final RecalDatum rgThisDatum = createDatumObject(qual, isError); + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + rgRecalTable.put(rgThisDatum, keys[0], eventIndex); + else + rgPreviousDatum.combine(rgThisDatum); + + final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); + if (qualPreviousDatum == null) + qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); + else + qualPreviousDatum.increment(1.0, isError); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); + final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); + if (covPreviousDatum == null) + covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); + else + covPreviousDatum.increment(1.0, isError); + } + } + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 38e306939..ab65c1462 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import 
org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -34,4 +35,5 @@ public interface RecalibrationEngine { public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase); + public void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index aec1bf7a8..5459e9cfa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -54,6 +54,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param pileupElement The pileup element to update * @param refBase The reference base at this locus */ + @Override public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); @@ -91,6 +92,11 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP } } + @Override + public synchronized void updateDataForRead( final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); + } + /** * creates a datum object with one observation and one or zero error * @@ -102,6 +108,10 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP return new 
RecalDatum(1, isError ? 1:0, reportedQual); } + protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } + /** * Get the covariate key set from a read * diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index 2b67d12a9..f1f702a38 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -39,7 +39,7 @@ public class QuantizationInfo { for (final RecalDatum value : qualTable.getAllValues()) { final RecalDatum datum = value; final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += datum.getNumObservations(); // add the number of observations for every key + qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key } empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 8c8815b54..9794e7b4e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; 
import java.util.Random; @@ -68,12 +67,12 @@ public class RecalDatum { /** * number of bases seen in total */ - private long numObservations; + private double numObservations; /** * number of bases seen that didn't match the reference */ - private long numMismatches; + private double numMismatches; /** * used when calculating empirical qualities to avoid division by zero @@ -93,7 +92,7 @@ public class RecalDatum { * @param _numMismatches * @param reportedQuality */ - public RecalDatum(final long _numObservations, final long _numMismatches, final byte reportedQuality) { + public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) { if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); @@ -167,9 +166,9 @@ public class RecalDatum { return 0.0; else { // cache the value so we don't call log over and over again - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; // smoothing is one error and one non-error observation, for example - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); + final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; return doubleMismatches / doubleObservations; } } @@ -200,7 +199,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality())); + return String.format("%d,%d,%d", Math.round(getNumObservations()), Math.round(getNumMismatches()), (byte) Math.floor(getEmpiricalQuality())); } public String stringForCSV() { @@ -229,42 +228,42 @@ public class RecalDatum { // 
//--------------------------------------------------------------------------------------------------------------- - public long getNumObservations() { + public double getNumObservations() { return numObservations; } - public synchronized void setNumObservations(final long numObservations) { + public synchronized void setNumObservations(final double numObservations) { if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; } - public long getNumMismatches() { + public double getNumMismatches() { return numMismatches; } @Requires({"numMismatches >= 0"}) - public synchronized void setNumMismatches(final long numMismatches) { + public synchronized void setNumMismatches(final double numMismatches) { if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); this.numMismatches = numMismatches; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumObservations(final long by) { + public synchronized void incrementNumObservations(final double by) { numObservations += by; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumMismatches(final long by) { + public synchronized void incrementNumMismatches(final double by) { numMismatches += by; empiricalQuality = UNINITIALIZED; } @Requires({"incObservations >= 0", "incMismatches >= 0"}) @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public synchronized void increment(final long incObservations, final long incMismatches) { + public synchronized void increment(final double incObservations, final double incMismatches) { incrementNumObservations(incObservations); incrementNumMismatches(incMismatches); } @@ -300,6 +299,6 @@ public class RecalDatum { */ @Ensures("result >= 0.0") private double calcExpectedErrors() { - return (double) 
getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); + return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 41e96222c..6c94c3c42 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -263,14 +263,14 @@ public class RecalDatumNode { int i = 0; for ( final RecalDatumNode subnode : subnodes ) { // use the yates correction to help avoid all zeros => NaN - counts[i][0] = subnode.getRecalDatum().getNumMismatches() + 1; - counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2; + counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; + counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L; i++; } try { final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); - final double penalty = -10 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); + final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); // make sure things are reasonable and fail early if not if (Double.isInfinite(penalty) || Double.isNaN(penalty)) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 8a9143c89..8d2e799a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -317,8 +317,8 @@ public class RecalUtils { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) reportTable.set(rowIndex, 
columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); - reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), Math.round(datum.getNumObservations())); + reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), Math.round(datum.getNumMismatches())); rowIndex++; } From 6d6ca090c694304a8ebbda84e66751a4cc467282 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 28 Aug 2012 16:00:52 -0400 Subject: [PATCH 056/161] RecalDatums now hold doubles so the test for equality needs an epsilon. --- .../sting/utils/recalibration/RecalDatumUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 33985e0ac..715acad03 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -100,8 +100,8 @@ public class RecalDatumUnitTest extends BaseTest { } private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { - Assert.assertEquals(datum.getNumMismatches(), cfg.exError); - Assert.assertEquals(datum.getNumObservations(), cfg.exTotal); + Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); + Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); if ( cfg.getReportedQual() != -1 ) Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled()); From 
e12ae65d33b3e6fd009fcd47eef3f90ed4e75a12 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 29 Aug 2012 11:27:45 -0400 Subject: [PATCH 059/161] Changing the commenting style in the BQSR --- .../bqsr/AdvancedRecalibrationEngine.java | 4 +- .../gatk/walkers/annotator/FisherStrand.java | 4 -- .../gatk/walkers/bqsr/BaseRecalibrator.java | 40 ++++++------ .../bqsr/StandardRecalibrationEngine.java | 2 +- .../recalibration/BaseRecalibration.java | 30 ++++----- .../utils/recalibration/QuantizationInfo.java | 12 ++-- .../sting/utils/recalibration/RecalUtils.java | 63 +++++++++---------- .../recalibration/RecalibrationReport.java | 25 ++++---- .../covariates/ContextCovariate.java | 17 ++--- .../covariates/CycleCovariate.java | 10 +-- 10 files changed, 105 insertions(+), 102 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index e6be01b82..e5c952b76 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -80,7 +80,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it rgRecalTable.put(rgThisDatum, keys[0], eventIndex); else rgPreviousDatum.combine(rgThisDatum); @@ -126,7 +126,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final 
NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it rgRecalTable.put(rgThisDatum, keys[0], eventIndex); else rgPreviousDatum.combine(rgThisDatum); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index dee470cb3..e95af71c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -32,13 +32,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -299,6 +297,4 @@ 
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } - - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index ea9d0976a..30d2e24ef 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -106,26 +106,26 @@ import java.util.ArrayList; @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @By(DataSource.READS) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file -@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality -@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file +@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality +@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta public class BaseRecalibrator extends LocusWalker implements TreeReducible { @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality 
score quantization + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization private RecalibrationTables recalibrationTables; - private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) + private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) private RecalibrationEngine recalibrationEngine; private int minimumQToUse; - protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. - protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. - protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. + protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. + protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. 
Please provide a VCF file containing known sites of genetic variation."; @@ -143,16 +143,16 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed if (RAC.FORCE_PLATFORM != null) RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; - if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified + if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); if (RAC.LIST_ONLY) { RecalUtils.listAvailableCovariates(logger); System.exit(0); } - RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); @@ -164,9 +164,9 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed requestedCovariates[covariateIndex++] = covariate; logger.info("The covariates being used here: "); - for (Covariate cov : requestedCovariates) { // list all the covariates being used + for (Covariate cov : requestedCovariates) { // list all the covariates being used logger.info("\t" + cov.getClass().getSimpleName()); - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection } int numReadGroups = 0; @@ -216,12 +216,14 @@ public class BaseRecalibrator extends 
LocusWalker implements TreeRed */ public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { long countedSites = 0L; - if (tracker.getValues(RAC.knownSites).size() == 0) { // Only analyze sites not present in the provided known sites + // Only analyze sites not present in the provided known sites + if (tracker.getValues(RAC.knownSites).size() == 0) { for (final PileupElement p : context.getBasePileup()) { final GATKSAMRecord read = p.getRead(); final int offset = p.getOffset(); - if (readHasBeenSkipped(read) || isLowQualityBase(read, offset)) // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases) + // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases) + if (readHasBeenSkipped(read) || isLowQualityBase(read, offset)) continue; if (readNotSeen(read)) { @@ -234,10 +236,12 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); } - if (!ReadUtils.isSOLiDRead(read) || // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if (!ReadUtils.isSOLiDRead(read) || RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING || RecalUtils.isColorSpaceConsistent(read, offset)) - recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); // This base finally passed all the checks for a good base, so add it to the big data hashmap + // This base finally passed all the checks for a good base, so add it to the big data hashmap + recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); } countedSites++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 5459e9cfa..76a82a134 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -68,7 +68,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it rgRecalTable.put(rgThisDatum, keys[0], eventIndex); else rgPreviousDatum.combine(rgThisDatum); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index c09eb0063..a563b18fc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -48,9 +48,9 @@ public class BaseRecalibration { private final static int MAXIMUM_RECALIBRATED_READ_LENGTH = 5000; private final ReadCovariates readCovariates; - private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) + private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private final RecalibrationTables recalibrationTables; - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private 
final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation private final boolean disableIndelQuals; private final int preserveQLessThan; @@ -76,9 +76,9 @@ public class BaseRecalibration { recalibrationTables = recalibrationReport.getRecalibrationTables(); requestedCovariates = recalibrationReport.getRequestedCovariates(); quantizationInfo = recalibrationReport.getQuantizationInfo(); - if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. 
quantizationInfo.quantizeQualityScores(quantizationLevels); readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); @@ -103,24 +103,26 @@ public class BaseRecalibration { } } - RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read - for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + // Compute all covariates for the read + RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); + + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { read.setBaseQualities(null, errorModel); continue; } final byte[] quals = read.getBaseQualities(errorModel); - final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model final int readLength = read.getReadLength(); - for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read final byte originalQualityScore = quals[offset]; - if (originalQualityScore >= preserveQLessThan) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error model - final byte recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + if (originalQualityScore >= preserveQLessThan) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error 
model + final byte recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base quals[offset] = recalibratedQualityScore; } } @@ -152,10 +154,10 @@ public class BaseRecalibration { final double deltaQReported = calculateDeltaQReported(recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE), key, errorModel, globalDeltaQ, qualFromRead); final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables, key, errorModel, globalDeltaQ, deltaQReported, qualFromRead); - double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula - recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL + double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula + recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL - return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality + return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } private double calculateGlobalDeltaQ(final NestedIntegerArray table, final int[] key, final EventType errorModel) { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index f1f702a38..d3c6c3d83 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -30,7 
+30,7 @@ public class QuantizationInfo { } public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { - final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution + final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution for (int i = 0; i < qualHistogram.length; i++) qualHistogram[i] = 0L; @@ -38,10 +38,10 @@ public class QuantizationInfo { for (final RecalDatum value : qualTable.getAllValues()) { final RecalDatum datum = value; - final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key } - empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities + empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); this.quantizationLevels = quantizationLevels; @@ -49,8 +49,8 @@ public class QuantizationInfo { public void quantizeQualityScores(int nLevels) { - QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels - quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) + QualQuantizer quantizer = new 
QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels + quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) } public void noQuantization() { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 8d2e799a0..20aabdb83 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -81,8 +81,8 @@ public class RecalUtils { public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; private static final String SCRIPT_FILE = "BQSR.R"; @@ -111,12 +111,13 @@ public class RecalUtils { final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + final ArrayList requiredCovariates = 
addRequiredCovariatesToList(requiredClasses); // add the required covariates ArrayList optionalCovariates = new ArrayList(); if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified + // parse the -cov arguments that were provided, skipping over the ones already specified + if (argumentCollection.COVARIATES != null) { for (String requestedCovariateString : argumentCollection.COVARIATES) { // help the transition from BQSR v1 to BQSR v2 if ( requestedCovariateString.equals("DinucCovariate") ) @@ -126,12 +127,12 @@ public class RecalUtils { boolean foundClass = false; for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class foundClass = true; if (!requiredClasses.contains(covClass) && (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { try { - final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it optionalCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); @@ -161,7 +162,7 @@ public class RecalUtils { if (classes.size() != 2) throw new ReviewedStingException("The number of required covariates has changed, this is a hard change 
in the code and needs to be inspected"); - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. dest.add(new QualityScoreCovariate()); return dest; } @@ -266,20 +267,20 @@ public class RecalUtils { for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.index) { - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { columnNames.add(covariateValue); columnNames.add(covariateName); } } - columnNames.add(eventType); // the order of these column names is important here + columnNames.add(eventType); // the order of these column names is important here columnNames.add(empiricalQuality); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) - columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported columnNames.add(nObservations); columnNames.add(nErrors); @@ -288,7 +289,7 @@ 
public class RecalUtils { reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size()); for (final Pair columnName : columnNames) reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); - rowIndex = 0; // reset the row index since we're starting with a new table + rowIndex = 0; // reset the row index since we're starting with a new table } else { reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index); } @@ -316,7 +317,7 @@ public class RecalUtils { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), Math.round(datum.getNumObservations())); reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), Math.round(datum.getNumMismatches())); @@ -349,7 +350,6 @@ public class RecalUtils { return Utils.join(",", names); } - public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); } @@ -410,13 +410,13 @@ public class RecalUtils { // add the quality score table to the delta table final NestedIntegerArray qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - for (final NestedIntegerArray.Leaf leaf : 
qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table final int[] newCovs = new int[4]; newCovs[0] = leaf.keys[0]; - newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore newCovs[2] = leaf.keys[1]; newCovs[3] = leaf.keys[2]; - addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table + addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table } // add the optional covariates to the delta table @@ -425,10 +425,10 @@ public class RecalUtils { for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { final int[] covs = new int[4]; covs[0] = leaf.keys[0]; - covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) covs[2] = leaf.keys[2]; covs[3] = leaf.keys[3]; - addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table + addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table } } @@ -486,11 +486,11 @@ public class RecalUtils { */ private static void addToDeltaTable(final NestedHashMap deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { Object[] wrappedKey = wrapKeys(deltaKey); - final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for this key + final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for 
this key if (deltaDatum == null) - deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum + deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum else - deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. + deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. } private static Object[] wrapKeys(final int[] keys) { @@ -539,10 +539,11 @@ public class RecalUtils { * @return true if this read is consistent or false if this read should be skipped */ public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { - if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base + if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base return true; - if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read + // Haven't calculated the inconsistency array yet for this read + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; @@ -562,13 +563,13 @@ public class RecalUtils { } } - byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read if (read.getReadNegativeStrandFlag()) readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); final byte[] inconsistency = new byte[readBases.length]; int i; - byte prevBase = colorSpace[0]; // The sentinel + byte prevBase = colorSpace[0]; // The sentinel for (i = 0; i < readBases.length; i++) { final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); @@ -576,11 +577,11 @@ public class RecalUtils { } read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); } - else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); else - return false; // otherwise, just skip the read + return false; // otherwise, just skip the read } return true; @@ -774,6 +775,4 @@ public class RecalUtils { return base; } } - - } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index e6ab9e38b..271c07649 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -19,13 +19,13 @@ import java.util.*; * @since 3/26/12 */ public class RecalibrationReport { - private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; // quick access reference to the tables - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; // quick access reference to the tables + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation private final HashMap optionalCovariateIndexes; - private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; 
// necessary for quantizing qualities with the same parameter private final int[] tempRGarray = new int[2]; private final int[] tempQUALarray = new int[3]; @@ -40,7 +40,7 @@ public class RecalibrationReport { GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); quantizationInfo = initializeQuantizationTable(quantizedTable); - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; @@ -50,13 +50,13 @@ public class RecalibrationReport { requestedCovariates[covariateIndex++] = covariate; for (final Covariate covariate : optionalCovariates) { requestedCovariates[covariateIndex] = covariate; - final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport optionalCovariateIndexes.put(covariateName, covariateIndex-2); covariateIndex++; } for (Covariate cov : requestedCovariates) - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE))); @@ -198,9 +198,10 @@ public class RecalibrationReport { final long nErrors = (Long) reportTable.get(row, 
RecalUtils.NUMBER_ERRORS_COLUMN_NAME); final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); - final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table - (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + // the estimatedQreported column only exists in the ReadGroup table + final double estimatedQReported = hasEstimatedQReportedColumn ? + (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); datum.setEstimatedQReported(estimatedQReported); @@ -242,7 +243,7 @@ public class RecalibrationReport { final String argument = table.get(i, "Argument").toString(); Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); if (value.equals("null")) - value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport + value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport if (argument.equals("covariate") && value != null) RAC.COVARIATES = value.toString().split(","); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java index 570944245..5e470b35f 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java @@ -87,7 +87,8 @@ public class ContextCovariate implements StandardCovariate { // store the original bases and then write Ns over low quality ones final byte[] originalBases = read.getReadBases().clone(); - final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context + // Write N's over the low quality tail of the reads to avoid adding them into the context + final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); byte[] bases = clippedRead.getReadBases(); @@ -115,7 +116,7 @@ public class ContextCovariate implements StandardCovariate { @Override public String formatKey(final int key) { - if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file + if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file return null; return contextFromKey(key); @@ -176,9 +177,9 @@ public class ContextCovariate implements StandardCovariate { for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); - if (baseIndex == -1) { // ignore non-ACGT bases + if (baseIndex == -1) { // ignore non-ACGT bases currentNPenalty = contextSize; - currentKey = 0; // reset the key + currentKey = 0; // reset the key } else { // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in currentKey = (currentKey >> 2) & mask; @@ -215,7 +216,7 @@ public class 
ContextCovariate implements StandardCovariate { int bitOffset = LENGTH_BITS; for (int i = start; i < end; i++) { final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); - if (baseIndex == -1) // ignore non-ACGT bases + if (baseIndex == -1) // ignore non-ACGT bases return -1; key |= (baseIndex << bitOffset); bitOffset += 2; @@ -233,15 +234,15 @@ public class ContextCovariate implements StandardCovariate { if (key < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?"); - final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context - int mask = 48; // use the mask to pull out bases + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases int offset = LENGTH_BITS; StringBuilder dna = new StringBuilder(); for (int i = 0; i < length; i++) { final int baseIndex = (key & mask) >> offset; dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); - mask = mask << 2; // move the mask over to the next 2 bits + mask = mask << 2; // move the mask over to the next 2 bits offset += 2; } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index cdf12d284..5d0d94b69 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -108,7 +108,7 @@ public class CycleCovariate implements StandardCovariate { // the current sequential model would consider the effects independently instead of jointly. final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - int cycle = multiplyByNegative1 ? 
-1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. + int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one @@ -201,9 +201,9 @@ public class CycleCovariate implements StandardCovariate { @Override public String formatKey(final int key) { - int cycle = key >> 1; // shift so we can remove the "sign" bit - if ( (key & 1) != 0 ) // is the last bit set? - cycle *= -1; // then the cycle is negative + int cycle = key >> 1; // shift so we can remove the "sign" bit + if ( (key & 1) != 0 ) // is the last bit set? + cycle *= -1; // then the cycle is negative return String.format("%d", cycle); } @@ -222,7 +222,7 @@ public class CycleCovariate implements StandardCovariate { int result = Math.abs(cycle); result = result << 1; // shift so we can add the "sign" bit if ( cycle < 0 ) - result++; // negative cycles get the lower-most bit set + result++; // negative cycles get the lower-most bit set return result; } } \ No newline at end of file From 69b56e11c8f83621c1419e81598e8efbf6f6d406 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 28 Aug 2012 13:33:28 -0400 Subject: [PATCH 060/161] ReadClipper won't modify the original read Reverting back to the original implementation, but now including write N's and write Q0's due to walkers that look at the same read multiple times in different reference windows --- .../sting/utils/clipping/ClippingOp.java | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 
08c50b982..91414d8fe 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -37,34 +37,60 @@ public class ClippingOp { * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * - * @param algorithm - * @param read + * @param algorithm clipping algorithm to use + * @param originalRead the read to be clipped */ - public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord read) { + public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { + GATKSAMRecord read; + try { + read = (GATKSAMRecord) originalRead.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } byte[] quals = read.getBaseQualities(); byte[] bases = read.getReadBases(); + byte[] newBases = new byte[bases.length]; + byte[] newQuals = new byte[quals.length]; switch (algorithm) { // important note: // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 // because you're not guaranteed to get a pointer to the actual array of bytes in the GATKSAMRecord case WRITE_NS: - for (int i = start; i <= stop; i++) - bases[i] = 'N'; - read.setReadBases(bases); + for (int i = 0; i < bases.length; i++) { + if (i >= start && i <= stop) { + newBases[i] = 'N'; + } + else { + newBases[i] = bases[i]; + } + } + read.setReadBases(newBases); break; case WRITE_Q0S: - for (int i = start; i <= stop; i++) - quals[i] = 0; - read.setBaseQualities(quals); + for (int i = 0; i < quals.length; i++) { + if (i >= start && i <= stop) { + newQuals[i] = 0; + } + else { + newQuals[i] = quals[i]; + } + } + read.setBaseQualities(newQuals); break; case WRITE_NS_Q0S: - for (int i = start; i <= stop; i++) { - bases[i] = 'N'; - quals[i] = 0; + for (int i = 0; i < bases.length; i++) { + if (i >= start 
&& i <= stop) { + newQuals[i] = 0; + newBases[i] = 'N'; + } + else { + newQuals[i] = quals[i]; + newBases[i] = bases[i]; + } } - read.setReadBases(bases); - read.setBaseQualities(quals); + read.setBaseQualities(newBases); + read.setReadBases(newBases); break; case HARDCLIP_BASES: read = hardClip(read, start, stop); @@ -437,8 +463,8 @@ public class ClippingOp { * Checks if a hard clipped cigar left a read starting or ending with insertions/deletions * and cleans it up accordingly. * - * @param cigar - * @return + * @param cigar the original cigar + * @return an object with the shifts (see CigarShift class) */ private CigarShift cleanHardClippedCigar(Cigar cigar) { Cigar cleanCigar = new Cigar(); From ce55ba98f4b1fec0c84047168a8edda0cc94a033 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 15:01:11 -0400 Subject: [PATCH 062/161] Don't try to left align indels in unmapped reads (which for some reason can still have CIGARs) because the ref context is null. --- .../broadinstitute/sting/gatk/contexts/ReferenceContext.java | 2 +- .../sting/gatk/walkers/indels/LeftAlignIndels.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index 1290319e2..af330bba9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -177,7 +177,7 @@ public class ReferenceContext { * @return The base at the given locus from the reference. 
*/ public byte getBase() { - return getBases()[(int)(locus.getStart() - window.getStart())]; + return getBases()[(locus.getStart() - window.getStart())]; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index b08def44f..6b9bd04d2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -82,7 +82,7 @@ public class LeftAlignIndels extends ReadWalker { public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { + if ( read.getReadUnmappedFlag() || read.getCigar().numCigarElements() == 0 ) { emit(read); return 0; } From 150a9692797744d4b0147d6f49b07d3522b8ec22 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 15:13:28 -0400 Subject: [PATCH 063/161] Be careful with String manipulation when constructing alleles in SomaticIndelDetector --- .../sting/gatk/walkers/indels/SomaticIndelDetector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index b0c09f78e..3965a63fb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -1181,10 +1181,10 @@ public class SomaticIndelDetector extends ReadWalker { if ( event_length == 0 ) { // insertion l.add( Allele.create(referencePaddingBase,true) ); - l.add( Allele.create(referencePaddingBase + call.getVariant().getBases(), false )); + l.add( Allele.create((char)referencePaddingBase + new 
String(call.getVariant().getBases()), false )); } else { //deletion: - l.add( Allele.create(referencePaddingBase + call.getVariant().getBases(), true )); + l.add( Allele.create((char)referencePaddingBase + new String(call.getVariant().getBases()), true )); l.add( Allele.create(referencePaddingBase,false) ); } } From 1acf0f0b2cd62c16e35d496c1eb0d23f9b9c480f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Aug 2012 22:36:27 -0400 Subject: [PATCH 064/161] Fixing bug in fasta .fai generation: trim the contig names to the first whitespace if one appears. We now generate indexes identical to samtools. --- .../sf/picard/reference/FastaSequenceIndexBuilder.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java index 10326ef2e..507d4b786 100644 --- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java +++ b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java @@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder { * Reset iterators and add contig to sequence index */ private void finishReadingContig(FastaSequenceIndex sequenceIndex) { - sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); + sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); status = Status.NONE; contig = ""; size = 0; @@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder { } } + /* + * Trims the contig name to the expected value by removing any characters after the first whitespace + */ + private static String trimContigName(final String contigName) { + int whitespaceIndex = contigName.indexOf(' '); + return ( whitespaceIndex == -1 ) ? 
contigName : contigName.substring(0, whitespaceIndex); + } + /** * Stores FastaSequenceIndex as a .fasta.fai file on local machine * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder From 35baf0b15542b77dcaf702f2b3c1d990bbad4e27 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 09:07:36 -0400 Subject: [PATCH 065/161] This along with Mauricio's previous commit (thanks!) fixes GSA-522. There are no longer any modifications to reads in the map calls of ActiveRegion walkers. Added the bam which identified this error as a new integration test. --- .../HaplotypeCallerIntegrationTest.java | 8 ++++++++ .../gatk/traversals/TraverseActiveRegions.java | 1 + .../sting/utils/sam/GATKSAMRecord.java | 16 +++++++--------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 2ae1f2ca5..c1a1e065a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -66,4 +66,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testHaplotypeCallerSingleSampleIndelQualityScores() { HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970"); } + + @Test + public void HCTestProblematicReadsModifiedInActiveRegions() { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5")); + 
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 67de427e8..af981e676 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -241,6 +241,7 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c9b3a2df8..53e6dc0dc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -228,8 +228,7 @@ public class GATKSAMRecord extends BAMRecord { if( quals == null ) { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_INSERTION); + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } return quals; } @@ -246,7 +245,6 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will 
be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_DELETION); } return quals; } @@ -262,7 +260,7 @@ public class GATKSAMRecord extends BAMRecord { public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { mReadGroup = readGroup; retrievedReadGroup = true; - setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! } /////////////////////////////////////////////////////////////////////////////// @@ -367,15 +365,15 @@ public class GATKSAMRecord extends BAMRecord { * Clears all attributes except ReadGroup of the read. */ public GATKSAMRecord simplify () { - GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information + GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? null : getBaseInsertionQualities(); byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? 
null : getBaseDeletionQualities(); - this.clearAttributes(); // clear all attributes from the read - this.setReadGroup(rg); // restore read group + this.clearAttributes(); // clear all attributes from the read + this.setReadGroup(rg); // restore read group if (insQuals != null) - this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any + this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any if (delQuals != null) - this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any + this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any return this; } From 57d997f06f9286aae0ff2c59eeab5dbaa2a44d88 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 10:10:43 -0400 Subject: [PATCH 066/161] Fixing bug from when FragmentUtils merging function moved over to the soft clipped start instead of the unclipped start --- .../HaplotypeCallerIntegrationTest.java | 1 - .../sting/utils/fragments/FragmentUtils.java | 26 +++---------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c1a1e065a..b5359af46 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -73,5 +73,4 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("000fd36d5cf8090386bb2ac15e3ab0b5")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } - } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 2f31c154c..a4a5d578a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -128,22 +128,13 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { final byte MIN_QUAL_BAD_OVERLAP = 16; if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } GATKSAMRecord firstRead = overlappingPair.get(0); GATKSAMRecord secondRead = overlappingPair.get(1); - /* - System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart()); - System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd()); - System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart()); - System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd()); - System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart()); - System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd()); - System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart()); - System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd()); - */ + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { firstRead = overlappingPair.get(1); // swap them secondRead = overlappingPair.get(0); @@ -155,15 +146,6 @@ public class FragmentUtils { return overlappingPair; // fragments contain indels so 
don't merge them } -/* // check for inconsistent start positions between uncliped/soft alignment starts - if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart()) - return overlappingPair; - if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart()) - return overlappingPair; - - if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd()) - return overlappingPair; - */ final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); @@ -183,7 +165,7 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on @@ -197,7 +179,7 @@ public class FragmentUtils { } final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); - returnRead.setAlignmentStart( firstRead.getUnclippedStart() ); + returnRead.setAlignmentStart( firstRead.getSoftStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); From 
8fc6a0a68b8073c1ec83e3bf983c18c60d13a016 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 09:25:05 -0400 Subject: [PATCH 068/161] Cleanup RefMetaDataTracker before refactoring ReadMetaDataTracker --- .../sting/commandline/RodBinding.java | 9 +++ .../ManagingReferenceOrderedView.java | 2 +- .../datasources/providers/RodLocusView.java | 2 +- .../gatk/refdata/RefMetaDataTracker.java | 69 ++++++++----------- .../ReferenceOrderedViewUnitTest.java | 7 +- .../refdata/RefMetaDataTrackerUnitTest.java | 2 +- 6 files changed, 44 insertions(+), 47 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java index e0b1154c4..15d134fa2 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java +++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java @@ -117,6 +117,15 @@ public final class RodBinding { this.bound = true; } + /** + * For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName + * @param type + * @param rawName + */ + public RodBinding(Class type, final String rawName) { + this(type, rawName, "missing", type.getSimpleName(), new Tags()); + } + /** * Make an unbound RodBinding. 
Only available for creating the globally unique UNBOUND object * @param type class this unbound RodBinding creates diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index d065635c8..080ac6686 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { // todo -- warning, I removed the reference to the name from states bindings.add( state.iterator.seekForward(loc) ); - return new RefMetaDataTracker(bindings, referenceContext); + return new RefMetaDataTracker(bindings); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 54f8b44ed..4be7c63c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { // special case the interval again -- add it into the ROD if ( interval != null ) { allTracksHere.add(interval); } - return new RefMetaDataTracker(allTracksHere, referenceContext); + return new RefMetaDataTracker(allTracksHere); } public boolean hasNext() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 2c2ee51bb..7e32ec112 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -5,7 +5,6 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; @@ -32,11 +31,10 @@ import java.util.*; * Time: 3:05:23 PM */ public class RefMetaDataTracker { - // TODO: this should be a list, not a map, actually + // TODO: this should be a list, not a bindings, actually private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - final Map map; - final ReferenceContext ref; + final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); // ------------------------------------------------------------------------------------------ @@ -48,28 +46,25 @@ public class RefMetaDataTracker { // ------------------------------------------------------------------------------------------ /** - * Only for testing -- not accesssible in any other context + * Create an tracker with no bindings */ public RefMetaDataTracker() { - ref = null; - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); } - public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { - this.ref = ref; - - // set up the map + public RefMetaDataTracker(final Collection allBindings) { + // set up the bindings if ( allBindings.isEmpty() ) - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); else { - Map tmap = new HashMap(allBindings.size()); + final Map tmap = new HashMap(allBindings.size()); for ( RODRecordList rod : allBindings ) { if ( rod != null && ! 
rod.isEmpty() ) tmap.put(canonicalName(rod.getName()), rod); } - // ensure that no one modifies the map itself - map = Collections.unmodifiableMap(tmap); + // ensure that no one modifies the bindings itself + bindings = Collections.unmodifiableMap(tmap); } } @@ -99,7 +94,7 @@ public class RefMetaDataTracker { @Requires({"type != null"}) @Ensures("result != null") public List getValues(final Class type) { - return addValues(map.keySet(), type, new ArrayList(), null, false, false); + return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); } /** @@ -114,7 +109,7 @@ public class RefMetaDataTracker { @Requires({"type != null", "onlyAtThisLoc != null"}) @Ensures("result != null") public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(map.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); + return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); } /** @@ -296,7 +291,7 @@ public class RefMetaDataTracker { */ @Requires({"rodBinding != null"}) public boolean hasValues(final RodBinding rodBinding) { - return map.containsKey(canonicalName(rodBinding.getName())); + return bindings.containsKey(canonicalName(rodBinding.getName())); } /** @@ -306,7 +301,7 @@ public class RefMetaDataTracker { * @return List of all tracks */ public List getBoundRodTracks() { - return new ArrayList(map.values()); + return new ArrayList(bindings.values()); } /** @@ -314,38 +309,30 @@ public class RefMetaDataTracker { * @return the number of tracks with at least one bound Feature */ public int getNTracksWithBoundFeatures() { - return map.size(); + return bindings.size(); } // ------------------------------------------------------------------------------------------ - // - // - // old style accessors - // - // TODO -- DELETE ME - // - // + // Protected accessors using strings for unit testing // ------------------------------------------------------------------------------------------ - @Deprecated - 
public boolean hasValues(final String name) { - return map.containsKey(canonicalName(name)); + protected boolean hasValues(final String name) { + return bindings.containsKey(canonicalName(name)); } - @Deprecated - public List getValues(final Class type, final String name) { + protected List getValues(final Class type, final String name) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); } - @Deprecated - public List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } - @Deprecated - public T getFirstValue(final Class type, final String name) { + + protected T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } - @Deprecated - public T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return safeGetFirst(getValues(type, name, onlyAtThisLoc)); } @@ -366,7 +353,7 @@ public class RefMetaDataTracker { * @return */ @Requires({"l != null"}) - final private T safeGetFirst(final List l) { + private T safeGetFirst(final List l) { return l.isEmpty() ? null : l.get(0); } @@ -435,7 +422,7 @@ public class RefMetaDataTracker { */ private RODRecordList getTrackDataByName(final String name) { final String luName = canonicalName(name); - RODRecordList l = map.get(luName); + RODRecordList l = bindings.get(luName); return l == null ? 
EMPTY_ROD_RECORD_LIST : l; } @@ -448,7 +435,7 @@ public class RefMetaDataTracker { * @param name the name of the rod * @return canonical name of the rod */ - private final String canonicalName(final String name) { + private String canonicalName(final String name) { // todo -- remove me after switch to RodBinding syntax return name.toLowerCase(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index d75beae23..11a7b4cf7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; @@ -89,7 +90,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum = tracker.getFirstValue(TableFeature.class, "tableTest"); + TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); @@ -115,13 +116,13 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = 
view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum1 = tracker.getFirstValue(TableFeature.class, "tableTest1"); + TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); - TableFeature datum2 = tracker.getFirstValue(TableFeature.class, "tableTest2"); + TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 91c18078e..2f73e373c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -133,7 +133,7 @@ public class RefMetaDataTrackerUnitTest { List x = new ArrayList(); if ( AValues != null ) x.add(AValues); if ( BValues != null ) x.add(BValues); - return new RefMetaDataTracker(x, context); + return new RefMetaDataTracker(x); } public int nBoundTracks() { From 972be8b4a4babce3b198f5c871e2359130696b6e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 16:58:21 -0400 Subject: [PATCH 069/161] Part I of GSA-462: Consistent RODBinding access across Ref and Read trackers -- ReadMetaDataTracker is dead! Long live the RefMetaDataTracker. Read walkers will soon just take RefMetaDataTracker objects. 
In this commit they take a class that trivially extends them -- Rewrote ReadBasedReferenceOrderedView to produce RefMetaDataTrackers not the old class. -- This new implementation produces thread-safe objects (i.e., holds no points to shared state). Suitable for use (to be tested) with nano scheduling -- Simplified interfaces to use the simplest data structures (PeekableIterator) not the LocusAwareSeekableIterator, since I both hate those classes and this is on the long term trajectory to remove those from the GATK entirely. -- Massively expanded DataProvider unit tests for ReadBasedReferenceOrderedView -- Note that the old implementation of offset -> ROD in ReadRefMetaDataTracker was broken for any read not completely matching the reference. Rather than provide broken code the ReadMetaDataTracker only provides a "bag of RODs" interface. If you want to work with the relationship between the read and the RODs in your tool you need to manage the CIGAR element itself. -- This commit breaks the new read walker BQSR, but Ryan knows this is coming -- Subsequent commit will be retiring / fixing ValidateRODForReads --- .../IntervalOverlappingRODsFromStream.java | 143 ++++++ .../ReadBasedReferenceOrderedView.java | 210 ++++----- .../gatk/refdata/ReadMetaDataTracker.java | 140 +----- .../gatk/walkers/indels/IndelRealigner.java | 15 +- .../broadinstitute/sting/utils/GenomeLoc.java | 9 + ...ReadBasedReferenceOrderedViewUnitTest.java | 438 ++++++++++++------ .../refdata/ReadMetaDataTrackerUnitTest.java | 276 ----------- 7 files changed, 537 insertions(+), 694 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java new file mode 100644 index 000000000..1e39d6836 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -0,0 +1,143 @@ +package org.broadinstitute.sting.gatk.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.util.PeekableIterator; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Key algorithmic helper for ReadBasedReferenceOrderedData + * + * Takes a single iterator of features, and provides a single capability that returns + * the list of RODs that overlap an interval. Allows sequential getOverlapping calls + * from intervals provided that these intervals always have increasing getStart() values. + * + */ +class IntervalOverlappingRODsFromStream { + /** + * Only held for QC purposes + */ + GenomeLoc lastQuery = null; + + private final String name; + private final LinkedList currentFeatures = new LinkedList(); + private final PeekableIterator futureFeatures; + + /** + * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and + * returns RODRecordLists having name + * + * @param name + * @param futureFeatures + */ + IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { + if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); + + this.name = name; + this.futureFeatures = futureFeatures; + } + + /** + * Get the list of RODs overlapping loc from this stream of RODs. 
+ * + * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart + * + * @param loc the interval to query + * @return a non-null RODRecordList containing the overlapping RODs, which may be empty + */ + @Ensures({"overlaps(loc, result)", + "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", + "result != null"}) + public RODRecordList getOverlapping(final GenomeLoc loc) { + if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) + throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); + + trimCurrentFeaturesToLoc(loc); + readOverlappingFutureFeatures(loc); + return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); + } + + + /** + * For contract assurance. Checks that all bindings in loc overlap + * + * @param loc + * @param bindings + * @return + */ + @Requires({"loc != null", "bindings != null"}) + private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { + for ( final GATKFeature feature : bindings ) + if ( ! feature.getLocation().overlapsP(loc) ) + return false; + return true; + } + + /** + * Subset the features in all to those that overlap with loc + * + * The current features list contains everything read that cannot be thrown away yet, but not + * everything in there necessarily overlaps with loc. 
Subset to just those that do overlap + * + * @param loc the location that features must overlap + * @param all the list of all features + * @return a subset of all that overlaps with loc + */ + @Requires({"loc != null", "all != null"}) + @Ensures("result.size() <= all.size()") + private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { + final LinkedList overlapping = new LinkedList(); + for ( final GATKFeature feature : all ) + if ( feature.getLocation().overlapsP(loc) ) + overlapping.add(feature); + return overlapping; + } + + /** + * Update function. Remove all elements of currentFeatures that end before loc + * + * @param loc the location to use + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() <= old(currentFeatures.size())") + private void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + final ListIterator it = currentFeatures.listIterator(); + while ( it.hasNext() ) { + final GATKFeature feature = it.next(); + if ( feature.getLocation().isBefore(loc) ) + it.remove(); + } + } + + /** + * Update function: Read all elements from futureFeatures that overlap with loc + * + * Stops at the first element that starts before the end of loc, or the stream empties + * + * @param loc + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() >= old(currentFeatures.size())") + private void readOverlappingFutureFeatures(final GenomeLoc loc) { + while ( futureFeatures.hasNext() ) { + final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); + if ( nextLoc.isBefore(loc) ) { + futureFeatures.next(); // next rod element is before loc, throw it away and keep looking + } else if ( nextLoc.isPast(loc) ) { + break; // next element is past loc, stop looking but don't pop it + } else if ( nextLoc.overlapsP(loc) ) { + // add overlapping elements to our current features, removing from stream + for ( final GATKFeature feature : futureFeatures.next() ) { + currentFeatures.add(feature); + } + } + } + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index 01e24df67..054758101 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -23,40 +23,63 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.ArrayList; import java.util.Collection; import java.util.List; -import java.util.TreeMap; /** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */ public class ReadBasedReferenceOrderedView implements View { - private final WindowedData window; + // a list of the RMDDataState (location->iterators) + private final List states = new ArrayList(1); + private final static ReadMetaDataTracker EMPTY_TRACKER = new ReadMetaDataTracker(); - public ReadBasedReferenceOrderedView(ShardDataProvider provider) { - window = new WindowedData(provider); + /** + * Used to get genome locs for reads + */ + private final GenomeLocParser genomeLocParser; + + /** + * The total extent of all reads in this span. 
We create iterators from our RODs + * from the start of this span, to the end. + */ + private final GenomeLoc shardSpan; + + public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { + this(provider.getGenomeLocParser(), provider.getShard().getLocation()); provider.register(this); + + if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) { + for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) + states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); + } + } + + private ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, final GenomeLoc shardSpan) { + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; } /** - * for testing only please - * - * @param data the window provider + * Testing constructor */ - ReadBasedReferenceOrderedView(WindowedData data) { - window = data; - } - - public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) { - return window.getTracker(read); + protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, + final GenomeLoc shardSpan, + final List names, + final List> featureSources) { + this(genomeLocParser, shardSpan); + for ( int i = 0; i < names.size(); i++ ) + states.add(new RMDDataState(names.get(i), featureSources.get(i))); } public Collection> getConflictingViews() { @@ -65,74 +88,6 @@ public class ReadBasedReferenceOrderedView implements View { return classes; } - public void close() { - if (window != null) window.close(); - } -} - - -/** stores a window of data, dropping RODs if we've passed the new reads start point. 
*/ -class WindowedData { - // the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope - private final TreeMap mapping = new TreeMap(); - - // our current location from the last read we processed - private GenomeLoc currentLoc; - - // a list of the RMDDataState (location->iterators) - private List states; - - // the provider; where we get all our information - private final ShardDataProvider provider; - - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(WindowedData.class); - - /** - * create a WindowedData given a shard provider - * - * @param provider the ShardDataProvider - */ - public WindowedData(ShardDataProvider provider) { - this.provider = provider; - } - - /** - * load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one) - * - * @param provider the ShardDataProvider - * @param rec the current read - */ - private void getStates(ShardDataProvider provider, SAMRecord rec) { - - int stop = Integer.MAX_VALUE; - // figure out the appropriate alignment stop - if (provider.hasReference()) { - stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength(); - } - - // calculate the range of positions we need to look at - GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(), - rec.getAlignmentStart(), - stop); - states = new ArrayList(); - if (provider.getReferenceOrderedData() != null) - for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) - states.add(new RMDDataState(dataSource, dataSource.seek(range))); - } - - /** - * this function is for testing only - * - * @param states a list of RMDDataState to initialize with - */ - WindowedData(List states) { - this.states = states; - provider = null; - } - /** * create a ReadMetaDataTracker given the current read * @@ -140,60 +95,65 @@ class WindowedData { 
* * @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments */ - public ReadMetaDataTracker getTracker(SAMRecord rec) { - updatePosition(rec); - return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping); + @Requires("rec != null") + @Ensures("result != null") + public ReadMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + if ( rec.getReadUnmappedFlag() ) + // empty RODs for unmapped reads + return new ReadMetaDataTracker(); + else + return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } - /** - * update the position we're storing - * - * @param rec the read to use for start and end - */ - private void updatePosition(SAMRecord rec) { - if (states == null) getStates(this.provider, rec); - currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec); - - // flush the queue looking for records we've passed over - while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart()) - mapping.pollFirstEntry(); // toss away records that we've passed - - // add new data to the queue - for (RMDDataState state : states) { - // move into position - while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc)) - state.iterator.next(); - while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) { - RODRecordList list = state.iterator.next(); - for (GATKFeature datum : list) { - if (!mapping.containsKey(list.getLocation().getStart())) - mapping.put(list.getLocation().getStart(), new RODMetaDataContainer()); - mapping.get(list.getLocation().getStart()).addEntry(datum); - } - } + @Requires({"interval != null", "shardSpan.containsP(interval)"}) + @Ensures("result != null") + public ReadMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) + return EMPTY_TRACKER; + else { + final List bindings = new 
ArrayList(states.size()); + for ( final RMDDataState state : states ) + bindings.add(state.stream.getOverlapping(interval)); + return new ReadMetaDataTracker(bindings); } } - /** Closes the current view. */ + /** + * Closes the current view. + */ public void close() { - if (states == null) return; - for (RMDDataState state : states) - state.dataSource.close( state.iterator ); + for (final RMDDataState state : states) + state.close(); // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states = null; + states.clear(); } + /** Models the traversal state of a given ROD lane. */ + private static class RMDDataState { + public final ReferenceOrderedDataSource dataSource; + public final IntervalOverlappingRODsFromStream stream; + private final LocationAwareSeekableRODIterator iterator; -} + public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { + this.dataSource = dataSource; + this.iterator = iterator; + this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator(iterator)); + } -/** Models the traversal state of a given ROD lane. 
*/ -class RMDDataState { - public final ReferenceOrderedDataSource dataSource; - public final LocationAwareSeekableRODIterator iterator; + /** + * For testing + */ + public RMDDataState(final String name, final PeekableIterator iterator) { + this.dataSource = null; + this.iterator = null; + this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator(iterator)); + } - public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { - this.dataSource = dataSource; - this.iterator = iterator; + public void close() { + if ( dataSource != null ) + dataSource.close( iterator ); + } } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java index 96dbd15f2..cfea5901e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.refdata; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,141 +40,12 @@ import java.util.*; *

* a read-based meta data tracker */ -public class ReadMetaDataTracker { - /** - * The parser, used to create new GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - - private final SAMRecord record; - - // the buffer of positions and RODs we've stored - private final TreeMap mapping; - - /** - * create a read meta data tracker, given the read and a queue of RODatum positions - * - * @param record the read to create offset from - * @param mapping the mapping of reference ordered datum - */ - public ReadMetaDataTracker(GenomeLocParser genomeLocParser, SAMRecord record, TreeMap mapping) { - this.genomeLocParser = genomeLocParser; - this.record = record; - this.mapping = mapping; +public class ReadMetaDataTracker extends RefMetaDataTracker { + public ReadMetaDataTracker() { + super(); } - /** - * create an alignment of read position to reference ordered datum - * - * @param record the SAMRecord - * @param queue the queue (as a tree set) - * @param cl the class name, null if not filtered by classname - * @param name the datum track name, null if not filtered by name - * - * @return a mapping from the position in the read to the reference ordered datum - */ - private Map> createReadAlignment(SAMRecord record, TreeMap queue, Class cl, String name) { - if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified"); - Map> ret = new LinkedHashMap>(); - GenomeLoc location = genomeLocParser.createGenomeLoc(record); - int length = record.getReadLength(); - for (Integer loc : queue.keySet()) { - Integer position = loc - location.getStart(); - if (position >= 0 && position < length) { - Collection set; - if (cl != null) - set = queue.get(loc).getSet(cl); - else - set = queue.get(loc).getSet(name); - if (set != null && set.size() > 0) - ret.put(position, set); - } - } - return ret; - - } - - /** - * create an alignment of read position to reference ordered datum - * - * @return a mapping from the position in the read 
to the reference ordered datum - */ - private Map> createGenomeLocAlignment(SAMRecord record, TreeMap mapping, Class cl, String name) { - Map> ret = new LinkedHashMap>(); - int start = record.getAlignmentStart(); - int stop = record.getAlignmentEnd(); - for (Integer location : mapping.keySet()) { - if (location >= start && location <= stop) - if (cl != null) - ret.put(location, mapping.get(location).getSet(cl)); - else - ret.put(location, mapping.get(location).getSet(name)); - } - return ret; - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping() { - return createReadAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping() { - return createGenomeLocAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(String name) { - return createReadAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(String name) { - return createGenomeLocAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(Class cl) { - return createReadAlignment(record, mapping, cl, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(Class cl) { - return createGenomeLocAlignment(record, mapping, cl, null); - } - - /** - * get the list of all the RODS overlapping this read, without any information about 
their position - * @return a Collection (no order guaranteed), of all the RODs covering this read - */ - public List getAllCoveringRods() { - List ret = new ArrayList(); - for (Map.Entry entry : mapping.entrySet()) - ret.addAll(entry.getValue().getSet()); - return ret; + public ReadMetaDataTracker(Collection allBindings) { + super(allBindings); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d61b9e9b6..e6eddc0b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -620,16 +620,11 @@ public class IndelRealigner extends ReadWalker { } private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { - for ( Collection rods : metaDataTracker.getContigOffsetMapping().values() ) { - Iterator rodIter = rods.iterator(); - while ( rodIter.hasNext() ) { - Object rod = rodIter.next().getUnderlyingObject(); - if ( indelRodsSeen.contains(rod) ) - continue; - indelRodsSeen.add(rod); - if ( rod instanceof VariantContext ) - knownIndelsToTry.add((VariantContext)rod); - } + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( indelRodsSeen.contains(vc) ) + continue; + indelRodsSeen.add(vc); + knownIndelsToTry.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index f8faa101b..0b35dd599 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -125,6 +125,15 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ! 
discontinuousP( that ); } + /** + * Return true if this GenomeLoc represents the UNMAPPED location + * @return + */ + public final boolean isUnmapped() { + return isUnmapped(this); + } + + /** * Returns a new GenomeLoc that represents the entire span of this and that. Requires that * this and that GenomeLoc are contiguous and both mapped diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 41bdda0e0..ff8952dfa 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -1,207 +1,347 @@ /* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2010. The Broad Institute +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.gatk.datasources.providers; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.testng.Assert; +import org.broad.tribble.BasicFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTrackerUnitTest; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - /** - * @author aaron - *

- * Class ReadBasedReferenceOrderedViewUnitTest - *

- * test out the ReadBasedReferenceOrderedView class + * @author depristo */ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { - private GenomeLocParser genomeLocParser; - private static int startingChr = 1; private static int endingChr = 2; private static int readCount = 100; private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private static String contig; private static SAMFileHeader header; + private GenomeLocParser genomeLocParser; + @BeforeClass public void beforeClass() { header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + contig = header.getSequence(0).getSequenceName(); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + initializeTests(); } - @BeforeMethod - public void beforeEach() { - } - - @Test - public void testCreateReadMetaDataTrackerOnePerSite() { - // make ten reads, - List records = new ArrayList(); - for (int x = 1; x < 11; x++) { - SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "name", 0, x, 10); + private class CompareFeatures implements Comparator { + @Override + public int compare(Feature o1, Feature o2) { + return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); } - GenomeLoc start = genomeLocParser.createGenomeLoc(header.getSequenceDictionary().getSequence(0).getSequenceName(), 0, 0); - List list = new ArrayList(); - list.add(new RMDDataState(null, new FakePeekingRODIterator(genomeLocParser,start, "fakeName"))); - ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(new WindowedData(list)); + } - for (SAMRecord rec : records) { - ReadMetaDataTracker tracker = view.getReferenceOrderedDataForRead(rec); - Map> map = tracker.getReadOffsetMapping(); - for (Integer i : map.keySet()) { - Assert.assertEquals(map.get(i).size(), 1); + private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { + final List 
allFeatures; + final List intervals; + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { + this(allFeatures, Collections.singletonList(interval)); + } + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { + super(ReadMetaDataTrackerRODStreamTest.class); + this.allFeatures = new ArrayList(allFeatures); + Collections.sort(this.allFeatures, new CompareFeatures()); + this.intervals = new ArrayList(intervals); + Collections.sort(this.intervals); + setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), + intervals.size() == 1 ? intervals.get(0) : "size " + intervals.size())); + } + + public PeekableIterator getIterator(final String name) { + return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); + } + + public Set getExpectedOverlaps(final GenomeLoc interval) { + final Set overlapping = new HashSet(); + for ( final Feature f : allFeatures ) + if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) + overlapping.add(f); + return overlapping; + } + } + + public void initializeTests() { + final List handPickedFeatures = new ArrayList(); + + handPickedFeatures.add(new BasicFeature(contig, 1, 1)); + handPickedFeatures.add(new BasicFeature(contig, 2, 5)); + handPickedFeatures.add(new BasicFeature(contig, 4, 4)); + handPickedFeatures.add(new BasicFeature(contig, 6, 6)); + handPickedFeatures.add(new BasicFeature(contig, 9, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 11)); + handPickedFeatures.add(new BasicFeature(contig, 13, 20)); + + createTestsForFeatures(handPickedFeatures); + + // test in the present of a large spanning element + { + List oneLargeSpan = new ArrayList(handPickedFeatures); + oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + createTestsForFeatures(oneLargeSpan); + } + + // test in the presence of a 
partially spanning element + { + List partialSpanStart = new ArrayList(handPickedFeatures); + partialSpanStart.add(new BasicFeature(contig, 1, 6)); + createTestsForFeatures(partialSpanStart); + } + + // test in the presence of a partially spanning element at the end + { + List partialSpanEnd = new ArrayList(handPickedFeatures); + partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + createTestsForFeatures(partialSpanEnd); + } + + // no data at all + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); + new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); + } + + // -------------------------------------------------------------------------------- + // + // tests for the lower level IntervalOverlappingRODsFromStream + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") + public Object[][] createReadMetaDataTrackerRODStreamTest() { + return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + } + + private GenomeLoc span(final List features) { + int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); + return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); + } + + private void createTestsForFeatures(final List features) { + int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); + + for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + final List allIntervals = new ArrayList(); + // regularly spaced + for ( int start = featuresStart; start < featuresStop; start++) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + 
size - 1); + allIntervals.add(loc); + new ReadMetaDataTrackerRODStreamTest(features, loc); } - Assert.assertEquals(map.keySet().size(), 10); + + // starting and stopping at every feature + for ( final Feature f : features ) { + // just at the feature + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // up to end + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // missing by 1 + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // just spanning + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + } + + new ReadMetaDataTrackerRODStreamTest(features, allIntervals); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") + public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() == 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") + public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() > 1 ) { + final String name = "testName"; + final PeekableIterator 
iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, data.intervals); + } + } + + private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { + for ( final GenomeLoc interval : intervals ) { + final RODRecordList query = stream.getOverlapping(interval); + final HashSet queryFeatures = new HashSet(); + for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." 
+ + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + // -------------------------------------------------------------------------------- + // + // tests for the higher level tracker itself + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerTests") + public Object[][] createTrackerTests() { + List tests = new ArrayList(); + + final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + final List multiSiteTests = new ArrayList(); + for ( final Object[] singleTest : singleTests ) { + if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) + multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); } + // all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest}); + } + + // all 3 way pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + tests.add(new Object[]{singleTest}); + } + + return tests.toArray(new Object[][]{}); } -} + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") + public void runReadMetaDataTrackerTest(final List RODs) { + final List names = new ArrayList(); + final List> iterators = new ArrayList>(); + final List intervals = new ArrayList(); + final List> rodBindings = new ArrayList>(); + for ( int i = 0; i < RODs.size(); i++ ) { + final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); + rodBindings.add(rodBinding); + final String name = rodBinding.getName(); + names.add(name); + iterators.add(RODs.get(i).getIterator(name)); + intervals.addAll(RODs.get(i).intervals); + } -class FakePeekingRODIterator implements LocationAwareSeekableRODIterator { - private GenomeLocParser genomeLocParser; + 
Collections.sort(intervals); + final GenomeLoc span = span(intervals); + final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); - // current location - private GenomeLoc location; - private GATKFeature curROD; - private final String name; + for ( final GenomeLoc interval : intervals ) { + final ReadMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); - public FakePeekingRODIterator(GenomeLocParser genomeLocParser, GenomeLoc startingLoc, String name) { - this.name = name; - this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } } /** - * Gets the header associated with the backing input stream. - * @return the ROD header. + * Created with IntelliJ IDEA. + * User: depristo + * Date: 8/29/12 + * Time: 1:19 PM + * To change this template use File | Settings | File Templates. 
*/ - @Override - public Object getHeader() { - return null; - } + static class TribbleIteratorFromCollection implements Iterator { + // current location + private final String name; + final Queue gatkFeatures; - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return null; - } + public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { + this.name = name; + this.gatkFeatures = new LinkedList(); + for ( final Feature f : features ) + gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + } - @Override - public GenomeLoc peekNextLocation() { - System.err.println("Peek Next -> " + location); - return location; - } + @Override + public boolean hasNext() { + return ! gatkFeatures.isEmpty(); + } - @Override - public GenomeLoc position() { - return location; - } + @Override + public RODRecordList next() { + final GATKFeature first = gatkFeatures.poll(); + final Collection myFeatures = new LinkedList(); + myFeatures.add(first); + while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) + myFeatures.add(gatkFeatures.poll()); - @Override - public RODRecordList seekForward(GenomeLoc interval) { - while (location.isBefore(interval)) - next(); - return next(); // we always move by one, we know the next location will be right - } + GenomeLoc loc = first.getLocation(); + for ( final GATKFeature feature : myFeatures ) + loc = loc.merge(feature.getLocation()); - @Override - public boolean hasNext() { - return true; // we always have next - } + return new RODRecordListImpl(name, myFeatures, loc); // is this safe? 
+ } - @Override - public RODRecordList next() { - System.err.println("Next -> " + location); - curROD = new ReadMetaDataTrackerUnitTest.FakeRODatum(location, name); - location = genomeLocParser.createGenomeLoc(location.getContig(), location.getStart() + 1, location.getStop() + 1); - FakeRODRecordList list = new FakeRODRecordList(); - list.add(curROD); - return list; - } - - @Override - public void remove() { - throw new IllegalStateException("GRRR"); - } - - @Override - public void close() { - // nothing to do + @Override public void remove() { throw new IllegalStateException("GRRR"); } } } -class FakeRODRecordList extends AbstractList implements RODRecordList { - private final List list = new ArrayList(); - public boolean add(GATKFeature data) { - return list.add(data); - } - - @Override - public GATKFeature get(int i) { - return list.get(i); - } - - @Override - public int size() { - return list.size(); - } - - @Override - public GenomeLoc getLocation() { - return list.get(0).getLocation(); - } - - @Override - public String getName() { - return "test"; - } - - @Override - public int compareTo(RODRecordList rodRecordList) { - return this.list.get(0).getLocation().compareTo(rodRecordList.getLocation()); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java deleted file mode 100644 index 2198c461d..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2010. 
The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.*; - - -/** - * @author aaron - *

- * Class ReadMetaDataTrackerUnitTest - *

- * test out the ReadMetaDataTracker - */ -public class ReadMetaDataTrackerUnitTest extends BaseTest { - private static int startingChr = 1; - private static int endingChr = 2; - private static int readCount = 100; - private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; - private static SAMFileHeader header; - private Set nameSet; - - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - @BeforeMethod - public void beforeEach() { - nameSet = new TreeSet(); - nameSet.add("default"); - } - - @Test - public void twoRodsAtEachReadBase() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - @Test - public void rodAtEachReadBase() { - - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByName() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping("default"); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByDupType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type - 
// count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - // @Test this test can be uncommented to determine the speed impacts of any changes to the RODs for reads system - - public void filterByMassiveDupType() { - - for (int y = 0; y < 20; y++) { - nameSet.add("default" + String.valueOf(y)); - long firstTime = System.currentTimeMillis(); - for (int lp = 0; lp < 1000; lp++) { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), y + 2); - } - Assert.assertEquals(count, 10); - } - System.err.println(y + " = " + (System.currentTimeMillis() - firstTime)); - } - } - - - @Test - public void filterByType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(Fake2RODatum.class); - for (int x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void sparceRODsForRead() { - ReadMetaDataTracker tracker = getRMDT(7, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 2); - } - - @Test - public void rodByGenomeLoc() { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getContigOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getContigOffsetMapping().get(x).size(), 1); - } - 
Assert.assertEquals(count, 10); - } - - - /** - * create a ReadMetaDataTracker given: - * - * @param incr the spacing between site locations - * @param names the names of the reference ordered data to create: one will be created at every location for each name - * - * @return a ReadMetaDataTracker - */ - private ReadMetaDataTracker getRMDT(int incr, Set names, boolean alternateTypes) { - SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "name", 0, 1, 10); - TreeMap data = new TreeMap(); - for (int x = 0; x < record.getAlignmentEnd(); x += incr) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getReferenceName(), record.getAlignmentStart() + x, record.getAlignmentStart() + x); - RODMetaDataContainer set = new RODMetaDataContainer(); - - int cnt = 0; - for (String name : names) { - if (alternateTypes) - set.addEntry((cnt % 2 == 0) ? new FakeRODatum(loc, name) : new Fake2RODatum(loc, name)); - else - set.addEntry(new FakeRODatum(loc, name)); - cnt++; - } - data.put(record.getAlignmentStart() + x, set); - } - ReadMetaDataTracker tracker = new ReadMetaDataTracker(genomeLocParser, record, data); - return tracker; - } - - - /** for testing, we want a fake rod with a different classname, for the get-by-class-name functions */ - static public class Fake2RODatum extends FakeRODatum { - - public Fake2RODatum(GenomeLoc location, String name) { - super(location, name); - } - } - - - /** for testing only */ - static public class FakeRODatum extends GATKFeature { - - final GenomeLoc location; - final String name; - - public FakeRODatum(GenomeLoc location, String name) { - super(name); - this.location = location; - this.name = name; - } - - @Override - public String getName() { - return name; - } - - @Override - public GenomeLoc getLocation() { - return this.location; - } - - @Override - public Object getUnderlyingObject() { - return null; //To change body of implemented methods use File | Settings | File Templates. 
- } - - @Override - public String getChr() { - return location.getContig(); - } - - @Override - public int getStart() { - return (int)this.location.getStart(); - } - - @Override - public int getEnd() { - return (int)this.location.getStop(); - } - } -} From 1200848bbfb7069f898e1933ed687a0e18f56e0a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 17:39:32 -0400 Subject: [PATCH 070/161] Part II of GSA-462: Consistent RODBinding access across Ref and Read trackers -- Deleted ReadMetaDataTracker -- Added function to ReadShard to give us the span from the left most position of the reads in the shard to the right most, which is needed for the new view --- .../compression/reducereads/ReduceReads.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../sting/alignment/AlignmentValidation.java | 4 +- .../ReadBasedReferenceOrderedView.java | 21 ++++---- .../gatk/datasources/reads/ReadShard.java | 31 +++++++++-- .../gatk/refdata/ReadMetaDataTracker.java | 51 ------------------- .../sting/gatk/traversals/TraverseReads.java | 4 +- .../gatk/traversals/TraverseReadsNano.java | 5 +- .../sting/gatk/walkers/ClipReads.java | 4 +- .../sting/gatk/walkers/FlagStat.java | 4 +- .../sting/gatk/walkers/PrintReads.java | 4 +- .../sting/gatk/walkers/ReadWalker.java | 5 +- .../sting/gatk/walkers/SplitSamFile.java | 4 +- .../diagnostics/ReadGroupProperties.java | 4 +- .../diagnostics/ReadLengthDistribution.java | 4 +- .../gatk/walkers/indels/IndelRealigner.java | 9 ++-- .../gatk/walkers/indels/LeftAlignIndels.java | 5 +- .../walkers/indels/SomaticIndelDetector.java | 4 +- .../sting/gatk/walkers/qc/CountBases.java | 4 +- .../sting/gatk/walkers/qc/CountMales.java | 4 +- .../gatk/walkers/qc/CountReadEvents.java | 4 +- .../sting/gatk/walkers/qc/CountReads.java | 4 +- .../gatk/walkers/qc/CountTerminusEvent.java | 4 +- .../gatk/walkers/qc/ReadClippingStats.java | 4 +- ...ReadBasedReferenceOrderedViewUnitTest.java | 4 +- .../reads/GATKWalkerBenchmark.java | 4 +- 26 files 
changed, 85 insertions(+), 116 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 177050667..d1ec9c474 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.ReadFilters; @@ -247,7 +247,7 @@ public class ReduceReads extends ReadWalker, ReduceRea * @return a linked list with all the reads produced by the clipping operations */ @Override - public LinkedList map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { LinkedList mappedReads; totalReads++; if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 845fc68a6..3d41b7233 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -308,7 +308,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { if( !allelesToGenotype.contains(vc) ) { - allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object + allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java index e8eea5ff0..b903b9f7d 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker { * @return Number of reads aligned by this map (aka 1). 
*/ @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { //logger.info(String.format("examining read %s", read.getReadName())); byte[] bases = read.getReadBases(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index 054758101..adf1b34df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -27,8 +27,9 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; @@ -38,11 +39,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */ +/** a ROD view for reads. 
This provides the Read traversals a way of getting a RefMetaDataTracker */ public class ReadBasedReferenceOrderedView implements View { // a list of the RMDDataState (location->iterators) private final List states = new ArrayList(1); - private final static ReadMetaDataTracker EMPTY_TRACKER = new ReadMetaDataTracker(); + private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); /** * Used to get genome locs for reads @@ -56,7 +57,7 @@ public class ReadBasedReferenceOrderedView implements View { private final GenomeLoc shardSpan; public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - this(provider.getGenomeLocParser(), provider.getShard().getLocation()); + this(provider.getGenomeLocParser(), ((ReadShard)provider.getShard()).getReadsSpan()); provider.register(this); if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) { @@ -89,32 +90,32 @@ public class ReadBasedReferenceOrderedView implements View { } /** - * create a ReadMetaDataTracker given the current read + * create a RefMetaDataTracker given the current read * * @param rec the read * - * @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments + * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments */ @Requires("rec != null") @Ensures("result != null") - public ReadMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { if ( rec.getReadUnmappedFlag() ) // empty RODs for unmapped reads - return new ReadMetaDataTracker(); + return new RefMetaDataTracker(); else return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } @Requires({"interval != null", "shardSpan.containsP(interval)"}) @Ensures("result != null") - public ReadMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + public RefMetaDataTracker 
getReferenceOrderedDataForInterval(final GenomeLoc interval) { if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) return EMPTY_TRACKER; else { final List bindings = new ArrayList(states.size()); for ( final RMDDataState state : states ) bindings.add(state.stream.getOverlapping(interval)); - return new ReadMetaDataTracker(bindings); + return new RefMetaDataTracker(bindings); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index f5a4cb4cf..9e1c12186 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @@ -125,4 +122,30 @@ public class ReadShard extends Shard { } return sb.toString(); } + + /** + * Get the full span from the start of the left most read to the end of the right most one + * + * Note this may be different than the getLocation() of the shard, as this reflects the + * targeted span, not the actual span of reads + * + * @return the genome loc representing the span of these reads on the genome + */ + public GenomeLoc getReadsSpan() { + if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) + return super.getLocation(); + else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + + for ( final SAMRecord read : reads ) { + contig = read.getReferenceName(); + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } + + 
return parser.createGenomeLoc(contig, start, stop); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java deleted file mode 100644 index cfea5901e..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.refdata; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.*; - - -/** - * @author aaron - *

- * Class ReadMetaDataTracker - *

- * a read-based meta data tracker - */ -public class ReadMetaDataTracker extends RefMetaDataTracker { - public ReadMetaDataTracker() { - super(); - } - - public ReadMetaDataTracker(Collection allBindings) { - super(allBindings); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index 2dc0444b2..3b712c973 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrd import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -91,7 +91,7 @@ public class TraverseReads extends TraversalEngine,Read dataProvider.getShard().getReadMetrics().incrementNumIterations(); // if the read is mapped, create a metadata tracker - final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 4215230b8..081c6b8fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -29,9 +29,8 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; @@ -142,7 +141,7 @@ public class TraverseReadsNano extends TraversalEngine, //dataProvider.getShard().getReadMetrics().incrementNumIterations(); // if the read is mapped, create a metadata tracker - final ReadMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java index 4eaa16692..e63dbcabd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReads.java @@ -36,7 +36,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ClippingOp; @@ -299,7 +299,7 @@ public class ClipReads extends ReadWalker impleme * @param read the read itself, as a GATKSAMRecord * @return the read itself */ - public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker ) { return simplifyReads ? 
read.simplify() : read; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 77e3af93f..42fbb32bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.walkers; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -27,5 +26,5 @@ public abstract class ReadWalker extends Walker { } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, RefMetaDataTracker RefMetaDataTracker) { final String rgID = read.getReadGroup().getId(); final PerReadGroupInfo info = readGroupInfo.get(rgID); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index 1dc8a7ec1..2b84cccc9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -4,7 +4,7 @@ import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; 
import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -74,7 +74,7 @@ public class ReadLengthDistribution extends ReadWalker { } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { GATKReportTable table = report.getTable("ReadLengthDistribution"); int length = Math.abs(samRecord.getReadLength()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index e6eddc0b7..d9b71f938 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -36,8 +36,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.*; @@ -473,7 +472,7 @@ public class IndelRealigner extends ReadWalker { readsActuallyCleaned.clear(); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); return 0; @@ -540,7 +539,7 @@ public class IndelRealigner extends ReadWalker { // TODO -- it 
would be nice if we could use indels from 454/Ion reads as alternate consenses } - private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { if ( readsToClean.size() > 0 ) { GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); if ( manager.canMoveReads(earliestPossibleMove) ) @@ -619,7 +618,7 @@ public class IndelRealigner extends ReadWalker { } } - private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { + private void populateKnownIndels(RefMetaDataTracker metaDataTracker, ReferenceContext ref) { for ( final VariantContext vc : metaDataTracker.getValues(known) ) { if ( indelRodsSeen.contains(vc) ) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index 6b9bd04d2..21b3b71d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import 
org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -80,7 +79,7 @@ public class LeftAlignIndels extends ReadWalker { writer.addAlignment(read); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // we can not deal with screwy records if ( read.getReadUnmappedFlag() || read.getCigar().numCigarElements() == 0 ) { emit(read); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index 3965a63fb..7c73f59e9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; @@ -477,7 +477,7 @@ public class SomaticIndelDetector extends ReadWalker { @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java index 0c323934e..9954a25e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountBases extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return read.getReadLength(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java index bc178119d..f2e4cf1ad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Gender; import 
org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.DataSource; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMales extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); return sample.getGender() == Gender.MALE ? 1 : 0; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java index 80845c447..80afd19fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -47,7 +47,7 @@ public class CountReadEvents extends ReadWalker> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Map> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return ReadUtils.getCigarOperatorForAllBases(read); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index d33db2925..72bda03e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -42,7 +42,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReads extends ReadWalker implements TreeReducible { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java index 971b5bb85..09d239126 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; 
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -41,7 +41,7 @@ import java.util.List; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountTerminusEvent extends ReadWalker, Pair> { - public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Pair map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { List cigarElements = read.getCigar().getCigarElements(); CigarElement lastElement = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java index 16d614afc..ec4f081a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -75,7 +75,7 @@ public class ReadClippingStats extends ReadWalker { private long Gs; private long Ts; - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { for(byte base: read.getReadBases()) { switch(base) { case 'A': As++; 
break; From 53376b94236066e21c575c54d349a1f965e6eba9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 17:44:35 -0400 Subject: [PATCH 071/161] Part III of GSA-462: Consistent RODBinding access across Ref and Read trackers -- shardSpan is only calculated when there some ROD is live in the GATK. No sense in paying the cost per read when you don't need it -- Update contract to allow null span or unmapped span (good catch unittests!) --- .../providers/ReadBasedReferenceOrderedView.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java index adf1b34df..40fe03f4a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java @@ -57,7 +57,9 @@ public class ReadBasedReferenceOrderedView implements View { private final GenomeLoc shardSpan; public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - this(provider.getGenomeLocParser(), ((ReadShard)provider.getShard()).getReadsSpan()); + this.genomeLocParser = provider.getGenomeLocParser(); + // conditional to optimize the case where we don't have any ROD data + this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null; provider.register(this); if ( provider.getReferenceOrderedData() != null && ! 
shardSpan.isUnmapped() ) { @@ -66,10 +68,6 @@ public class ReadBasedReferenceOrderedView implements View { } } - private ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser, final GenomeLoc shardSpan) { - this.genomeLocParser = genomeLocParser; - this.shardSpan = shardSpan; - } /** * Testing constructor @@ -78,7 +76,8 @@ public class ReadBasedReferenceOrderedView implements View { final GenomeLoc shardSpan, final List names, final List> featureSources) { - this(genomeLocParser, shardSpan); + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; for ( int i = 0; i < names.size(); i++ ) states.add(new RMDDataState(names.get(i), featureSources.get(i))); } @@ -106,10 +105,10 @@ public class ReadBasedReferenceOrderedView implements View { return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec)); } - @Requires({"interval != null", "shardSpan.containsP(interval)"}) + @Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"}) @Ensures("result != null") public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { - if ( states.isEmpty() ) // optimization for no bindings (common for read walkers) + if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) return EMPTY_TRACKER; else { final List bindings = new ArrayList(states.size()); From ce3d1f89ea6c79f3765e6174476f6031474dc60a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 29 Aug 2012 18:43:02 -0400 Subject: [PATCH 073/161] ReadShard are no longer allowed to span multiple contigs -- Previous behavior was unnecessary and causes all sorts of problems with RODs for reads. The old implementation simply failed in this case. The new code handles this correctly by forcing shards to have all of their data on a single contig. 
-- Added a PrintReads integration test to ensure this behavior is correct -- Adding test BAMs that have < 200 reads and span across contig boundaries --- .../sting/gatk/datasources/reads/ReadShard.java | 4 ++++ .../sting/gatk/datasources/reads/SAMDataSource.java | 12 +++++++++--- .../gatk/walkers/PrintReadsIntegrationTest.java | 3 ++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 9e1c12186..fd1ee9859 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -140,6 +141,9 @@ public class ReadShard extends Shard { String contig = null; for ( final SAMRecord read : reads ) { + if ( contig != null && ! read.getReferenceName().equals(contig) ) + throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. 
" + + "First contig is " + contig + " next read was " + read.getReferenceName() ); contig = read.getReferenceName(); if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7f0a0c4c0..c8b654f81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -486,9 +486,15 @@ public class SAMDataSource { CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); while(!shard.isBufferFull() && iterator.hasNext()) { - read = iterator.next(); - shard.addRead(read); - noteFilePositionUpdate(positionUpdates,read); + final SAMRecord nextRead = iterator.next(); + if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { + // only add reads to the shard if they are on the same contig + read = nextRead; + shard.addRead(read); + noteFilePositionUpdate(positionUpdates,read); + } else { + break; + } } // If the reads are sorted in queryname order, ensure that all reads diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java index 057cf1cf9..717d9d953 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java @@ -38,7 +38,8 @@ public class PrintReadsIntegrationTest extends WalkerTest { {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e920b8505e7e95d67634b0905237dbc")}, {new PRTest(b37KGReference, 
"unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "13bb9a91b1d4dd2425f73302b8a1ac1c")}, {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e920b8505e7e95d67634b0905237dbc")}, - {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")} + {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")}, + {new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "c43380ac39b98853af457b90e52f8427")} }; } From 21dd70ed365ada928a5389db75b07966aa35202e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:10:00 -0400 Subject: [PATCH 074/161] Test to ensure that ReadBasedReferenceOrderedView produces stateless objects -- Stateless objects are required for nano-scheduling. This means you can take the RefMetaDataTracker provided by ReadBasedReferenceOrderedView, store it way, get another from the same view, and the original one behaves the same. --- ...ReadBasedReferenceOrderedViewUnitTest.java | 72 +++++++++++-------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 6aa860a2e..d55c48054 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -249,21 +249,23 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); } - // all pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { - tests.add(new Object[]{singleTest}); - } + for ( final boolean testStateless : Arrays.asList(true, false) ) { + // 
all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } - // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest}); + // all 3 way pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } } return tests.toArray(new Object[][]{}); } @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") - public void runReadMetaDataTrackerTest(final List RODs) { + public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { final List names = new ArrayList(); final List> iterators = new ArrayList>(); final List intervals = new ArrayList(); @@ -282,31 +284,45 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { final GenomeLoc span = span(intervals); final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); - for ( final GenomeLoc interval : intervals ) { - final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + if ( testStateless ) { + // test each tracker is well formed, as each is created + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); + } + } else { + // tests all trackers are correct after reading them into an array + // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) + final List trackers = new ArrayList(); + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + trackers.add(tracker); + } - 
for ( int i = 0; i < RODs.size(); i++ ) { - final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); - final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); - final Set queryFeatures = new HashSet(queryFeaturesList); - final Set overlaps = test.getExpectedOverlaps(interval); - - Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); - - BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + for ( int i = 0; i < trackers.size(); i++) { + testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); } } } - /** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 8/29/12 - * Time: 1:19 PM - * To change this template use File | Settings | File Templates. - */ + private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, + final GenomeLoc interval, + final List RODs, + final List> rodBindings) { + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." 
+ + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + static class TribbleIteratorFromCollection implements Iterator { // current location private final String name; From 792092b8917128868aedfbc4d5c86327dedb0371 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:39:16 -0400 Subject: [PATCH 076/161] ReadShards now default to 10K (up from 1K) reads per samFile up to 250K -- This should help make the inputs for parallel read walkers a little meater, and avoid spinning the shard creation infrastructure so often --- .../sting/gatk/datasources/reads/SAMDataSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index c8b654f81..2b88775b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -262,7 +262,7 @@ public class SAMDataSource { else { // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. - ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000)); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); From 7b366d404900dd456ed271b96bfa03e0ef7b949d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Aug 2012 11:01:01 -0400 Subject: [PATCH 077/161] misc cleanup in active region traversal. 
--- .../sting/gatk/traversals/TraverseActiveRegions.java | 12 ++++++------ .../sting/gatk/walkers/ActiveRegionWalker.java | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index af981e676..ecaa15fe9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -185,7 +185,7 @@ public class TraverseActiveRegions extends TraversalEngine walker ) { // Just want to output the active regions to a file, not actually process them - for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion : workQueue ) { + for( final ActiveRegion activeRegion : workQueue ) { if( activeRegion.isActive ) { walker.activeRegionOutStream.println( activeRegion.getLocation() ); } @@ -198,7 +198,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); for( final GATKSAMRecord read : reads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - org.broadinstitute.sting.utils.activeregion.ActiveRegion bestRegion = activeRegion; - for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) { + ActiveRegion 
bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); bestRegion = otherRegionToTest; @@ -229,7 +229,7 @@ public class TraverseActiveRegions extends TraversalEngine extends Walker Date: Thu, 30 Aug 2012 15:07:02 -0400 Subject: [PATCH 078/161] Bugfix to compareTo and equals in GenomeLoc -- Yes, GenomeLoc.compareTo was broken. The compareTo function only considered the contig and start position, but not the stop, when comparing genome locs. -- Updated GenomeLoc.compareTo function to account for stop. Updated GATK code where necessary to fix resulting problems that depended on this. -- Added unit tests to ensure that hashcode, equals, and compareTo are all correct for GenomeLocs --- .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../broadinstitute/sting/utils/GenomeLoc.java | 5 +- .../sting/utils/GenomeLocUnitTest.java | 56 +++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index f33dd414b..2763bca7c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -48,9 +48,7 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last ); - GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur ); - return curLoc.compareTo(lastLoc) == -1; + return 
last.getAlignmentStart() > cur.getAlignmentStart(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 0b35dd599..6df9c9f1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -427,7 +427,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome result = cmpContig; } else { if ( this.getStart() < that.getStart() ) result = -1; - if ( this.getStart() > that.getStart() ) result = 1; + else if ( this.getStart() > that.getStart() ) result = 1; + // these have the same start, so check the ends + else if ( this.getStop() < that.getStop() ) result = -1; + else if ( this.getStop() > that.getStop() ) result = 1; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 49778a4d8..122e0265f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -211,4 +212,59 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); } } + + // ------------------------------------------------------------------------------------- + // + // testing comparison, hashcode, and equals + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "GenomeLocComparisons") + public Object[][] createGenomeLocComparisons() { + List tests = new ArrayList(); + + final int start = 10; + for ( 
int stop = start; stop < start + 3; stop++ ) { + final GenomeLoc g1 = genomeLocParser.createGenomeLoc("chr2", start, stop); + for ( final String contig : Arrays.asList("chr1", "chr2", "chr3")) { + for ( int start2 = start - 1; start2 <= stop + 1; start2++ ) { + for ( int stop2 = start2; stop2 < stop + 2; stop2++ ) { + final GenomeLoc g2 = genomeLocParser.createGenomeLoc(contig, start2, stop2); + + ComparisonResult cmp = ComparisonResult.EQUALS; + if ( contig.equals("chr3") ) cmp = ComparisonResult.LESS_THAN; + else if ( contig.equals("chr1") ) cmp = ComparisonResult.GREATER_THAN; + else if ( start < start2 ) cmp = ComparisonResult.LESS_THAN; + else if ( start > start2 ) cmp = ComparisonResult.GREATER_THAN; + else if ( stop < stop2 ) cmp = ComparisonResult.LESS_THAN; + else if ( stop > stop2 ) cmp = ComparisonResult.GREATER_THAN; + + tests.add(new Object[]{g1, g2, cmp}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private enum ComparisonResult { + LESS_THAN(-1), + EQUALS(0), + GREATER_THAN(1); + + final int cmp; + + private ComparisonResult(int cmp) { + this.cmp = cmp; + } + } + + @Test(dataProvider = "GenomeLocComparisons") + public void testGenomeLocComparisons(GenomeLoc g1, GenomeLoc g2, ComparisonResult expected) { + Assert.assertEquals(g1.compareTo(g2), expected.cmp, "Comparing genome locs failed"); + Assert.assertEquals(g1.equals(g2), expected == ComparisonResult.EQUALS); + if ( expected == ComparisonResult.EQUALS ) + Assert.assertEquals(g1.hashCode(), g2.hashCode(), "Equal genome locs don't have the same hash code"); + } } From 72cf6bdd9f7d675797d0a76902907e3af05cea56 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:10:58 -0400 Subject: [PATCH 079/161] Fix GSA-529: Fix RODs for parallel read walkers -- TraverseReadsNano modified to read in all input data before invoking maps, so the input to TraverseReadsNano is a MapData object holding the sam record, the ref context, and the refmetadatatracker. 
-- Update ValidateRODForReads to be tree reducible, using synchronized map and explicitly sort the output map from locations -> counts in onTraversalDone -- Expanded integration tests to test nt 1, 2, 4. --- .../gatk/traversals/TraverseReadsNano.java | 91 +++++++++++-------- .../utils/nanoScheduler/NanoScheduler.java | 5 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 081c6b8fc..b397cb8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -27,16 +27,21 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; +import java.util.List; + /** * @author aaron * @version 1.0 @@ -50,12 +55,13 @@ public class 
TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; + private static final int MIN_GROUP_SIZE = 100; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = bufferSize / 10 + 1; - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); + nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); } @Override @@ -79,24 +85,42 @@ public class TraverseReadsNano extends TraversalEngine, if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - if ( dataProvider.hasReferenceOrderedData() ) - throw new ReviewedStingException("Parallel read walkers currently don't support access to reference ordered data"); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); + T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); // TODO -- how do we print progress? 
//printProgress(dataProvider.getShard(), ???); return result; } + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new ArrayList(); // TODO -- need size of reads + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + @Override public void printOnTraversalDone() { nanoScheduler.shutdown(); @@ -116,36 +140,31 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { - final ReadView reads; - final ReadReferenceView reference; - final ReadBasedReferenceOrderedView rodView; + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; - private TraverseReadsMap(ReadView reads, ReadReferenceView reference, ReadBasedReferenceOrderedView rodView, ReadWalker walker) { - this.reads = reads; - this.reference = reference; - this.rodView = rodView; + private TraverseReadsMap(ReadWalker walker) { this.walker = walker; } 
@Override - public M apply(final SAMRecord read) { + public M apply(final MapData data) { if ( ! walker.isDone() ) { - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && reference != null - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - //dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); + final boolean keepMeP = walker.filter(data.refContext, data.read); if (keepMeP) { - return walker.map(refContext, (GATKSAMRecord) read, tracker); + return walker.map(data.refContext, data.read, data.tracker); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4bca3728f..25ed0766d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -43,7 +43,8 @@ import java.util.concurrent.*; * Time: 9:47 AM */ public class NanoScheduler { - private static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; final int mapGroupSize; @@ -172,7 +173,7 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - if ( getnThreads() == 1 ) { + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { return 
executeSingleThreaded(inputReader, map, initialValue, reduce); } else { return executeMultiThreaded(inputReader, map, initialValue, reduce); From 27d1c63448384d0d6b6bf74949608c7a92c42ccf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:56:58 -0400 Subject: [PATCH 080/161] Reduce the number of test combinations in ReadBasedREferenceOrderedView --- .../ReadBasedReferenceOrderedViewUnitTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index d55c48054..eaa098793 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -121,7 +121,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the present of a large spanning element { List oneLargeSpan = new ArrayList(handPickedFeatures); - oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); createTestsForFeatures(oneLargeSpan); } @@ -135,7 +135,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the presence of a partially spanning element at the end { List partialSpanEnd = new ArrayList(handPickedFeatures); - partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); createTestsForFeatures(partialSpanEnd); } @@ -165,7 +165,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); int featuresStop = 1; for ( final Feature f : features ) featuresStop = 
Math.max(featuresStop, f.getEnd()); - for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { final List allIntervals = new ArrayList(); // regularly spaced for ( int start = featuresStart; start < featuresStop; start++) { @@ -256,11 +256,12 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { } // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest, testStateless}); - } + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} } + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); return tests.toArray(new Object[][]{}); } From 59508f82663ce27637c4a968b831cc6796537f1d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:57:29 -0400 Subject: [PATCH 081/161] tasking for n threads should give you n threads in NanoScheduler, not n - 1 --- .../broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 25ed0766d..668c82524 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -80,7 +80,7 @@ public class NanoScheduler { this.mapGroupSize = mapGroupSize; } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads - 1); + this.executor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads); } /** From 863a3d73b8796510ca1461d759115cf1ed4e2f11 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:21:17 -0400 Subject: [PATCH 082/161] Added ThreadSafeMapReduce interface, super of TreeReducible -- A higher level interface to declare parallelism capability of a walker. This interface means that the walker can be multi-threaded, but doesn't necessarily support TreeReducible interface, which forces you to have a combine ReduceType operation that isn't appropriate for parallel read walkers -- Updated ReadWalkers to implement ThreadSafeMapReduce not TreeReducible --- .../sting/gatk/executive/MicroScheduler.java | 19 ++++++++---- .../gatk/iterators/VerifyingSamIterator.java | 5 +-- .../sting/gatk/walkers/FlagStat.java | 7 +---- .../sting/gatk/walkers/PrintReads.java | 7 +---- .../gatk/walkers/ThreadSafeMapReduce.java | 31 +++++++++++++++++++ .../sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/qc/CountReads.java | 5 ++- 7 files changed, 52 insertions(+), 24 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 70201a6cc..417a0982f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,22 +100,29 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { - if(walker.isReduceByInterval()) + if (threadAllocation.getNumCPUThreads() > 1) { + if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - if ( walker instanceof ReadWalker ) + if ( walker instanceof ReadWalker ) { + if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); - else + } else { + // TODO -- update test for when nano scheduling only is an option + if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + } } else { - if(threadAllocation.getNumCPUThreads() > 1) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } } + private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + } + /** * Create a microscheduler given the reads and reference. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 2763bca7c..3ffe95e8b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -48,7 +47,9 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - return last.getAlignmentStart() > cur.getAlignmentStart(); + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 6f28e8726..14d14aca5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements TreeReducible { +public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { @Output PrintStream out; @@ -193,11 +193,6 @@ public class FlagStat extends ReadWalker implements TreeReducible { +public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -245,9 +245,4 @@ public class PrintReads extends ReadWalker impleme output.addAlignment(read); return output; } - - @Override - public SAMFileWriter treeReduce(SAMFileWriter lhs, SAMFileWriter rhs) { - return lhs; // nothing to do - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java new file mode 100755 index 000000000..1ce469f8c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. 
The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface ThreadSafeMapReduce { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index c950e07e4..8621c0e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. 
*/ -public interface TreeReducible { +public interface TreeReducible extends ThreadSafeMapReduce { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 72bda03e9..856ea77f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,12 +41,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements TreeReducible { +public class CountReads extends ReadWalker implements ThreadSafeMapReduce { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } @Override public Integer reduceInit() { return 0; } @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } - @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } } From 7b4caec8cb45504fbeaf5df2c685dcb131f72c83 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:56:36 -0400 Subject: [PATCH 083/161] Fix: GSA-531 ApplyRecalibration writing to BCF: 
java.lang.String cannot be cast to java.lang.Double -- LOD must be added a double to attributes, not as string, so that it can be written out as BCF --- .../walkers/variantrecalibration/ApplyRecalibration.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 011f3471c..158d1e78a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -39,11 +39,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.util.*; @@ -218,7 +218,7 @@ public class ApplyRecalibration extends RodWalker implements T String filterString = null; // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, 
recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { From 82b2845b9f71cebc76d3a5953ab5a2ad4d8a3fe7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:56:36 -0400 Subject: [PATCH 084/161] Fix: GSA-531 ApplyRecalibration writing to BCF: java.lang.String cannot be cast to java.lang.Double -- LOD must be added a double to attributes, not as string, so that it can be written out as BCF --- .../walkers/variantrecalibration/ApplyRecalibration.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 011f3471c..158d1e78a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -39,11 +39,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.util.*; @@ -218,7 +218,7 @@ public class ApplyRecalibration extends RodWalker implements T String filterString = null; // Annotate the new record with its VQSLOD and the worst 
performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { From 5a142fe2656643ac8d2b6b3c356d83f233d8724b Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 30 Aug 2012 17:57:31 -0400 Subject: [PATCH 085/161] After dicussion with Ryan/Eric, the Structural_Indel variant type is now gone, and has been entirely replaced with the access pattern .isStructuralIndel(). This makes it a strict subtype of indel. I agree that this method is a bit more sensible. In addition, fix for GSA-310. If supplied -rf argument does not match a known read filter, the list of read filters will be printed, and users directed to the documentation for more information. --- .../sting/gatk/filters/FilterManager.java | 26 +++++++++++++++++++ .../VariantDataManager.java | 1 - .../utils/classloader/PluginManager.java | 12 ++++++++- .../utils/variantcontext/VariantContext.java | 22 +++++++++------- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index 67f82235d..bddfa6a0d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -25,9 +25,13 @@ package org.broadinstitute.sting.gatk.filters; +import com.google.common.base.Function; +import com.google.common.collect.Collections2; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import java.util.Collection; +import java.util.List; /** * Manage filters and filter options. 
Any requests for basic filtering classes @@ -54,4 +58,26 @@ public class FilterManager extends PluginManager { public Collection> getValues() { return this.getPlugins(); } + + /** + * Rather than use the default error message, print out a list of read filters as well. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return - A wall of text with the default message, followed by a listing of available read filters + */ + @Override + protected String formatErrorMessage(String pluginCategory, String pluginName) { + List> availableFilters = this.getPluginsImplementing(ReadFilter.class); + Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ + + @Override + public String apply(final Class input) { + return getName(input); + } + }); + + return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, + Utils.join(String.format(", "),availableFilterNames), + "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 33a543e39..aacd987d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -286,7 +286,6 @@ public class VariantDataManager { case INDEL: case MIXED: case SYMBOLIC: - case STRUCTURAL_INDEL: return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); default: return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java 
b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9a2cb68db..9f1b6db93 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -277,7 +277,7 @@ public class PluginManager { public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); if( plugin == null ) - throw new UserException(String.format("Could not find %s with name: %s", pluginCategory,pluginName)); + throw new UserException(formatErrorMessage(pluginCategory,pluginName)); try { return plugin.newInstance(); } catch (Exception e) { @@ -330,4 +330,14 @@ public class PluginManager { return pluginName; } + + /** + * Generate the error message for the plugin manager. The message is allowed to depend on the class. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return error message text describing the error + */ + protected String formatErrorMessage(String pluginCategory, String pluginName ) { + return String.format("Could not find %s with name: %s", pluginCategory,pluginName); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 929e53ce7..dd16cf7e1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -457,7 +457,6 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, - STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -531,7 +530,17 @@ public class VariantContext implements Feature { // to enable tribble integratio } public boolean isStructuralIndel() { - return getType() 
== Type.STRUCTURAL_INDEL; + if ( getType() == Type.INDEL ) { + List sizes = getIndelLengths(); + if ( sizes != null ) { + for ( Integer length : sizes ) { + if ( length > MAX_ALLELE_SIZE_FOR_NON_SV ) { + return true; + } + } + } + } + return false; } /** @@ -716,7 +725,7 @@ public class VariantContext implements Feature { // to enable tribble integratio * @return a list of indel lengths ( null if not of type indel or mixed ) */ public List getIndelLengths() { - if ( getType() != Type.INDEL && getType() != Type.MIXED && getType() != Type.STRUCTURAL_INDEL ) { + if ( getType() != Type.INDEL && getType() != Type.MIXED ) { return null; } @@ -1263,13 +1272,6 @@ public class VariantContext implements Feature { // to enable tribble integratio // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. - // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, - // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now - // classify these as structural events, rather than indel events, as we think differently about the mechanism, - // representation, and handling of these events. 
Check for this case here: - if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) - return Type.STRUCTURAL_INDEL; - return Type.INDEL; // old incorrect logic: From 5a9610d87591fb9327e6fac552bdf26cba28a6b3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 10:39:16 -0400 Subject: [PATCH 086/161] ReadShards now default to 10K (up from 1K) reads per samFile up to 250K -- This should help make the inputs for parallel read walkers a little meater, and avoid spinning the shard creation infrastructure so often --- .../sting/gatk/datasources/reads/SAMDataSource.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index c8b654f81..2b88775b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -262,7 +262,7 @@ public class SAMDataSource { else { // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. - ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000)); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); From 7d95176539546585bbc76cfde2866fba64ee83c2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:07:02 -0400 Subject: [PATCH 087/161] Bugfix to compareTo and equals in GenomeLoc -- Yes, GenomeLoc.compareTo was broken. The compareTo function only considered the contig and start position, but not the stop, when comparing genome locs. -- Updated GenomeLoc.compareTo function to account for stop. 
Updated GATK code where necessary to fix resulting problems that depended on this. -- Added unit tests to ensure that hashcode, equals, and compareTo are all correct for GenomeLocs --- .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../broadinstitute/sting/utils/GenomeLoc.java | 5 +- .../sting/utils/GenomeLocUnitTest.java | 56 +++++++++++++++++++ 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index f33dd414b..2763bca7c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -48,9 +48,7 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last ); - GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur ); - return curLoc.compareTo(lastLoc) == -1; + return last.getAlignmentStart() > cur.getAlignmentStart(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 0b35dd599..6df9c9f1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -427,7 +427,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome result = cmpContig; } else { if ( this.getStart() < that.getStart() ) result = -1; - if ( this.getStart() > that.getStart() ) result = 1; + else if ( this.getStart() > that.getStart() ) result = 1; + // these have the same start, so check the 
ends + else if ( this.getStop() < that.getStop() ) result = -1; + else if ( this.getStop() > that.getStop() ) result = 1; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 49778a4d8..122e0265f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -211,4 +212,59 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); } } + + // ------------------------------------------------------------------------------------- + // + // testing comparison, hashcode, and equals + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "GenomeLocComparisons") + public Object[][] createGenomeLocComparisons() { + List tests = new ArrayList(); + + final int start = 10; + for ( int stop = start; stop < start + 3; stop++ ) { + final GenomeLoc g1 = genomeLocParser.createGenomeLoc("chr2", start, stop); + for ( final String contig : Arrays.asList("chr1", "chr2", "chr3")) { + for ( int start2 = start - 1; start2 <= stop + 1; start2++ ) { + for ( int stop2 = start2; stop2 < stop + 2; stop2++ ) { + final GenomeLoc g2 = genomeLocParser.createGenomeLoc(contig, start2, stop2); + + ComparisonResult cmp = ComparisonResult.EQUALS; + if ( contig.equals("chr3") ) cmp = ComparisonResult.LESS_THAN; + else if ( contig.equals("chr1") ) cmp = ComparisonResult.GREATER_THAN; + else if ( start < start2 ) cmp = ComparisonResult.LESS_THAN; + else if ( start > start2 ) cmp = 
ComparisonResult.GREATER_THAN; + else if ( stop < stop2 ) cmp = ComparisonResult.LESS_THAN; + else if ( stop > stop2 ) cmp = ComparisonResult.GREATER_THAN; + + tests.add(new Object[]{g1, g2, cmp}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private enum ComparisonResult { + LESS_THAN(-1), + EQUALS(0), + GREATER_THAN(1); + + final int cmp; + + private ComparisonResult(int cmp) { + this.cmp = cmp; + } + } + + @Test(dataProvider = "GenomeLocComparisons") + public void testGenomeLocComparisons(GenomeLoc g1, GenomeLoc g2, ComparisonResult expected) { + Assert.assertEquals(g1.compareTo(g2), expected.cmp, "Comparing genome locs failed"); + Assert.assertEquals(g1.equals(g2), expected == ComparisonResult.EQUALS); + if ( expected == ComparisonResult.EQUALS ) + Assert.assertEquals(g1.hashCode(), g2.hashCode(), "Equal genome locs don't have the same hash code"); + } } From 7a462399cee869fa345afa3da6b00d14084f9edd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:10:58 -0400 Subject: [PATCH 088/161] Fix GSA-529: Fix RODs for parallel read walkers -- TraverseReadsNano modified to read in all input data before invoking maps, so the input to TraverseReadsNano is a MapData object holding the sam record, the ref context, and the refmetadatatracker. -- Update ValidateRODForReads to be tree reducible, using synchronized map and explicitly sort the output map from locations -> counts in onTraversalDone -- Expanded integration tests to test nt 1, 2, 4. 
--- .../gatk/traversals/TraverseReadsNano.java | 91 +++++++++++-------- .../utils/nanoScheduler/NanoScheduler.java | 5 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 081c6b8fc..b397cb8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -27,16 +27,21 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.ArrayList; +import java.util.List; + /** * @author aaron * @version 1.0 @@ -50,12 +55,13 @@ public class TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final 
boolean DEBUG = false; - final NanoScheduler nanoScheduler; + private static final int MIN_GROUP_SIZE = 100; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = bufferSize / 10 + 1; - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); + nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); } @Override @@ -79,24 +85,42 @@ public class TraverseReadsNano extends TraversalEngine, if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - if ( dataProvider.hasReferenceOrderedData() ) - throw new ReviewedStingException("Parallel read walkers currently don't support access to reference ordered data"); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(reads, reference, rodView, walker); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(reads.iterator().iterator(), myMap, sum, myReduce); + T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); // TODO -- how do we print progress? 
//printProgress(dataProvider.getShard(), ???); return result; } + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new ArrayList(); // TODO -- need size of reads + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + @Override public void printOnTraversalDone() { nanoScheduler.shutdown(); @@ -116,36 +140,31 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { - final ReadView reads; - final ReadReferenceView reference; - final ReadBasedReferenceOrderedView rodView; + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; - private TraverseReadsMap(ReadView reads, ReadReferenceView reference, ReadBasedReferenceOrderedView rodView, ReadWalker walker) { - this.reads = reads; - this.reference = reference; - this.rodView = rodView; + private TraverseReadsMap(ReadWalker walker) { this.walker = walker; } 
@Override - public M apply(final SAMRecord read) { + public M apply(final MapData data) { if ( ! walker.isDone() ) { - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && reference != null - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - //dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); + final boolean keepMeP = walker.filter(data.refContext, data.read); if (keepMeP) { - return walker.map(refContext, (GATKSAMRecord) read, tracker); + return walker.map(data.refContext, data.read, data.tracker); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4bca3728f..25ed0766d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -43,7 +43,8 @@ import java.util.concurrent.*; * Time: 9:47 AM */ public class NanoScheduler { - private static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; final int mapGroupSize; @@ -172,7 +173,7 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); - if ( getnThreads() == 1 ) { + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { return 
executeSingleThreaded(inputReader, map, initialValue, reduce); } else { return executeMultiThreaded(inputReader, map, initialValue, reduce); From 1212dfd2ef97a6847c0a2189c47c36faf1a1b54d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:56:58 -0400 Subject: [PATCH 089/161] Reduce the number of test combinations in ReadBasedREferenceOrderedView --- .../ReadBasedReferenceOrderedViewUnitTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index d55c48054..eaa098793 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -121,7 +121,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the present of a large spanning element { List oneLargeSpan = new ArrayList(handPickedFeatures); - oneLargeSpan.add(new BasicFeature(contig, 1, 100)); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); createTestsForFeatures(oneLargeSpan); } @@ -135,7 +135,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { // test in the presence of a partially spanning element at the end { List partialSpanEnd = new ArrayList(handPickedFeatures); - partialSpanEnd.add(new BasicFeature(contig, 10, 100)); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); createTestsForFeatures(partialSpanEnd); } @@ -165,7 +165,7 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); int featuresStop = 1; for ( final Feature f : features ) featuresStop = 
Math.max(featuresStop, f.getEnd()); - for ( final int size : Arrays.asList(1, 5, 10, 100, 1000) ) { + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { final List allIntervals = new ArrayList(); // regularly spaced for ( int start = featuresStart; start < featuresStop; start++) { @@ -256,11 +256,12 @@ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { } // all 3 way pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - tests.add(new Object[]{singleTest, testStateless}); - } + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} } + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); return tests.toArray(new Object[][]{}); } From 544740d45de3cfd59090e817da8725826bffa73b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 15:57:29 -0400 Subject: [PATCH 090/161] tasking for n threads should give you n threads in NanoScheduler, not n - 1 --- .../broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 25ed0766d..668c82524 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -80,7 +80,7 @@ public class NanoScheduler { this.mapGroupSize = mapGroupSize; } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads - 1); + this.executor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads); } /** From 2f749b5e5271a5ecacfbe406461772e86011fb0f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 16:21:17 -0400 Subject: [PATCH 091/161] Added ThreadSafeMapReduce interface, super of TreeReducible -- A higher level interface to declare parallelism capability of a walker. This interface means that the walker can be multi-threaded, but doesn't necessarily support TreeReducible interface, which forces you to have a combine ReduceType operation that isn't appropriate for parallel read walkers -- Updated ReadWalkers to implement ThreadSafeMapReduce not TreeReducible --- .../sting/gatk/executive/MicroScheduler.java | 19 ++++++++---- .../gatk/iterators/VerifyingSamIterator.java | 5 +-- .../sting/gatk/walkers/FlagStat.java | 7 +---- .../sting/gatk/walkers/PrintReads.java | 7 +---- .../gatk/walkers/ThreadSafeMapReduce.java | 31 +++++++++++++++++++ .../sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/qc/CountReads.java | 5 ++- 7 files changed, 52 insertions(+), 24 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 70201a6cc..417a0982f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,22 +100,29 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) { - if(walker.isReduceByInterval()) + if (threadAllocation.getNumCPUThreads() > 1) { + if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - if ( walker instanceof ReadWalker ) + if ( walker instanceof ReadWalker ) { + if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); - else + } else { + // TODO -- update test for when nano scheduling only is an option + if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + } } else { - if(threadAllocation.getNumCPUThreads() > 1) - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); } } + private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + } + /** * Create a microscheduler given the reads and reference. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 2763bca7c..3ffe95e8b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -48,7 +47,9 @@ public class VerifyingSamIterator implements StingSAMIterator { if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); - return last.getAlignmentStart() > cur.getAlignmentStart(); + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 6f28e8726..14d14aca5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements TreeReducible { +public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { @Output PrintStream out; @@ -193,11 +193,6 @@ public class FlagStat extends ReadWalker implements TreeReducible { +public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -245,9 +245,4 @@ public class PrintReads extends ReadWalker impleme output.addAlignment(read); return output; } - - @Override - public SAMFileWriter treeReduce(SAMFileWriter lhs, SAMFileWriter rhs) { - return lhs; // nothing to do - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java new file mode 100755 index 000000000..1ce469f8c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. 
The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface ThreadSafeMapReduce { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index c950e07e4..8621c0e9d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. 
*/ -public interface TreeReducible { +public interface TreeReducible extends ThreadSafeMapReduce { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 72bda03e9..856ea77f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -6,7 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,12 +41,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements TreeReducible { +public class CountReads extends ReadWalker implements ThreadSafeMapReduce { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } @Override public Integer reduceInit() { return 0; } @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } - @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } } From 39400c56a95f5221b98067cd866f4d4f9a04a572 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 30 Aug 2012 19:41:36 -0400 Subject: [PATCH 092/161] Update md5s for VQSR, as VQSLOD is now a double and 
gets the standard double precision treatment in VCF --- ...VariantRecalibrationWalkersIntegrationTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index b780bcd00..aec087f2c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { private static class VRTest { @@ -28,7 +28,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", "f360ce3eb2b0b887301be917a9843e2b", // tranches "287fea5ea066bf3fdd71f5ce9b58eab3", // recal file - "356b9570817b9389da71fbe991d8b2f5"); // cut VCF + "afa297c743437551cc2bd36ddd6d6d75"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -77,7 +77,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", "a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches "74c10fc15f9739a938b7138909fbde04", // recal file - "62fda105e14b619a1c263855cf56af1d"); // cut VCF + "c30d163871a37f2bbf8ee7f761e870b4"); // cut VCF @DataProvider(name = "VRBCFTest") 
public Object[][] createVRBCFTest() { @@ -129,13 +129,13 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "64f576881e21323dd4078262604717a2"); // cut VCF + "b2c6827be592c24a4692b1753edc7d23"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "af22c55d91394c56a222fd40d6d54781"); // cut VCF + "5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -193,7 +193,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("ec519e1f01459813dab57aefffc019e2")); + Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b")); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); } } From ac0c44720b4c5d616bc15587b3742b440ee0d008 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Aug 2012 22:49:13 -0400 Subject: [PATCH 093/161] I started to put together a set of unit tests for the PileupElement creation functionality of LocusIteratorByState and found pretty quickly that it's definitely still busted for indels. The data provider is nowhere near comprehensive yet, but I need to sit back and think about how to really test some of the functionality of LIBS. Committing what I have for now because at the very least it'll be helpful going forward (failing tests are commented out with TODO). 
--- .../LocusIteratorByStateUnitTest.java | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index edd97f17f..4480acacd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -19,6 +19,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -255,6 +256,90 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } } + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), 
Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + + int offset = 0; + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + 
Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + final int flag = params.flags.get(offset); + Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); + Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + + Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); + + offset++; + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + private static ReadProperties createTestReadProperties() { return new ReadProperties( Collections.emptyList(), From 817ece37a20cf935a9f38cc27b7618e45f5e1dfd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 11:42:50 -0400 Subject: [PATCH 096/161] General infrastructure for ReadTransformers -- These are like read filters but can be applied either on input, on output, of handled by the walker -- Previous example of BAQ now uses the general framework -- Resulted in massive conceptual cleanup of SAMDataSource and ReadProperties! Yeah! -- BQSR now uses this framework. 
We can now do BQSR on input, on output, or within a walker -- PrintReads now handles all read transformers in the walker in map, enabling us to parallelize PrintReads with BAQ and BQSR -- Currently BQSR is excepting in parallel, which subsequent commit with fix -- Removed global variable setting in GenomeAnalysisEngine for BAQ, as command line parameters are cleanly handled by ReadTransformer infrastructure -- In principle ReadFilters are just a special kind of ReadTransformer, but this refactoring is larger than I can do. It's a JIRA entry -- Many files touched simply due to the refactoring and renaming of classes --- .../haplotypecaller/HaplotypeCaller.java | 14 +- .../sting/gatk/GenomeAnalysisEngine.java | 58 +++++-- .../sting/gatk/ReadProperties.java | 38 ++--- .../sting/gatk/WalkerManager.java | 9 +- .../gatk/datasources/reads/SAMDataSource.java | 41 ++--- .../gatk/io/stubs/SAMFileWriterStub.java | 40 +++-- .../sting/gatk/iterators/ReadTransformer.java | 144 ++++++++++++++++++ .../gatk/iterators/ReadTransformersMode.java | 28 ++++ .../sting/gatk/walkers/BAQMode.java | 4 +- .../sting/gatk/walkers/PrintReads.java | 20 ++- .../sting/gatk/walkers/Walker.java | 5 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 6 +- .../walkers/genotyper/UnifiedGenotyper.java | 3 +- .../gatk/walkers/indels/IndelRealigner.java | 3 +- .../indels/RealignerTargetCreator.java | 4 +- .../broadinstitute/sting/utils/baq/BAQ.java | 20 +-- .../sting/utils/baq/BAQReadTransformer.java | 49 ++++++ .../sting/utils/baq/BAQSamIterator.java | 59 ------- .../utils/baq/ReadTransformingIterator.java | 44 ++++++ .../sting/utils/recalibration/BQSRMode.java | 30 ++++ .../recalibration/BQSRReadTransformer.java | 40 +++++ .../utils/recalibration/BQSRSamIterator.java | 50 ------ 22 files changed, 485 insertions(+), 224 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java create mode 100644 
public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 3d41b7233..f4d8a88e0 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -27,24 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.collections.Pair; @@ -52,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -101,7 +101,7 @@ import java.util.*; @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) @PartitionBy(PartitionType.LOCUS) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionExtension(extension=65, maxRegion=300) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 00614b9aa..b9b5e452d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -42,6 +42,8 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; @@ -49,8 +51,8 @@ import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -131,6 +133,11 @@ public class GenomeAnalysisEngine { */ private Collection filters; + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + /** * Controls the allocation of threads between CPU vs IO. 
*/ @@ -354,6 +361,39 @@ public class GenomeAnalysisEngine { return Collections.unmodifiableList(filters); } + /** + * Returns a list of active, initialized read transformers + * + * @param walker the walker we need to apply read transformers too + * @return a non-null list of read transformers + */ + public void initializeReadTransformers(final Walker walker) { + final List activeTransformers = new ArrayList(); + + final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); + final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; + + final PluginManager pluginManager = new PluginManager(ReadTransformer.class); + + for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { + transformer.initialize(overrideTime, this, walker); + if ( transformer.enabled() ) + activeTransformers.add(transformer); + } + + setReadTransformers(activeTransformers); + } + + public List getReadTransformers() { + return readTransformers; + } + + private void setReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new ReviewedStingException("read transformers cannot be null"); + this.readTransformers = readTransformers; + } + /** * Parse out the thread allocation from the given command-line argument. 
*/ @@ -419,9 +459,6 @@ public class GenomeAnalysisEngine { argCollection.setDownsamplingMethod(method); } - public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); } - public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); } - protected boolean includeReadsWithDeletionAtLoci() { return walker.includeReadsWithDeletionAtLoci(); } @@ -702,13 +739,12 @@ public class GenomeAnalysisEngine { protected void initializeDataSources() { logger.info("Strictness is " + argCollection.strictnessLevel); - // TODO -- REMOVE ME - BAQ.DEFAULT_GOP = argCollection.BAQGOP; - validateSuppliedReference(); setReferenceDataSource(argCollection.referenceFile); validateSuppliedReads(); + initializeReadTransformers(walker); + readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); for (ReadFilter filter : filters) @@ -795,9 +831,6 @@ public class GenomeAnalysisEngine { // interrogating for the downsample method during command line recreation. setDownsamplingMethod(method); - if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) - throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); @@ -817,11 +850,8 @@ public class GenomeAnalysisEngine { method, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, + readTransformers, includeReadsWithDeletionAtLoci(), - getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? 
argCollection.BAQMode : BAQ.CalculationMode.OFF, - getWalkerBAQQualityMode(), - refReader, - getBaseRecalibration(), argCollection.defaultBaseQualities, removeProgramRecords); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index e02b9d5af..b2d4d202d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -1,15 +1,14 @@ package org.broadinstitute.sting.gatk; -import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import java.util.Collection; +import java.util.List; /** * User: hanna * Date: May 14, 2009 @@ -34,12 +33,9 @@ public class ReadProperties { private final DownsamplingMethod downsamplingMethod; private final ValidationExclusion exclusionList; private final Collection supplementalFilters; + private final List readTransformers; private final boolean includeReadsWithDeletionAtLoci; private final boolean useOriginalBaseQualities; - private final BAQ.CalculationMode cmode; - private final BAQ.QualityMode qmode; - private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired - private final BaseRecalibration bqsrApplier; private final byte defaultBaseQualities; /** @@ -95,6 +91,11 @@ public class ReadProperties { return supplementalFilters; } + + public List getReadTransformers() { + return readTransformers; + } + /** * Return whether to use original base qualities. 
* @return Whether to use original base qualities. @@ -103,16 +104,6 @@ public class ReadProperties { return useOriginalBaseQualities; } - - public BAQ.QualityMode getBAQQualityMode() { return qmode; } - public BAQ.CalculationMode getBAQCalculationMode() { return cmode; } - - public IndexedFastaSequenceFile getRefReader() { - return refReader; - } - - public BaseRecalibration getBQSRApplier() { return bqsrApplier; } - /** * @return Default base quality value to fill reads missing base quality information. */ @@ -134,9 +125,6 @@ public class ReadProperties { * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * will explicitly list reads with deletion over the current reference base; otherwise, only observed * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param cmode How should we apply the BAQ calculation to the reads? - * @param qmode How should we apply the BAQ calculation to the reads? - * @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. */ public ReadProperties( Collection samFiles, @@ -146,11 +134,8 @@ public class ReadProperties { DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, Collection supplementalFilters, + List readTransformers, boolean includeReadsWithDeletionAtLoci, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, byte defaultBaseQualities) { this.readers = samFiles; this.header = header; @@ -158,12 +143,9 @@ public class ReadProperties { this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; this.exclusionList = exclusionList == null ? 
new ValidationExclusion() : exclusionList; this.supplementalFilters = supplementalFilters; + this.readTransformers = readTransformers; this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; this.useOriginalBaseQualities = useOriginalBaseQualities; - this.cmode = cmode; - this.qmode = qmode; - this.refReader = refReader; - this.bqsrApplier = bqsrApplier; this.defaultBaseQualities = defaultBaseQualities; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index 8843d4bfe..ae59ce438 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -29,13 +29,14 @@ import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet; import org.broadinstitute.sting.utils.text.TextFormattingUtils; +import java.lang.annotation.Annotation; import java.util.*; /** @@ -319,11 +320,11 @@ public class WalkerManager extends PluginManager { return downsamplingMethod; } - public static BAQ.QualityMode getBAQQualityMode(Walker walker) { - return walker.getClass().getAnnotation(BAQMode.class).QualityMode(); + public static T getWalkerAnnotation(final Walker walker, final Class clazz) { + return walker.getClass().getAnnotation(clazz); } - public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) { + public static 
ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 2b88775b1..7d027438b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.sam.MergingSamRecordIterator; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.*; @@ -42,12 +41,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.baq.BAQSamIterator; +import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator; -import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; @@ -200,11 +196,8 @@ public class SAMDataSource { downsamplingMethod, exclusionList, supplementalFilters, + Collections.emptyList(), includeReadsWithDeletionAtLoci, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, false); } @@ -234,11 +227,8 @@ public class SAMDataSource { DownsamplingMethod downsamplingMethod, ValidationExclusion exclusionList, 
Collection supplementalFilters, + List readTransformers, boolean includeReadsWithDeletionAtLoci, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, byte defaultBaseQualities, boolean removeProgramRecords) { this.readMetrics = new ReadMetrics(); @@ -308,11 +298,8 @@ public class SAMDataSource { downsamplingMethod, exclusionList, supplementalFilters, + readTransformers, includeReadsWithDeletionAtLoci, - cmode, - qmode, - refReader, - bqsrApplier, defaultBaseQualities); // cache the read group id (original) -> read group id (merged) @@ -603,10 +590,7 @@ public class SAMDataSource { readProperties.getDownsamplingMethod().toFraction, readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getSupplementalFilters(), - readProperties.getBAQCalculationMode(), - readProperties.getBAQQualityMode(), - readProperties.getRefReader(), - readProperties.getBQSRApplier(), + readProperties.getReadTransformers(), readProperties.defaultBaseQualities()); } @@ -673,10 +657,7 @@ public class SAMDataSource { Double downsamplingFraction, Boolean noValidationOfReadOrder, Collection supplementalFilters, - BAQ.CalculationMode cmode, - BAQ.QualityMode qmode, - IndexedFastaSequenceFile refReader, - BaseRecalibration bqsrApplier, + List readTransformers, byte defaultBaseQualities) { // *********************************************************************************** // @@ -698,11 +679,11 @@ public class SAMDataSource { // only wrap if we are replacing the original qualities or using a default base quality wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - if (bqsrApplier != null) - wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier); - - if (cmode != BAQ.CalculationMode.OFF) - wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode); + // set up read 
transformers + for ( final ReadTransformer readTransformer : readTransformers ) { + if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) + wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); + } return wrappedIterator; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java index d8e59a3dd..d2e7066e9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java @@ -31,12 +31,16 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; /** * A stub for routing and management of SAM file reading and writing. @@ -116,15 +120,15 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite */ private boolean simplifyBAM = false; + private List onOutputReadTransformers = null; + /** * Create a new stub given the requested SAM file and compression level. * @param engine source of header data, maybe other data about input files. * @param samFile SAM file to (ultimately) create. 
*/ public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { - this.engine = engine; - this.samFile = samFile; - this.samOutputStream = null; + this(engine, samFile, null); } /** @@ -133,8 +137,12 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite * @param stream Output stream to which data should be written. */ public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { + this(engine, null, stream); + } + + private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { this.engine = engine; - this.samFile = null; + this.samFile = samFile; this.samOutputStream = stream; } @@ -274,17 +282,29 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite this.headerOverride = header; } + private void initializeReadTransformers() { + this.onOutputReadTransformers = new ArrayList(engine.getReadTransformers().size()); + for ( final ReadTransformer transformer : engine.getReadTransformers() ) { + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) + onOutputReadTransformers.add(transformer); + } + } + /** * @{inheritDoc} */ - public void addAlignment( SAMRecord alignment ) { - if ( engine.getArguments().BAQMode != BAQ.CalculationMode.OFF && engine.getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_OUTPUT ) { - //System.out.printf("Writing BAQ at OUTPUT TIME%n"); - baqHMM.baqRead(alignment, engine.getReferenceDataSource().getReference(), engine.getArguments().BAQMode, engine.getWalkerBAQQualityMode()); - } + public void addAlignment( final SAMRecord readIn ) { + if ( onOutputReadTransformers == null ) + initializeReadTransformers(); + + GATKSAMRecord workingRead = (GATKSAMRecord)readIn; + + // run on output read transformers + for ( final ReadTransformer transform : onOutputReadTransformers ) + workingRead = transform.apply(workingRead); writeStarted = true; - outputTracker.getStorage(this).addAlignment(alignment); + 
outputTracker.getStorage(this).addAlignment(workingRead); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java new file mode 100644 index 000000000..d307789f3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -0,0 +1,144 @@ +package org.broadinstitute.sting.gatk.iterators; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Baseclass used to describe a read transformer like BAQ and BQSR + * + * Read transformers are plugable infrastructure that modify read state + * either on input, on output, or within walkers themselves. + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * Initialize is called once right before the GATK traversal begins providing + * the ReadTransformer with the ability to collect and initialize data from the + * engine. + * + * Note that all ReadTransformers within the classpath are created and initialized. If one + * shouldn't be run it should look at the command line options of the engine and override + * the enabled. + * + * @since 8/31/12 + * @author depristo + */ +abstract public class ReadTransformer { + /** + * When should this read transform be applied? + */ + private ApplicationTime applicationTime; + + /** + * Keep track of whether we've been initialized already, and ensure it's not called more than once. + */ + private boolean initialized = false; + + protected ReadTransformer() {} + + /** + * Master initialization routine. 
Called to setup a ReadTransform, using it's overloaded initialialSub routine. + * + * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + */ + @Requires({"initialized == false", "engine != null", "walker == null"}) + @Ensures("initialized == true") + public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { + if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); + if ( walker == null ) throw new IllegalArgumentException("walker cannot be null"); + + this.applicationTime = initializeSub(engine, walker); + if ( overrideTime != null ) this.applicationTime = overrideTime; + initialized = true; + } + + /** + * Subclasses must override this to initialize themeselves + * + * @param engine the engine, for initializing values + * @param walker the walker we intend to run + * @return the point of time we'd like this read transform to be run + */ + @Requires({"engine != null", "walker != null"}) + @Ensures("result != null") + protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker); + + /** + * Should this ReadTransformer be activated? Called after initialize, which allows this + * read transformer to look at its arguments and decide if it should be active. All + * ReadTransformers must override this, as by default they are not enabled. + * + * @return true if this ReadTransformer should be used on the read stream + */ + public boolean enabled() { + return false; + } + + /** + * Has this transformer been initialized? + * + * @return true if it has + */ + public final boolean isInitialized() { + return initialized; + } + + /** + * When should we apply this read transformer? 
+ * + * @return true if yes + */ + public final ApplicationTime getApplicationTime() { + return applicationTime; + } + + /** + * Primary interface function for a read transform to actually do some work + * + * The function apply() is called on each read seen by the GATK (after passing + * all ReadFilters) and it can do as it sees fit (without modifying the alignment) + * to the read to change qualities, add tags, etc. + * + * @param read the read to transform + * @return the transformed read + */ + @Requires("read != null") + @Ensures("result != null") + abstract public GATKSAMRecord apply(final GATKSAMRecord read); + + @Override + public String toString() { + return getClass().getSimpleName(); + } + + /** + * When should a read transformer be applied? + */ + public static enum ApplicationTime { + /** + * Walker does not tolerate this read transformer + */ + FORBIDDEN, + + /** + * apply the transformation to the incoming reads, the default + */ + ON_INPUT, + + /** + * apply the transformation to the outgoing read stream + */ + ON_OUTPUT, + + /** + * the walker will deal with the calculation itself + */ + HANDLED_IN_WALKER + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java new file mode 100644 index 000000000..be227619f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.gatk.iterators; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. 
+ */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ReadTransformersMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java index 03097887d..42582f178 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/BAQMode.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + import java.lang.annotation.*; /** @@ -25,5 +27,5 @@ import java.lang.annotation.*; @Target(ElementType.TYPE) public @interface BAQMode { public abstract org.broadinstitute.sting.utils.baq.BAQ.QualityMode QualityMode() default org.broadinstitute.sting.utils.baq.BAQ.QualityMode.OVERWRITE_QUALS; - public abstract org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime ApplicationTime() default org.broadinstitute.sting.utils.baq.BAQ.ApplicationTime.ON_INPUT; + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 52ed20ef9..dca23ae66 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; @@ -91,7 +93,8 @@ import java.util.TreeSet; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { @@ -217,11 +220,20 @@ public class PrintReads extends ReadWalker impleme * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a GATKSAMRecord + * @param readIn the read itself, as a GATKSAMRecord * @return the read itself */ - public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker ) { - return simplifyReads ? 
read.simplify() : read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + GATKSAMRecord workingRead = readIn; + + for ( final ReadTransformer transformer : getToolkit().getReadTransformers() ) { + if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); + workingRead = transformer.apply(workingRead); + } + + if ( simplifyReads ) workingRead = workingRead.simplify(); + + return workingRead; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 6cd2e8aea..4478f8515 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -30,12 +30,14 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.recalibration.BQSRMode; import java.util.List; @@ -48,7 +50,8 @@ import java.util.List; */ @ReadFilters(MalformedReadFilter.class) @PartitionBy(PartitionType.NONE) -@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) 
@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) public abstract class Walker { final protected static Logger logger = Logger.getLogger(Walker.class); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 30d2e24ef..443b493be 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -32,10 +32,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -46,6 +45,7 @@ import org.broadinstitute.sting.utils.recalibration.QuantizationInfo; import org.broadinstitute.sting.utils.recalibration.RecalUtils; import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -104,7 +104,7 @@ import java.util.ArrayList; */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = 
{CommandLineGATK.class} ) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @By(DataSource.READS) @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 507806fbe..93928a780 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; @@ -117,7 +118,7 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d9b71f938..76d8d85c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -111,7 +112,7 @@ import java.util.*; * @author ebanks */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index fc6df6902..a52d57031 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -33,10 +33,10 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; +import 
org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -101,7 +101,7 @@ import java.util.TreeSet; @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker implements TreeReducible { /** diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 439a0d8ed..cf4d699ee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -52,13 +52,6 @@ public class BAQ { DONT_MODIFY // do the BAQ, but don't modify the quality scores themselves, just return them in the function. } - public enum ApplicationTime { - FORBIDDEN, // Walker does not tolerate BAQ input - ON_INPUT, // apply the BAQ calculation to the incoming reads, the default - ON_OUTPUT, // apply the BAQ calculation to outgoing read streams - HANDLED_IN_WALKER // the walker will deal with the BAQ calculation status itself - } - public static final String BAQ_TAG = "BQ"; private static double[] qual2prob = new double[256]; @@ -68,7 +61,7 @@ public class BAQ { } // Phred scaled now (changed 1/10/2011) - public static double DEFAULT_GOP = 40; + public static final double DEFAULT_GOP = 40; /* Takes a Phred Scale quality score and returns the error probability. 
* @@ -110,10 +103,19 @@ public class BAQ { * Use defaults for everything */ public BAQ() { - cd = convertFromPhredScale(DEFAULT_GOP); + this(DEFAULT_GOP); + } + + /** + * Use defaults for everything + */ + public BAQ(final double gapOpenPenalty) { + cd = convertFromPhredScale(gapOpenPenalty); initializeCachedData(); } + + /** * Create a new HmmGlocal object with specified parameters * diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java new file mode 100644 index 000000000..4589ffb71 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.utils.baq; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == 
BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedStingException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java deleted file mode 100644 index adfeef518..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.utils.baq; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Iterator; - -/** - * Simple iterator that applies Heng's BAQ calculation to a stream of incoming reads - */ -public class BAQSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BAQ baqHMM = new BAQ(); // creates a BAQ creator with default parameters - private final IndexedFastaSequenceFile refReader; - private final BAQ.CalculationMode cmode; - private final BAQ.QualityMode qmode; - - /** - * Creates a new BAMSamIterator using the reference getter refReader and applies the BAM to the reads coming - * in from it. See BAQ docs for baqType information. 
- * - * @param refReader - * @param it - * @param cmode - * @param qmode - */ - @Requires({ - "refReader != null", - "it != null", - "cmode != null" , - "qmode != null"}) - public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.CalculationMode cmode, BAQ.QualityMode qmode) { - if ( cmode == BAQ.CalculationMode.OFF ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF"); - if ( qmode == BAQ.QualityMode.DONT_MODIFY ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with quailty mode DONT_MODIFY"); - - this.refReader = refReader; - this.it = it; - this.cmode = cmode; - this.qmode = qmode; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - //System.out.printf("BAQing during input%n"); - SAMRecord read = it.next(); - baqHMM.baqRead(read, refReader, cmode, qmode); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java new file mode 100644 index 000000000..028e75226 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.utils.baq; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Iterator that applies a ReadTransformer to a stream of reads + 
*/ +public class ReadTransformingIterator implements StingSAMIterator { + private final StingSAMIterator it; + private final ReadTransformer transformer; + + /** + * Creates a new ReadTransforming iterator + */ + @Requires({"it != null", "engine != null", "transformer != null", "transformer.isInitialized()"}) + public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { + if ( ! transformer.isInitialized() ) + throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) + throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); + + this.it = it; + this.transformer = transformer; + } + + @Requires("hasNext()") + @Ensures("result != null") + public SAMRecord next() { + final GATKSAMRecord read = (GATKSAMRecord)it.next(); + return transformer.apply(read); + } + + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java new file mode 100644 index 000000000..431014032 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. 
+ * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..fae0e8c09 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java @@ -0,0 +1,40 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBaseRecalibration(); + this.bqsr = engine.getBaseRecalibration(); + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. 
+ */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java deleted file mode 100644 index 048f8e58c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 2/13/12 - */ - -public class BQSRSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BaseRecalibration bqsr; - - /** - * Creates a new BQSRSamIterator and applies BQSR on the fly to incoming reads. 
- * - * @param it The incoming SamIterator to wrap - * @param bqsr The object which holds the BQSR table information and knows how to apply it - */ - @Requires({ - "it != null", - "bqsr != null"}) - public BQSRSamIterator(StingSAMIterator it, BaseRecalibration bqsr) { - if ( bqsr == null ) throw new ReviewedStingException("BUG: shouldn't create BQSRSamIterator with null recalibration object"); - - this.it = it; - this.bqsr = bqsr; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - SAMRecord read = it.next(); - bqsr.recalibrateRead((GATKSAMRecord) read); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} From cf91d894e4c17d9a7af17abc1bdadecf3443e5bf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 11:56:40 -0400 Subject: [PATCH 097/161] Fix build problems with tests --- .../utils/baq/ReadTransformingIterator.java | 2 +- .../reads/DownsamplerBenchmark.java | 23 ++++++++--------- .../reads/SAMDataSourceUnitTest.java | 24 ++++++------------ .../LocusIteratorByStateUnitTest.java | 25 +++++++++---------- 4 files changed, 31 insertions(+), 43 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java index 028e75226..18ab9e01a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -19,7 +19,7 @@ public class ReadTransformingIterator implements StingSAMIterator { /** * Creates a new ReadTransforming iterator */ - @Requires({"it != null", "engine != null", "transformer != null", "transformer.isInitialized()"}) + 
@Requires({"it != null", "transformer != null", "transformer.isInitialized()"}) public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { if ( ! transformer.isInitialized() ) throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 477b76e37..5aeb741ec 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -36,8 +36,8 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; import java.util.Collections; import java.util.Iterator; @@ -69,18 +69,15 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { for(int i = 0; i < reps; i++) { SAMFileReader reader = new SAMFileReader(inputFile); ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), - reader.getFileHeader(), - false, - SAMFileReader.ValidationStringency.SILENT, - downsampling.create(), - new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), - Collections.emptyList(), - false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR - (byte)0); + reader.getFileHeader(), + false, + SAMFileReader.ValidationStringency.SILENT, + downsampling.create(), + new 
ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index f2c546317..730b3f410 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -24,9 +24,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.fail; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMProgramRecord; @@ -35,24 +32,25 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; - import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; -import java.util.Iterator; +import java.util.Collections; import java.util.List; +import static org.testng.Assert.*; + /** * @author aaron * @version 1.0 @@ -183,11 +181,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); @@ -205,11 +200,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 4480acacd..fbc063ab6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -4,25 +4,27 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import 
org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * testing of the LocusIteratorByState @@ -349,11 +351,8 @@ public class LocusIteratorByStateUnitTest extends BaseTest { null, new ValidationExclusion(), Collections.emptyList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1 ); } From e028901d54d07330a65da9a9bff739e1e6f36f32 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:40:33 -0400 Subject: [PATCH 098/161] Fixed bad contract in ReadTransformer --- .../broadinstitute/sting/gatk/iterators/ReadTransformer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java index d307789f3..28348ecc2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java @@ -47,7 +47,7 @@ abstract public class ReadTransformer { * @param engine the engine, for initializing values * @param walker the walker we intend to run */ - @Requires({"initialized == false", "engine != null", "walker == null"}) + @Requires({"initialized == false", "engine != null", "walker != null"}) @Ensures("initialized == true") public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) { if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); From 27ddebee53e7d6b808c82dec5dd8849cd5014dd0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:41:03 -0400 Subject: [PATCH 099/161] Protect PrintReads from strange state from TraverseReadsUnitTests --- .../broadinstitute/sting/gatk/walkers/PrintReads.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index dca23ae66..a5d4b45b6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -41,10 +41,7 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.util.Collection; -import java.util.Random; -import java.util.Set; -import java.util.TreeSet; +import java.util.*; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. 
@@ -141,6 +138,7 @@ public class PrintReads extends ReadWalker impleme public boolean simplifyReads = false; + List readTransformers = Collections.emptyList(); private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -153,6 +151,9 @@ public class PrintReads extends ReadWalker impleme if ( platform != null ) platform = platform.toUpperCase(); + if ( getToolkit() != null ) + readTransformers = getToolkit().getReadTransformers(); + Collection samplesFromFile; if (!sampleFile.isEmpty()) { samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); @@ -226,7 +227,7 @@ public class PrintReads extends ReadWalker impleme public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { GATKSAMRecord workingRead = readIn; - for ( final ReadTransformer transformer : getToolkit().getReadTransformers() ) { + for ( final ReadTransformer transformer : readTransformers ) { if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); workingRead = transformer.apply(workingRead); } From c9ea213c9bc1de56180a727f6e532b94c8cb4408 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 13:42:29 -0400 Subject: [PATCH 100/161] Make BaseRecalibration thread-safe -- In the process uncovered two strange things 1 -- qualityScoreByFullCovariateKey was created but never used. Seems like a cache? 
2 -- Discovered nasty bug in BaseRecalibrator: https://jira.broadinstitute.org/browse/GSA-534 --- .../recalibration/BaseRecalibration.java | 34 ++++++++++++++----- .../utils/recalibration/ReadCovariates.java | 13 +++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index a563b18fc..0af7deec4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.utils.recalibration; import net.sf.samtools.SAMTag; import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; @@ -46,7 +45,6 @@ import java.io.File; public class BaseRecalibration { private final static int MAXIMUM_RECALIBRATED_READ_LENGTH = 5000; - private final ReadCovariates readCovariates; private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private final RecalibrationTables recalibrationTables; @@ -56,10 +54,23 @@ public class BaseRecalibration { private final int preserveQLessThan; private final boolean emitOriginalQuals; - private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of 
performSequentialQualityCalculation(..) for all sets of covariate values. - static { - for (int i = 0; i < EventType.values().length; i++) - qualityScoreByFullCovariateKey[i] = new NestedHashMap(); + // TODO -- was this supposed to be used somewhere? +// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. +// static { +// for (int i = 0; i < EventType.values().length; i++) +// qualityScoreByFullCovariateKey[i] = new NestedHashMap(); +// } + + /** + * Thread local cache to allow multi-threaded use of this class + */ + private ThreadLocal readCovariatesCache; + { + readCovariatesCache = new ThreadLocal () { + @Override protected ReadCovariates initialValue() { + return new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); + } + }; } /** @@ -81,7 +92,6 @@ public class BaseRecalibration { else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); - readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); this.disableIndelQuals = disableIndelQuals; this.preserveQLessThan = preserveQLessThan; this.emitOriginalQuals = emitOriginalQuals; @@ -104,6 +114,11 @@ public class BaseRecalibration { } // Compute all covariates for the read + // TODO -- the need to clear here suggests there's an error in the indexing / assumption code + // TODO -- for BI and DI. Perhaps due to the indel buffer size on the ends of the reads? 
+ // TODO -- the output varies with -nt 1 and -nt 2 if you don't call clear here + // TODO -- needs to be fixed. + final ReadCovariates readCovariates = readCovariatesCache.get().clear(); RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings @@ -130,6 +145,7 @@ public class BaseRecalibration { } } + /** * Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. @@ -147,7 +163,7 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - protected byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { + private byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { final byte qualFromRead = (byte)(long)key[1]; final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index c86bd4deb..2b682f84b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; +import java.util.Arrays; + /** * The object temporarily held by a read that describes all of it's covariates. * @@ -21,6 +23,17 @@ public class ReadCovariates { currentCovariateIndex = index; } + /** + * Necessary due to bug in BaseRecalibration recalibrateRead function. 
It is clearly seeing space it's not supposed to + * @return + */ + public ReadCovariates clear() { + for ( int i = 0; i < keys.length; i++ ) + for ( int j = 0; j < keys[i].length; j++) + Arrays.fill(keys[i][j], 0); + return this; + } + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; From 5ea7cd6dcc612e8e284a4faaccc0222302565e0f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 31 Aug 2012 14:01:54 -0400 Subject: [PATCH 101/161] Updating resource bundle: no reason to include both genotype and sites files for Omni and HM3, sites are enough. Also, don't include duplicate entry for the Mills indels. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 3dc953361..08496e284 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -125,17 +125,17 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", "dbsnp_135", b37, true, false)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf", - "1000G_omni2.5", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", + "1000G_omni2.5", b37, true, false)) - addResource(new 
Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", - "hapmap_3.3", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf", + "hapmap_3.3", b37, true, false)) addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", "1000G_phase1.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", - "Mills_and_1000G_gold_standard.indels", b37, true, true)) + "Mills_and_1000G_gold_standard.indels", b37, true, false)) // // example call set for wiki tutorial From 277ba94c7bff86ac6c67955e64b313b4f0e50707 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 31 Aug 2012 14:06:29 -0400 Subject: [PATCH 102/161] Update from dbsnp135 to dbsnp137. --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 08496e284..5e66520ca 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -122,8 +122,8 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. 
Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", - "dbsnp_135", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_137_b37.leftAligned.vcf", + "dbsnp_137", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", "1000G_omni2.5", b37, true, false)) From 1b0ce511a61bc6d1906e6817bc376d6851920f7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 19:50:20 -0400 Subject: [PATCH 103/161] Updating BQSR tests due to my change to reset BQSR calibration data --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index bd75806dd..85615962c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -127,9 +127,9 @@ public class BQSRIntegrationTest extends WalkerTest { @DataProvider(name = "PRTest") public Object[][] createPRTestData() { return new Object[][]{ - {new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")}, - {new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")}, - {new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")}, + {new PRTest("", "1532242f9fe90ef759a0faa5d85f61fb")}, + {new PRTest(" -qq -1", "3dd2c87915c96ac55c3872026574d8cb")}, + {new PRTest(" -qq 6", "5d012ee224f1cb4a7afac59e3655e20c")}, {new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")} }; } From 52d6bea8045c2f83124c31fecd83409f7ac8dc9b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 11:08:36 -0400 Subject: [PATCH 
104/161] a few more useful git ignores --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 8623fa076..456794cea 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,10 @@ queueScatterGather /bar* integrationtests/ public/testdata/onTheFlyOutputTest.vcf +private/testdata/onTheFlyOutputTest.vcf +lib +html +gatkdocs +dist +build +resources From 0892f2b8b2196a779bb9eb433b73854168c3fb3b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 3 Sep 2012 20:18:56 -0400 Subject: [PATCH 105/161] Closing GSA-287:LocusReferenceView doesn't do very well in the case where contigs land off the end of the reference -- Confirmed that reads spanning off the end of the chromosome don't cause an exception by adding integration test for a single read that starts 7 bases from the end of chromosome 1 and spans 90 bases or so off. Added pileup integration test to ensure this behavior continues to work --- .../walkers/PileupWalkerIntegrationTest.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 9d9b91872..667b325ed 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -5,15 +5,7 @@ import org.testng.annotations.Test; import java.util.Arrays; -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Dec 1, 2009 - * Time: 9:03:34 AM - * To change this template use File | Settings | File Templates. 
- */ public class PileupWalkerIntegrationTest extends WalkerTest { - @Test public void testGnarleyFHSPileup() { String gatk_args = "-T Pileup -I " + validationDataLocation + "FHS_Pileup_Test.bam " @@ -23,4 +15,14 @@ public class PileupWalkerIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5)); executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + @Test + public void testSingleReadAligningOffChromosome1() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.bam" + + " -R " + b37KGReference + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + executeTest("Testing single read spanning off chromosome 1", spec); + } } From c9944d81ef935223efd10643643be33f13ae0b06 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 4 Sep 2012 10:33:37 -0400 Subject: [PATCH 106/161] Skip array needs to also be used in the updateDataForRead function of the delocalized BQSR. 
--- .../bqsr/AdvancedRecalibrationEngine.java | 74 ++++++++++--------- .../walkers/bqsr/RecalibrationEngine.java | 2 +- .../bqsr/StandardRecalibrationEngine.java | 2 +- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index e5c952b76..ff1754a10 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -106,47 +106,49 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp } @Override - public synchronized void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); + if( !skip[offset] ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); - tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; - tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; - tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; + 
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; + tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; + tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = tempQualArray[eventIndex]; - final double isError = tempFractionalErrorArray[eventIndex]; + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = tempQualArray[eventIndex]; + final double isError = tempFractionalErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); - - final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(1.0, isError); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final 
RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); + final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); + final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); + final RecalDatum rgThisDatum = createDatumObject(qual, isError); + if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it + rgRecalTable.put(rgThisDatum, keys[0], eventIndex); else - covPreviousDatum.increment(1.0, isError); + rgPreviousDatum.combine(rgThisDatum); + + final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); + if (qualPreviousDatum == null) + qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); + else + qualPreviousDatum.increment(1.0, isError); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); + final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); + if (covPreviousDatum == null) + covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); + else + covPreviousDatum.increment(1.0, isError); + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index ab65c1462..ce60f5a3a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -35,5 +35,5 @@ public interface RecalibrationEngine { 
public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase); - public void updateDataForRead(final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 76a82a134..2b0f8ca80 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -93,7 +93,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP } @Override - public synchronized void updateDataForRead( final GATKSAMRecord read, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public synchronized void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); } From d7954372020086206fb226eb620031c7a5c71b9c Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 4 Sep 2012 16:41:44 -0400 Subject: [PATCH 107/161] - New UserExceptions added for when ReadFilters or Walkers specified on the command line are not found. When -rf xxxx cannot find the class corresponding to xxxx, all read filters are printed in a better formatted way, with links to their gatk docs. - VariantAnnotatorEngine changed to call genotype annotations even if pilups and allele -> likelihood mappings are not present. 
Current genotype annotations altered to check for null pilupes and null mappings. --- .../sting/gatk/filters/FilterManager.java | 30 ++++++++++---- .../annotator/AlleleBalanceBySample.java | 3 ++ .../annotator/DepthPerAlleleBySample.java | 2 +- .../annotator/MappingQualityZeroBySample.java | 2 +- .../annotator/VariantAnnotatorEngine.java | 14 +++---- .../utils/classloader/PluginManager.java | 14 ++++++- .../sting/utils/exceptions/UserException.java | 12 ++++++ .../InvalidArgumentIntegrationTest.java | 41 +++++++++++++++++++ 8 files changed, 97 insertions(+), 21 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index bddfa6a0d..5ca8a1779 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -29,6 +29,7 @@ import com.google.common.base.Function; import com.google.common.collect.Collections2; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.help.GATKDocUtils; import java.util.Collection; import java.util.List; @@ -68,16 +69,29 @@ public class FilterManager extends PluginManager { @Override protected String formatErrorMessage(String pluginCategory, String pluginName) { List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ - @Override - public String apply(final Class input) { - return getName(input); - } - }); - return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, - Utils.join(String.format(", "),availableFilterNames), + return String.format("Read filter %s not found. 
Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 0104f24d9..1e1f65333 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( stratifiedContext == null ) + return; + Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 85387f7cf..ee9b51b56 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 354b798bb..44657a7e7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || stratifiedContext == null ) return; int mq0 = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 22ec5468f..eae13e1b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -300,16 +300,12 @@ public class VariantAnnotatorEngine { if (stratifiedPerReadAlleleLikelihoodMap != null) perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( 
context == null && perReadAlleleLikelihoodMap == null) { - // no likelihoods nor pileup available: just move on to next sample - genotypes.add(genotype); - } else { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); - } - genotypes.add(gb.make()); + + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } + genotypes.add(gb.make()); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9f1b6db93..82fb6b8d6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.utils.classloader; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -276,8 +278,16 @@ public class PluginManager { */ public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) - throw new UserException(formatErrorMessage(pluginCategory,pluginName)); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + if ( this.getClass().isAssignableFrom(FilterManager.class) ) { + throw new 
UserException.MalformedReadFilterException(errorMessage); + } else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) { + throw new UserException.MalformedWalkerArgumentsException(errorMessage); + } else { + throw new UserException.CommandLineException(errorMessage); + } + } try { return plugin.newInstance(); } catch (Exception e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 3130469e5..47a2f2f1d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + 
+import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. + */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} From fc06f39411563691b405887cbb030fb8791ee4e9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Sep 2012 09:55:34 -0400 Subject: [PATCH 108/161] Fixed docs for Pileup walker --- .../broadinstitute/sting/gatk/walkers/Pileup.java | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 2a6ecdb8c..52c6e1560 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -45,19 +45,8 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in the pileup format. 
In the pileup format, each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping - * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all - * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, - * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the - * reverse strand. - * - * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next - * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. - * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. - * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read - * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. - * A symbol '$' marks the end of a read segment. + * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. 
* * Associated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] From 84a83fd3f3aa89b50463c230d5393bed0c4b8183 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 5 Sep 2012 10:41:03 -0400 Subject: [PATCH 109/161] fixing typo --- .../src/org/broadinstitute/sting/utils/clipping/ClippingOp.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 91414d8fe..98eb582e8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -18,7 +18,7 @@ import java.util.Vector; * of the read, plus an option extraInfo (useful for carrying info where needed). *

* Also holds the critical apply function that actually execute the clipping operation on a provided read, - * according to the wishes of the supplid ClippingAlgorithm enum. + * according to the wishes of the supplied ClippingAlgorithm enum. */ public class ClippingOp { public final int start, stop; // inclusive From 6e517df5d94141d3badc45f0ec0b7e65828fc158 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 14:33:08 -0400 Subject: [PATCH 111/161] fixed a typo in StringText.properties --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 845fc68a6..5009698e1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -380,7 +380,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! + if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do TODO : YOSSI Write something smart!! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! 
From ad5fa449e7e19c53875cbaaa2a21c78b360cecf8 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 14:46:10 -0400 Subject: [PATCH 112/161] fixed a typo in the string comment --- .../sting/gatk/walkers/indels/RealignerTargetCreator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index a52d57031..b14dc9cc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -57,7 +57,7 @@ import java.util.TreeSet; * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus * indel suitable for standard variant discovery approaches. 
Unlike most mappers, this walker uses the full alignment context to determine whether an @@ -69,7 +69,7 @@ import java.util.TreeSet; *

  • Running the realigner over those intervals (see the IndelRealigner tool)
  • * *

    - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. From d6884e705a06d734aed634f05a9e35026ab418b1 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 15:21:00 -0400 Subject: [PATCH 113/161] Revert "fixed a typo in StringText.properties" This reverts commit b74c1c17e748f75e59d23545084b983e2a8d2fa6. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 0537ca189..f4d8a88e0 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -380,7 +380,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do TODO : YOSSI Write something smart!! + if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! 
From e3b4cc02aa3d18a6f436093774356ceaffba6a46 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 07:26:01 -0400 Subject: [PATCH 114/161] Done GSA-282: Unindexed traversals crash if a read goes off the end of a contig -- Already fixed in the codebase. Added unindexed bam and integration tests to ensure this is fine going forward. --- .../walkers/PileupWalkerIntegrationTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 667b325ed..e16ef3125 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -16,13 +16,27 @@ public class PileupWalkerIntegrationTest extends WalkerTest { executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; @Test public void testSingleReadAligningOffChromosome1() { String gatk_args = "-T Pileup " + " -I " + privateTestDir + "readOffb37contig1.bam" + " -R " + b37KGReference + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); executeTest("Testing single read spanning off chromosome 1", spec); } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, 
Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } From 397a5551ef73e87971ba255c68a9f82b73d21490 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 19:54:33 -0400 Subject: [PATCH 115/161] More memory for gatkdocs and extracthelp targets --- build.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.xml b/build.xml index f681ddafa..0d1deba29 100644 --- a/build.xml +++ b/build.xml @@ -577,6 +577,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet"> @@ -780,6 +781,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> From 6055101df8965a3391a19fe686edb8ba85f10487 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 20:10:26 -0400 Subject: [PATCH 116/161] NanoScheduler no longer groups inputs, each map() call is interlaced now -- Maximizes the efficiency of the threads -- Simplifies interface (yea!) 
-- Reduces number of combinatorial tests that need to be performed --- .../gatk/traversals/TraverseReadsNano.java | 4 +- .../utils/nanoScheduler/NanoScheduler.java | 76 ++++++------------- .../nanoScheduler/NanoSchedulerUnitTest.java | 41 +++++----- 3 files changed, 43 insertions(+), 78 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index b397cb8c0..dbddeb092 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -55,13 +55,11 @@ public class TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - private static final int MIN_GROUP_SIZE = 100; final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + nanoScheduler = new NanoScheduler(bufferSize, nThreads); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 668c82524..5c6aa6a35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import 
org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -47,7 +46,6 @@ public class NanoScheduler { private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; - final int mapGroupSize; final int nThreads; final ExecutorService executor; boolean shutdown = false; @@ -57,29 +55,15 @@ public class NanoScheduler { * Create a new nanoschedule with the desire characteristics requested by the argument * * @param bufferSize the number of input elements to read in each scheduling cycle. - * @param mapGroupSize How many inputs should be grouped together per map? If -1 we make a reasonable guess * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute */ public NanoScheduler(final int bufferSize, - final int mapGroupSize, final int nThreads) { if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - if ( mapGroupSize > bufferSize ) throw new IllegalArgumentException("mapGroupSize " + mapGroupSize + " must be <= bufferSize " + bufferSize); - if ( mapGroupSize == 0 || mapGroupSize < -1 ) throw new IllegalArgumentException("mapGroupSize cannot be <= 0" + mapGroupSize); - this.bufferSize = bufferSize; this.nThreads = nThreads; - - if ( mapGroupSize == -1 ) { - this.mapGroupSize = (int)Math.ceil(this.bufferSize / (10.0*this.nThreads)); - logger.info(String.format("Dynamically setting grouping size to %d based on buffer size %d and n threads %d", - this.mapGroupSize, this.bufferSize, this.nThreads)); - } else { - this.mapGroupSize = mapGroupSize; - } - this.executor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads); } @@ -101,15 +85,6 @@ public class NanoScheduler { return bufferSize; } - /** - * The grouping size used by this NanoScheduler - * @return - */ - @Ensures("result > 0") - public int getMapGroupSize() { - return mapGroupSize; - } - /** * Tells this nanoScheduler to shutdown immediately, releasing all its resources. * @@ -214,10 +189,10 @@ public class NanoScheduler { final List inputs = readInputs(inputReader); // send jobs for map - final Queue>> mapQueue = submitMapJobs(map, executor, inputs); + final Queue> mapQueue = submitMapJobs(map, executor, inputs); // send off the reduce job, and block until we get at least one reduce result - sum = reduceParallel(reduce, mapQueue, sum); + sum = reduceSerial(reduce, mapQueue, sum); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -229,16 +204,16 @@ public class NanoScheduler { } @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceParallel(final ReduceFunction reduce, - final Queue>> mapQueue, - final ReduceType initSum) + private ReduceType reduceSerial(final ReduceFunction reduce, + final Queue> mapQueue, + final ReduceType initSum) throws InterruptedException, ExecutionException { ReduceType sum = initSum; // while mapQueue has something in it to reduce - for ( final Future> future : mapQueue ) { - for ( final MapType value : future.get() ) // block until we get the values for this task - sum = reduce.apply(value, sum); + for ( final Future future : mapQueue ) { + final MapType value = future.get(); // block until we get the values for this task + sum = reduce.apply(value, sum); } return sum; @@ -247,7 +222,7 @@ public class NanoScheduler { /** * Read up to inputBufferSize elements from inputReader * - * @return a queue of inputs read in, containing one or more values of InputType read in + * @return a queue of input read in, containing one or more values of 
InputType read in */ @Requires("inputReader.hasNext()") @Ensures("!result.isEmpty()") @@ -263,14 +238,14 @@ public class NanoScheduler { } @Requires({"map != null", "! inputs.isEmpty()"}) - private Queue>> submitMapJobs(final MapFunction map, - final ExecutorService executor, - final List inputs) { - final Queue>> mapQueue = new LinkedList>>(); + private Queue> submitMapJobs(final MapFunction map, + final ExecutorService executor, + final List inputs) { + final Queue> mapQueue = new LinkedList>(); - for ( final List subinputs : Utils.groupList(inputs, getMapGroupSize()) ) { - final CallableMap doMap = new CallableMap(map, subinputs); - final Future> future = executor.submit(doMap); + for ( final InputType input : inputs ) { + final CallableMap doMap = new CallableMap(map, input); + final Future future = executor.submit(doMap); mapQueue.add(future); } @@ -280,23 +255,18 @@ public class NanoScheduler { /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable> { - final List inputs; + private class CallableMap implements Callable { + final InputType input; final MapFunction map; - @Requires({"map != null", "inputs.size() <= getMapGroupSize()"}) - private CallableMap(final MapFunction map, final List inputs) { - this.inputs = inputs; + @Requires({"map != null"}) + private CallableMap(final MapFunction map, final InputType inputs) { + this.input = inputs; this.map = map; } - @Ensures("result.size() == inputs.size()") - @Override public List call() throws Exception { - final List outputs = new LinkedList(); - for ( final InputType input : inputs ) - outputs.add(map.apply(input)); - debugPrint(" Processed %d elements with map", outputs.size()); - return outputs; + @Override public MapType call() throws Exception { + return map.apply(input); } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 89506dcb1..1dcc243f2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -5,7 +5,10 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; /** * UnitTests for the NanoScheduler @@ -39,18 +42,17 @@ public class NanoSchedulerUnitTest extends BaseTest { } private static class NanoSchedulerBasicTest extends TestDataProvider { - final int bufferSize, mapGroupSize, nThreads, start, end, expectedResult; + final int bufferSize, nThreads, start, end, expectedResult; - public NanoSchedulerBasicTest(final int bufferSize, final int mapGroupSize, final int nThreads, final int start, final int end) { + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { super(NanoSchedulerBasicTest.class); this.bufferSize = bufferSize; - this.mapGroupSize = mapGroupSize; this.nThreads = nThreads; this.start = start; this.end = end; this.expectedResult = sum2x(start, end); - setName(String.format("%s nt=%d buf=%d mapGroupSize=%d start=%d end=%d sum=%d", - getClass().getSimpleName(), nThreads, bufferSize, mapGroupSize, start, end, expectedResult)); + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); } public Iterator makeReader() { @@ -69,14 +71,10 @@ public class NanoSchedulerUnitTest extends BaseTest { @DataProvider(name = "NanoSchedulerBasicTest") public Object[][] createNanoSchedulerBasicTest() { for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { - for ( final int mapGroupSize : Arrays.asList(-1, 1, 10, 100, 
1000) ) { - if ( mapGroupSize <= bufferSize ) { - for ( final int nt : Arrays.asList(1, 2, 4) ) { - for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { - exampleTest = new NanoSchedulerBasicTest(bufferSize, mapGroupSize, nt, start, end); - } - } + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { + exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); } } } @@ -101,10 +99,9 @@ public class NanoSchedulerUnitTest extends BaseTest { private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); - Assert.assertTrue(nanoScheduler.getMapGroupSize() >= test.mapGroupSize, "mapGroupSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); @@ -115,11 +112,11 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { - if ( test.bufferSize > 1 && (test.mapGroupSize > 1 || test.mapGroupSize == -1)) { + if ( test.bufferSize > 1) { logger.warn("Running " + test); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); // test reusing the scheduler for ( int i = 0; i < 10; i++ ) { @@ -134,7 +131,7 @@ public 
class NanoSchedulerUnitTest extends BaseTest { @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); nanoScheduler.shutdown(); Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); @@ -142,15 +139,15 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } public static void main(String [ ] args) { - final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, 100, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); From e01258b2615609e925f2deb3dd886bae6b08402a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 11:51:31 -0400 Subject: [PATCH 117/161] NanoScheduler now supports printProgress. 
Bugfixes to printProgress -- TraverseReadsNano prints progress at the end of each traversal unit -- Fix bugs in TraversalEngine printProgress -- Synchronize the method so we don't get multiple logged outputs when two or more HMSs call printProgress before initialization at the start! -- Fix the logic for mustPrint, which actually had the logic of mustNotPrint. Now we see the done log line that was always supposed to be there -- Fix output formatting, as the done() line was incorrectly shifting over the % complete by 1 char as 100.0% didn't fit in %4.1f -- Add clearer doc on -PF argument so that people know that the performance log can be generated to standard out if one wants --- .../arguments/GATKArgumentCollection.java | 10 +++++- .../gatk/traversals/TraversalEngine.java | 33 +++++++++++++++---- .../gatk/traversals/TraverseReadsNano.java | 18 +++++++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 72cb5e02f..6be66b204 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** * @author aaron @@ -197,6 +199,12 @@ public class GATKArgumentCollection { // performance log arguments // // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. 
This table is suitable for importing into R or any + * other analysis software that can read tsv files + */ @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) public File performanceLog = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index abc71e549..198f9342e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -189,12 +189,26 @@ public abstract class TraversalEngine,Provide /** * Forward request to printProgress * + * Assumes that one cycle has been completed + * * @param shard the given shard currently being processed. * @param loc the location */ public void printProgress(Shard shard, GenomeLoc loc) { // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false); + printProgress(loc,shard.getReadMetrics(),false, 1); + } + + /** + * Forward request to printProgress + * + * @param shard the given shard currently being processed. + * @param loc the location + * @param nElapsedCycles the number of cycles (turns of map) that have occurred since the last call + */ + public void printProgress(Shard shard, GenomeLoc loc, int nElapsedCycles) { + // A bypass is inserted here for unit testing. 
+ printProgress(loc,shard.getReadMetrics(),false, nElapsedCycles); } /** @@ -205,12 +219,16 @@ public abstract class TraversalEngine,Provide * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ - private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) { - if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 ) + private synchronized void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint, int nElapsedCycles) { + final int previousPrintCycle = printProgressCheckCounter / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; + final int newPrintCycle = (printProgressCheckCounter+nElapsedCycles) / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; + + printProgressCheckCounter += nElapsedCycles; // keep track of our number of cycles through printProgress + if ( newPrintCycle == previousPrintCycle && ! mustPrint ) // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES return; - if(!progressMeterInitialized && mustPrint == false ) { + if( ! progressMeterInitialized ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", getTraversalType(), getTraversalType())); @@ -250,8 +268,9 @@ public abstract class TraversalEngine,Provide else PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", - loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, + final String posName = loc == null ? (mustPrint ? 
"done" : "unmapped reads") : Integer.toString(loc.getStart()); + logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", + posName, nRecords*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -309,7 +328,7 @@ public abstract class TraversalEngine,Provide * Called after a traversal to print out information about the traversal process */ public void printOnTraversalDone() { - printProgress(null, null, true); + printProgress(null, null, true, 1); final double elapsed = timer == null ? 0 : timer.getElapsedTime(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index dbddeb092..2ada8bbfa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; @@ -87,9 +88,15 @@ public class TraverseReadsNano extends TraversalEngine, final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); - // TODO -- how do we print progress? 
- //printProgress(dataProvider.getShard(), ???); + final List aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); + + final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; + final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); + printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + // TODO -- how can I get done value? + // done = walker.isDone(); return result; } @@ -165,8 +172,9 @@ public class TraverseReadsNano extends TraversalEngine, return walker.map(data.refContext, data.read, data.tracker); } } - - return null; // TODO -- what should we return in the case where the walker is done or the read is filtered? + // TODO -- how can we cleanly support done and filtered. Need to return + // TODO -- a MapResult object that says the status + return null; } } } From 7087b22ea397c96a78a9dbc2bc98558d80343cea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 12:28:54 -0400 Subject: [PATCH 118/161] No debugging output (even conditional) for ReadTransformers in PrintReads --- .../src/org/broadinstitute/sting/gatk/walkers/PrintReads.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index a5d4b45b6..4118617fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -228,7 +228,6 @@ public class PrintReads extends ReadWalker impleme GATKSAMRecord workingRead = readIn; for ( final ReadTransformer transformer : readTransformers ) { - if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); workingRead = transformer.apply(workingRead); } From 800a27c3a701bef87bd8210b0dddf080c1555068 
Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 12:29:59 -0400 Subject: [PATCH 119/161] NanoScheduler tracks time within input, map, and reduce -- Helpful for understanding where the time goes to each bit of the code. -- Controlled by a local static boolean, to avoid the potential overhead in general --- .../utils/nanoScheduler/NanoScheduler.java | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 5c6aa6a35..39b541944 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -44,6 +46,7 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean TIME_CALLS = true; final int bufferSize; final int nThreads; @@ -51,6 +54,10 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; + final SimpleTimer inputTimer = new SimpleTimer(); + final SimpleTimer mapTimer = new SimpleTimer(); + final SimpleTimer reduceTimer = new SimpleTimer(); + /** * Create a new nanoschedule with the desire characteristics requested by the argument * @@ -97,6 +104,19 @@ public class NanoScheduler { throw new IllegalStateException("Remaining tasks found in the executor, 
unexpected behavior!"); } shutdown = true; + + if (TIME_CALLS) { + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + } + } + + private void printTimerInfo(final String label, final SimpleTimer timer) { + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime(); + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / total * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); } /** @@ -134,10 +154,10 @@ public class NanoScheduler { * It is safe to call this function repeatedly on a single nanoScheduler, at least until the * shutdown method is called. * - * @param inputReader - * @param map - * @param reduce - * @return + * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over + * @param map the map function from input type -> map type, will be applied in parallel to each input + * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results + * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, final MapFunction map, @@ -213,7 +233,10 @@ public class NanoScheduler { // while mapQueue has something in it to reduce for ( final Future future : mapQueue ) { final MapType value = future.get(); // block until we get the values for this task + + if ( TIME_CALLS) reduceTimer.restart(); sum = reduce.apply(value, sum); + if ( TIME_CALLS) reduceTimer.stop(); } return sum; @@ -229,11 +252,15 @@ public class NanoScheduler { private List readInputs(final Iterator inputReader) { int n = 0; final List inputs = new LinkedList(); + + if ( TIME_CALLS) inputTimer.restart(); while ( inputReader.hasNext() && n < getBufferSize() ) { final InputType input = inputReader.next(); inputs.add(input); n++; } + if ( TIME_CALLS) 
inputTimer.stop(); + return inputs; } @@ -266,7 +293,10 @@ public class NanoScheduler { } @Override public MapType call() throws Exception { - return map.apply(input); + if ( TIME_CALLS) mapTimer.restart(); + final MapType result = map.apply(input); + if ( TIME_CALLS) mapTimer.stop(); + return result; } } } From 59109d5eeb8798bb2c6eabf4b987837fc693b951 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 10:54:18 -0400 Subject: [PATCH 120/161] NanoScheduler tracks time outside of its execute call --- .../utils/nanoScheduler/NanoScheduler.java | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 39b541944..a6be6ad6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,6 +54,7 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; + final SimpleTimer outsideSchedulerTimer = new SimpleTimer(); final SimpleTimer inputTimer = new SimpleTimer(); final SimpleTimer mapTimer = new SimpleTimer(); final SimpleTimer reduceTimer = new SimpleTimer(); @@ -72,6 +73,9 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); + + // start timing the time spent outside of the nanoScheduler + outsideSchedulerTimer.start(); } /** @@ -98,6 +102,8 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { + outsideSchedulerTimer.stop(); + if ( executor != null ) { final List remaining = executor.shutdownNow(); if ( ! 
remaining.isEmpty() ) @@ -106,14 +112,16 @@ public class NanoScheduler { shutdown = true; if (TIME_CALLS) { - printTimerInfo("Input time", inputTimer); - printTimerInfo("Map time", mapTimer); - printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Outside time", outsideSchedulerTimer); } } private void printTimerInfo(final String label, final SimpleTimer timer) { - final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime(); + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); final double myTimeInSec = timer.getElapsedTime(); final double myTimePercent = myTimeInSec / total * 100; logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); @@ -168,11 +176,16 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); + outsideSchedulerTimer.stop(); + ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { - return executeSingleThreaded(inputReader, map, initialValue, reduce); + result = executeSingleThreaded(inputReader, map, initialValue, reduce); } else { - return executeMultiThreaded(inputReader, map, initialValue, reduce); + result = executeMultiThreaded(inputReader, map, initialValue, reduce); } + + outsideSchedulerTimer.restart(); + return result; } /** From 6a5a70cdf1a80751d1fe54594c0d0d2ee6a3fa87 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 11:37:30 -0400 Subject: [PATCH 121/161] Done GSA-539: SimpleTimer should use System.nanoTime for nanoSecond resolution --- .../sting/utils/SimpleTimer.java | 89 ++++++++++++++----- 
.../sting/utils/SimpleTimerUnitTest.java | 63 ++++++++++++- 2 files changed, 128 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java index 15d34a348..b3a9986c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java @@ -1,18 +1,42 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.concurrent.TimeUnit; + /** - * A useful simple system for timing code. This code is not thread safe! + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. * * User: depristo * Date: Dec 10, 2010 * Time: 9:07:44 AM */ public class SimpleTimer { - final private String name; - private long elapsed = 0l; - private long startTime = 0l; - boolean running = false; + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private final String name; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the + * sum of times between starts/restrats and stops. 
+ */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; /** * Creates an anonymous simple timer @@ -25,7 +49,8 @@ public class SimpleTimer { * Creates a simple timer named name * @param name of the timer, must not be null */ - public SimpleTimer(String name) { + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); this.name = name; } @@ -37,27 +62,27 @@ public class SimpleTimer { } /** - * Starts the timer running, and sets the elapsed time to 0. This is equivalent to + * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to * resetting the time to have no history at all. * * @return this object, for programming convenience */ + @Ensures("elapsedTimeNano == 0l") public synchronized SimpleTimer start() { - elapsed = 0l; - restart(); - return this; + elapsedTimeNano = 0l; + return restart(); } /** - * Starts the timer running, without reseting the elapsed time. This function may be + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be * called without first calling start(). The only difference between start and restart - * is that start resets the elapsed time, while restart does not. + * is that start resets the elapsedTimeNano time, while restart does not. 
* * @return this object, for programming convenience */ public synchronized SimpleTimer restart() { running = true; - startTime = currentTime(); + startTimeNano = currentTimeNano(); return this; } @@ -71,29 +96,53 @@ public class SimpleTimer { /** * @return A convenience function to obtain the current time in milliseconds from this timer */ - public synchronized long currentTime() { + public long currentTime() { return System.currentTimeMillis(); } /** - * Stops the timer. Increases the elapsed time by difference between start and now. The - * timer must be running in order to call stop + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. * * @return this object, for programming convenience */ + @Requires("startTimeNano != 0l") public synchronized SimpleTimer stop() { - running = false; - elapsed += currentTime() - startTime; + if ( running ) { + running = false; + elapsedTimeNano += currentTimeNano() - startTimeNano; + } return this; } /** - * Returns the total elapsed time of all start/stops of this timer. If the timer is currently + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently * running, includes the difference from currentTime() and the start as well * * @return this time, in seconds */ public synchronized double getElapsedTime() { - return (running ? 
(currentTime() - startTime + elapsed) : elapsed) / 1000.0; + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 7a2696b7b..7285c00ac 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; public class SimpleTimerUnitTest extends BaseTest { private final static String NAME = "unit.test.timer"; @@ -17,33 +17,88 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); t.start(); Assert.assertTrue(t.isRunning(), "Started timer isn't running"); Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 
= t.getElapsedTimeNano(); double t1 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time + long n2 = t.getElapsedTimeNano(); double t2 = t.getElapsedTime(); Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); t.stop(); Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); double t3 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); t.restart(); idleLoop(); // idle loop to wait a tiny bit of time double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); idleLoop(); // idle loop to wait a tiny bit of time double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); } - private final static void idleLoop() { + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = 
t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + } + + @Test + public void testMeaningfulTimes() { + SimpleTimer t = new SimpleTimer(NAME); + + t.start(); + for ( int i = 0; i < 100; i++ ) ; + long nano = t.getElapsedTimeNano(); + double secs = t.getElapsedTime(); + + Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); + Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); + + Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); + final long maxTimeInMicro = 100; + final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(100); + Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); + } + + private static void idleLoop() { for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time } } \ No newline at end of file From 1a8f5fc374994b06f16d2a6cc987a2720d42b144 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 11:53:59 -0400 Subject: [PATCH 122/161] Trivial cleanup of NanoScheduler --- .../sting/utils/nanoScheduler/NanoScheduler.java | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index a6be6ad6d..1ef4d3950 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,10 +54,10 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; - final SimpleTimer outsideSchedulerTimer = new SimpleTimer(); - final SimpleTimer inputTimer = new SimpleTimer(); - final SimpleTimer mapTimer = new SimpleTimer(); - final SimpleTimer reduceTimer = new SimpleTimer(); + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); + final SimpleTimer inputTimer = new SimpleTimer("input"); + final SimpleTimer mapTimer = new SimpleTimer("map"); + final SimpleTimer reduceTimer = new SimpleTimer("reduce"); /** * Create a new nanoschedule with the desire characteristics requested by the argument From 9823102c0cceea72beb0db689631a1ebeade9978 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 12:16:56 -0400 Subject: [PATCH 123/161] TraverseReadsNano supports walker.filter and walker.done -- Instead of returning directly the result of map(), returns a MapResult object with the value and a reduceMe flag. 
-- Reduce function respects the reduceMe flag -- Code cleanup and more documentation --- .../gatk/traversals/TraverseReadsNano.java | 120 +++++++++++++----- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 2ada8bbfa..4bb700c37 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -40,27 +40,28 @@ import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; /** - * @author aaron + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo * @version 1.0 - * @date Apr 24, 2009 - *

    - * Class TraverseReads - *

    - * This class handles traversing by reads in the new shardable style + * @date 9/2/2012 */ public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - nanoScheduler = new NanoScheduler(bufferSize, nThreads); + nanoScheduler = new NanoScheduler(bufferSize, nThreads); } @Override @@ -95,18 +96,23 @@ public class TraverseReadsNano extends TraversalEngine, final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); - // TODO -- how can I get done value? - // done = walker.isDone(); - return result; } + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ private List aggregateMapData(final ReadShardDataProvider dataProvider) { final ReadView reads = new ReadView(dataProvider); final ReadReferenceView reference = new ReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final List mapData = new ArrayList(); // TODO -- need size of reads + final List mapData = new LinkedList(); for ( final SAMRecord read : reads ) { final ReferenceContext refContext = ! read.getReadUnmappedFlag() ? 
reference.getReferenceContext(read) @@ -132,19 +138,9 @@ public class TraverseReadsNano extends TraversalEngine, super.printOnTraversalDone(); } - private class TraverseReadsReduce implements ReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(M one, T sum) { - return walker.reduce(one, sum); - } - } - + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ private class MapData { final GATKSAMRecord read; final ReferenceContext refContext; @@ -157,7 +153,43 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -165,16 +197,36 @@ public class TraverseReadsNano extends TraversalEngine, } @Override - public M apply(final MapData data) { + public MapResult apply(final MapData data) { if ( ! 
walker.isDone() ) { final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) { - return walker.map(data.refContext, data.read, data.tracker); - } + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); } - // TODO -- how can we cleanly support done and filtered. Need to return - // TODO -- a MapResult object that says the status - return null; + + return SKIP_REDUCE; + } + } + + /** + * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements ReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; } } } From d7105223fe7d8bb4848dbc2cfe7ccfbb9709b4b6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:44:39 -0400 Subject: [PATCH 124/161] More debugging output for NanoScheduler when debugging is enabled --- .../sting/utils/nanoScheduler/NanoScheduler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 1ef4d3950..f0e77354f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -220,12 +220,14 @@ public class NanoScheduler { try { // read in our input values final List inputs = readInputs(inputReader); + debugPrint("Enqueuing " + inputs.size() + " elements to map"); // send jobs for map final Queue> mapQueue = submitMapJobs(map, executor, inputs); // send 
off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); + debugPrint(" Done with cycle of map/reduce"); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -307,6 +309,7 @@ public class NanoScheduler { @Override public MapType call() throws Exception { if ( TIME_CALLS) mapTimer.restart(); + if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS) mapTimer.stop(); return result; From 757e6a016081205e3c78c71ed184c982d63910f6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:45:57 -0400 Subject: [PATCH 125/161] Making Pileup thread-safe -- Old version relied on out printstream magically sorting output, new version puts the print in reduce --- .../sting/gatk/walkers/Pileup.java | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 52c6e1560..607c83966 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -52,7 +52,7 @@ import java.util.List; * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names @@ -70,27 +70,32 @@ public class Pileup extends LocusWalker implements TreeReducib @Input(fullName="metadata",shortName="metadata",doc="Add 
these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); - public void initialize() { - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - String rods = getReferenceOrderedData( tracker ); + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); - if ( SHOW_VERBOSE ) - out.printf(" %s", createVerboseOutput(basePileup)); - out.println(); - return 1; + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); } // Given result of map function + @Override public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return treeReduce(sum,value); + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; } + + @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } From d503ed97abd7a4990e3412aa8a934ff0761e847b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:47:40 -0400 Subject: [PATCH 126/161] Mark I NanoScheduling TraverseLoci -- Refactored TraverseLoci into old linear version and nano scheduling version -- Temp. 
GATK argument to say how many nano threads to use -- Can efficiently scale to 3 threads before blocking on input --- .../sting/gatk/ReadMetrics.java | 10 +- .../arguments/GATKArgumentCollection.java | 4 + .../sting/gatk/executive/MicroScheduler.java | 3 +- ...raverseLoci.java => TraverseLociBase.java} | 70 +++--- .../gatk/traversals/TraverseLociLinear.java | 48 +++++ .../gatk/traversals/TraverseLociNano.java | 200 ++++++++++++++++++ 6 files changed, 293 insertions(+), 42 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/traversals/{TraverseLoci.java => TraverseLociBase.java} (57%) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java index ceaa30f01..bfea0b1e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; @@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable { return nRecords; } + /** + * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. + */ + public void incrementNumIterations(final long by) { + nRecords += by; + } + /** * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. 
*/ public void incrementNumIterations() { - nRecords++; + incrementNumIterations(1); } public long getNumReadsSeen() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 6be66b204..33400bd9e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -313,6 +313,10 @@ public class GATKArgumentCollection { @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) public Integer numberOfBAMFileHandles = null; + @Argument(fullName="nanoThreads", shortName = "nanoThreads", doc="NanoThreading", required = false) + @Hidden + public int nanoThreads = 1; + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 417a0982f..073a46ee3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -146,7 +146,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if (walker instanceof ReadWalker) { traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); } else if (walker instanceof LocusWalker) { - traversalEngine = new TraverseLoci(); + // TODO -- refactor to use better interface + traversalEngine = engine.getArguments().nanoThreads > 1 ? 
new TraverseLociNano(engine.getArguments().nanoThreads) : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java similarity index 57% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java index a5a6919a2..19d95381e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java @@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -15,28 +13,42 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; /** * A simple solution to iterating over all reference positions over a series of genomic locations. 
*/ -public class TraverseLoci extends TraversalEngine,LocusShardDataProvider> { +public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { /** * our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); @Override - protected String getTraversalType() { + protected final String getTraversalType() { return "sites"; } + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + protected abstract TraverseResults traverse( final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum); + @Override public T traverse( LocusWalker walker, LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - LocusView locusView = getLocusView( walker, dataProvider ); - boolean done = false; + final LocusView locusView = getLocusView( walker, dataProvider ); if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); ReferenceOrderedView referenceOrderedDataView = null; if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) @@ -44,43 +56,23 @@ public class TraverseLoci extends TraversalEngine,Locu else referenceOrderedDataView = (RodLocusView)locusView; - LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - // We keep processing while 
the next reference location is within the interval - while( locusView.hasNext() && ! done ) { - AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(dataProvider.getShard(),locus.getLocation()); - } + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); } // We have a final map call to execute here to clean up the skipped based from the // last position in the ROD to that in the interval if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { // only do this if the walker isn't done! 
- RodLocusView rodLocusView = (RodLocusView)locusView; - long nSkipped = rodLocusView.getLastSkippedBases(); + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); if ( nSkipped > 0 ) { - GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - M x = walker.map(null, null, ac); + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); sum = walker.reduce(x, sum); } } @@ -90,14 +82,14 @@ public class TraverseLoci extends TraversalEngine,Locu /** * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype * that comes along. * @param walker walker to interrogate. * @param dataProvider Data which which to drive the locus view. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. */ private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! 
GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java new file mode 100755 index 000000000..1dec3b238 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociLinear extends TraverseLociBase { + + @Override + protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { + // We keep processing while the next reference location is within the interval + boolean done = false; + int numIterations = 0; + + while( locusView.hasNext() && ! done ) { + numIterations++; + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
+ final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + final boolean keepMeP = walker.filter(tracker, refContext, locus); + if (keepMeP) { + final M x = walker.map(tracker, refContext, locus); + sum = walker.reduce(x, sum); + done = walker.isDone(); + } + + // TODO -- refactor printProgress to separate updating read metrics from printing progress + //printProgress(dataProvider.getShard(),locus.getLocation()); + } + + return new TraverseResults(numIterations, sum); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java new file mode 100755 index 000000000..4e6eb1915 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. 
+ */ +public class TraverseLociNano extends TraverseLociBase { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + private static final int BUFFER_SIZE = 1000; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); + } + + @Override + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + // todo -- how do I print progress? +// final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; +// final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); +// printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext(); + } + + 
@Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements MapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements ReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } +} From 8cdeb51b78696340d9303d44342095bb82a40671 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 14:50:06 -0400 Subject: [PATCH 127/161] Cleanup printProgress in TraversalEngine -- Separate updating cumulative traversal metrics from printing progress. There's now an updateCumulativeMetrics function and a printProgress() that only takes a current position -- printProgress now soles relies on the time since the last progress to decide if it will print or not. No longer uses the number of cycles, since this isn't reliable in the case of nano scheduling -- GenomeAnalysisEngine now maintains a pointer to the master cumulative metrics. getCumulativeMetrics never returns null, which was handled in some parts of the code but not others. -- Update all of the traversals to use the new updateCumulativeMetrics, printProgress model -- Added progress callback to nano scheduler. Every bufferSize elements this callback is invoked, allowing us to smoothly update the progress meter in the NanoScheduler -- Rename MapFunction to NanoSchedulerMap and the same for reduce. 
--- .../sting/gatk/GenomeAnalysisEngine.java | 7 +- .../gatk/traversals/TraversalEngine.java | 152 ++++++++---------- .../traversals/TraverseActiveRegions.java | 3 +- .../gatk/traversals/TraverseDuplicates.java | 3 +- .../gatk/traversals/TraverseLociBase.java | 1 + .../gatk/traversals/TraverseLociLinear.java | 3 +- .../gatk/traversals/TraverseLociNano.java | 25 +-- .../gatk/traversals/TraverseReadPairs.java | 3 +- .../sting/gatk/traversals/TraverseReads.java | 7 +- .../gatk/traversals/TraverseReadsNano.java | 14 +- .../utils/nanoScheduler/NanoScheduler.java | 35 ++-- ...ion.java => NanoSchedulerMapFunction.java} | 2 +- .../NanoSchedulerProgressFunction.java | 12 ++ ....java => NanoSchedulerReduceFunction.java} | 2 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 4 +- 15 files changed, 153 insertions(+), 120 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{MapFunction.java => NanoSchedulerMapFunction.java} (84%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{ReduceFunction.java => NanoSchedulerReduceFunction.java} (87%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b9b5e452d..1b4333ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -143,6 +143,8 @@ public class GenomeAnalysisEngine { */ private ThreadAllocation threadAllocation; + private ReadMetrics cumulativeMetrics = null; + /** * A currently hacky unique name for this GATK instance */ @@ -1035,7 +1037,10 @@ public class GenomeAnalysisEngine { * owned by the caller; the caller can do with the object what they wish. */ public ReadMetrics getCumulativeMetrics() { - return readsDataSource == null ? 
null : readsDataSource.getCumulativeReadMetrics(); + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 198f9342e..4422d49ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -44,24 +44,12 @@ import java.util.List; import java.util.Map; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraversalEngine.class); + // Time in milliseconds since we initialized this engine private static final int HISTORY_WINDOW_SIZE = 50; - private static class ProcessingHistory { - double elapsedSeconds; - long unitsProcessed; - long bpProcessed; - GenomeLoc loc; - - public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.loc = loc; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - } - /** lock object to sure updates to history are consistent across threads */ private static final Object lock = new Object(); LinkedList history = new LinkedList(); @@ -70,13 +58,12 @@ public abstract class TraversalEngine,Provide private SimpleTimer timer = null; // How long can we go without printing some progress info? - private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; - private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? 
- private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds - private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; - private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + + private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds + private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; + private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + private long progressPrintFrequency = 10 * 1000; // in milliseconds private boolean progressMeterInitialized = false; // for performance log @@ -85,15 +72,12 @@ public abstract class TraversalEngine,Provide private File performanceLogFile; private PrintStream performanceLog = null; private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log? - private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds + private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds /** Size, in bp, of the area we are processing. Updated once in the system in initial for performance reasons */ long targetSize = -1; GenomeLocSortedSet targetIntervals = null; - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - protected GenomeAnalysisEngine engine; // ---------------------------------------------------------------------------------------------------- @@ -187,28 +171,34 @@ public abstract class TraversalEngine,Provide } /** - * Forward request to printProgress + * Update the cumulative traversal metrics according to the data in this shard * - * Assumes that one cycle has been completed - * - * @param shard the given shard currently being processed. 
- * @param loc the location + * @param shard a non-null shard */ - public void printProgress(Shard shard, GenomeLoc loc) { - // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false, 1); + public void updateCumulativeMetrics(final Shard shard) { + updateCumulativeMetrics(shard.getReadMetrics()); + } + + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param singleTraverseMetrics read metrics object containing the information about a single shard's worth + * of data processing + */ + public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) { + engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics); } /** * Forward request to printProgress * - * @param shard the given shard currently being processed. + * Assumes that one cycle has been completed + * * @param loc the location - * @param nElapsedCycles the number of cycles (turns of map) that have occurred since the last call */ - public void printProgress(Shard shard, GenomeLoc loc, int nElapsedCycles) { + public void printProgress(final GenomeLoc loc) { // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false, nElapsedCycles); + printProgress(loc, false); } /** @@ -216,18 +206,9 @@ public abstract class TraversalEngine,Provide * every M seconds, for N and M set in global variables. 
* * @param loc Current location, can be null if you are at the end of the traversal - * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ - private synchronized void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint, int nElapsedCycles) { - final int previousPrintCycle = printProgressCheckCounter / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; - final int newPrintCycle = (printProgressCheckCounter+nElapsedCycles) / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; - - printProgressCheckCounter += nElapsedCycles; // keep track of our number of cycles through printProgress - if ( newPrintCycle == previousPrintCycle && ! mustPrint ) - // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES - return; - + private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { if( ! progressMeterInitialized ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", @@ -236,37 +217,30 @@ public abstract class TraversalEngine,Provide } final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); + boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - // getting and appending metrics data actually turns out to be quite a heavyweight - // operation. Postpone it until after determining whether to print the log message. - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? 
engine.getCumulativeMetrics() : new ReadMetrics(); - if(metrics != null) - cumulativeMetrics.incrementMetrics(metrics); - - final long nRecords = cumulativeMetrics.getNumIterations(); - - ProcessingHistory last = updateHistory(loc,cumulativeMetrics); + final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics()); final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds); - final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last)); - final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last)); - final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); + final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP()); + final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements()); + final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize); final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); + final long nRecords = engine.getCumulativeMetrics().getNumIterations(); if ( printProgress ) { lastProgressPrintTime = curTime; // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds + progressPrintFrequency = 60 * 1000; // in milliseconds else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds + progressPrintFrequency = 30 * 1000; // in milliseconds else - PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + progressPrintFrequency = 10 * 1000; // in milliseconds final 
String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : Integer.toString(loc.getStart()); logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", @@ -296,7 +270,7 @@ public abstract class TraversalEngine,Provide * @param metrics information about what's been processed already * @return */ - private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { + private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { synchronized (lock) { if ( history.size() > HISTORY_WINDOW_SIZE ) history.pop(); @@ -309,26 +283,11 @@ public abstract class TraversalEngine,Provide } } - /** How long in seconds to process 1M traversal units? */ - private final double secondsPerMillionElements(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - private final double secondsPerMillionBP(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1); - } - - /** What fractoin of the target intervals have we covered? */ - private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) { - return (1.0*last.bpProcessed) / targetSize; - } - /** * Called after a traversal to print out information about the traversal process */ public void printOnTraversalDone() { - printProgress(null, null, true, 1); + printProgress(null, true); final double elapsed = timer == null ? 0 : timer.getElapsedTime(); @@ -389,7 +348,7 @@ public abstract class TraversalEngine,Provide * @return Frequency, in seconds, of performance log writes. */ public long getPerformanceProgressPrintFrequencySeconds() { - return PROGRESS_PRINT_FREQUENCY; + return progressPrintFrequency; } /** @@ -397,6 +356,35 @@ public abstract class TraversalEngine,Provide * @param seconds number of seconds between messages indicating performance frequency. 
*/ public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - PROGRESS_PRINT_FREQUENCY = seconds; + progressPrintFrequency = seconds; + } + + private static class ProcessingHistory { + double elapsedSeconds; + long unitsProcessed; + long bpProcessed; + GenomeLoc loc; + + public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.loc = loc; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + /** How long in seconds to process 1M traversal units? */ + private double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + private double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fractoin of the target intervals have we covered? */ + private double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / targetSize; + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index ecaa15fe9..bbd9346b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -104,7 +104,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); sum = result.reduceResult; dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); } // We have a final map call to execute here to clean up the skipped based from the diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java index 1dec3b238..22381092f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -39,8 +39,7 @@ public class TraverseLociLinear extends TraverseLociBase { done = walker.isDone(); } - // TODO -- refactor printProgress to separate updating read metrics from printing progress - //printProgress(dataProvider.getShard(),locus.getLocation()); + printProgress(locus.getLocation()); } return new TraverseResults(numIterations, sum); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 4e6eb1915..73b73c002 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -8,9 +8,10 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import java.util.Iterator; @@ -26,6 +27,7 @@ public class TraverseLociNano extends TraverseLociBase { public TraverseLociNano(int nThreads) { nanoScheduler = new 
NanoScheduler(BUFFER_SIZE, nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); } @Override @@ -41,11 +43,6 @@ public class TraverseLociNano extends TraverseLociBase { final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); - // todo -- how do I print progress? -// final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; -// final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); -// printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); - return new TraverseResults(inputIterator.numIterations, result); } @@ -156,7 +153,7 @@ public class TraverseLociNano extends TraverseLociBase { * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseLociMap implements MapFunction { + private class TraverseLociMap implements NanoSchedulerMapFunction { final LocusWalker walker; private TraverseLociMap(LocusWalker walker) { @@ -177,11 +174,11 @@ public class TraverseLociNano extends TraverseLociBase { } /** - * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseLociReduce implements ReduceFunction { + private class TraverseLociReduce implements NanoSchedulerReduceFunction { final LocusWalker walker; private TraverseLociReduce(LocusWalker walker) { @@ -197,4 +194,12 @@ public class TraverseLociNano extends TraverseLociBase { return sum; } } + + private class TraverseLociProgress implements NanoSchedulerProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + 
printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index ebaac40af..9b076fce4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -65,7 +65,8 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read sum = walker.reduce(x, sum); } - GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - printProgress(dataProvider.getShard(),locus); + final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); + done = walker.isDone(); } return sum; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 4bb700c37..5679747e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -35,9 +35,9 @@ import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; +import 
org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.LinkedList; @@ -94,7 +94,9 @@ public class TraverseReadsNano extends TraversalEngine, final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); - printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); return result; } @@ -189,7 +191,7 @@ public class TraverseReadsNano extends TraversalEngine, * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseReadsMap implements MapFunction { + private class TraverseReadsMap implements NanoSchedulerMapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -209,11 +211,11 @@ public class TraverseReadsNano extends TraversalEngine, } /** - * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseReadsReduce implements ReduceFunction { + private class TraverseReadsReduce implements NanoSchedulerReduceFunction { final ReadWalker walker; private TraverseReadsReduce(ReadWalker walker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index f0e77354f..f0c2a6723 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,6 +54,8 @@ 
public class NanoScheduler { boolean shutdown = false; boolean debug = false; + private NanoSchedulerProgressFunction progressFunction = null; + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); final SimpleTimer inputTimer = new SimpleTimer("input"); final SimpleTimer mapTimer = new SimpleTimer("map"); @@ -148,6 +150,17 @@ public class NanoScheduler { this.debug = debug; } + /** + * Set the progress callback function to progressFunction + * + * The progress callback is invoked after each buffer size elements have been processed by map/reduce + * + * @param progressFunction a progress function to call, or null if you don't want any progress callback + */ + public void setProgressFunction(final NanoSchedulerProgressFunction progressFunction) { + this.progressFunction = progressFunction; + } + /** * Execute a map/reduce job with this nanoScheduler * @@ -168,9 +181,9 @@ public class NanoScheduler { * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); @@ -193,9 +206,9 @@ public class NanoScheduler { * @return the reduce result of this map/reduce job */ private ReduceType executeSingleThreaded(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); @@ -211,9 +224,9 @@ public class NanoScheduler { * @return the reduce 
result of this map/reduce job */ private ReduceType executeMultiThreaded(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); ReduceType sum = initialValue; while ( inputReader.hasNext() ) { @@ -228,6 +241,8 @@ public class NanoScheduler { // send off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); debugPrint(" Done with cycle of map/reduce"); + + if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -239,7 +254,7 @@ public class NanoScheduler { } @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceSerial(final ReduceFunction reduce, + private ReduceType reduceSerial(final NanoSchedulerReduceFunction reduce, final Queue> mapQueue, final ReduceType initSum) throws InterruptedException, ExecutionException { @@ -280,7 +295,7 @@ public class NanoScheduler { } @Requires({"map != null", "! 
inputs.isEmpty()"}) - private Queue> submitMapJobs(final MapFunction map, + private Queue> submitMapJobs(final NanoSchedulerMapFunction map, final ExecutorService executor, final List inputs) { final Queue> mapQueue = new LinkedList>(); @@ -299,10 +314,10 @@ public class NanoScheduler { */ private class CallableMap implements Callable { final InputType input; - final MapFunction map; + final NanoSchedulerMapFunction map; @Requires({"map != null"}) - private CallableMap(final MapFunction map, final InputType inputs) { + private CallableMap(final NanoSchedulerMapFunction map, final InputType inputs) { this.input = inputs; this.map = map; } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java similarity index 84% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java index 440c263b7..ddf4421d2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java @@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface MapFunction { +public interface NanoSchedulerMapFunction { /** * Return function on input, returning a value of ResultType * @param input diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java new file mode 100644 index 000000000..8631196a3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. 
+ * User: depristo + * Date: 9/4/12 + * Time: 2:10 PM + * To change this template use File | Settings | File Templates. + */ +public interface NanoSchedulerProgressFunction { + public void progress(final InputType lastMapInput); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java index 8f1b0eddd..7e58eeaf9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java @@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface ReduceFunction { +public interface NanoSchedulerReduceFunction { /** * Combine one with sum into a new ReduceType * @param one the result of a map call on an input element diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 1dcc243f2..0ec3035e2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -21,11 +21,11 @@ import java.util.List; public class NanoSchedulerUnitTest extends BaseTest { public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - private static class Map2x implements MapFunction { + private static class Map2x implements NanoSchedulerMapFunction { @Override public Integer apply(Integer input) { return input * 2; } } - private static class ReduceSum implements ReduceFunction { + private static 
class ReduceSum implements NanoSchedulerReduceFunction { int prevOne = Integer.MIN_VALUE; @Override public Integer apply(Integer one, Integer sum) { From 03dd470ec152c1bf7682ce3afde2141b151acf13 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 14:58:12 -0400 Subject: [PATCH 128/161] Test for progressFunction in NanoScheduler; bugfix for single threaded fast path --- .../utils/nanoScheduler/NanoScheduler.java | 3 +++ .../nanoScheduler/NanoSchedulerUnitTest.java | 22 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index f0c2a6723..61d4fdd01 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -210,9 +210,12 @@ public class NanoScheduler { final ReduceType initialValue, final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; + int i = 0; while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); final MapType mapValue = map.apply(input); + if ( i++ % bufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); sum = reduce.apply(mapValue, sum); } return sum; diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 0ec3035e2..3bd006ffe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -34,6 +34,16 @@ public class NanoSchedulerUnitTest extends BaseTest { } } + private static class ProgressCallback implements NanoSchedulerProgressFunction { + int callBacks = 0; + + @Override + public void 
progress(Integer lastMapInput) { + callBacks++; + } + } + + private static int sum2x(final int start, final int end) { int sum = 0; for ( int i = start; i < end; i++ ) @@ -62,6 +72,11 @@ public class NanoSchedulerUnitTest extends BaseTest { return ints.iterator(); } + public int nExpectedCallbacks() { + int nElements = Math.max(end - start, 0); + return nElements / bufferSize; + } + public Map2x makeMap() { return new Map2x(); } public Integer initReduce() { return 0; } public ReduceSum makeReduce() { return new ReduceSum(); } @@ -73,7 +88,7 @@ public class NanoSchedulerUnitTest extends BaseTest { for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { for ( final int nt : Arrays.asList(1, 2, 4) ) { for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { + for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); } } @@ -101,12 +116,17 @@ public class NanoSchedulerUnitTest extends BaseTest { final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); + final ProgressCallback callback = new ProgressCallback(); + nanoScheduler.setProgressFunction(callback); + Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); Assert.assertNotNull(sum); Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected"); nanoScheduler.shutdown(); } From a997c99806b49c1ca0efdd4cc9c834df465e7b22 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 17:54:43 -0400 Subject: [PATCH 129/161] Initial NanoScheduler with input 
producer thread --- .../utils/nanoScheduler/NanoScheduler.java | 109 ++++++++++++++---- .../nanoScheduler/NanoSchedulerUnitTest.java | 3 +- 2 files changed, 86 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 61d4fdd01..4f9fedce3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -5,6 +5,7 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -74,7 +75,7 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); + this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -232,20 +233,31 @@ public class NanoScheduler { final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); ReduceType sum = initialValue; - while ( inputReader.hasNext() ) { + boolean done = false; + + final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + final InputProducer inputProducer = new InputProducer(inputReader, inputQueue); + inputProducer.start(); + + while ( ! 
done ) { try { - // read in our input values - final List inputs = readInputs(inputReader); - debugPrint("Enqueuing " + inputs.size() + " elements to map"); + final Pair, Boolean> readResults = readInputs(inputQueue); + final List inputs = readResults.getFirst(); + done = readResults.getSecond(); - // send jobs for map - final Queue> mapQueue = submitMapJobs(map, executor, inputs); + if ( ! inputs.isEmpty() ) { + // send jobs for map + final Queue> mapQueue = submitMapJobs(map, executor, inputs); - // send off the reduce job, and block until we get at least one reduce result - sum = reduceSerial(reduce, mapQueue, sum); - debugPrint(" Done with cycle of map/reduce"); + // send off the reduce job, and block until we get at least one reduce result + sum = reduceSerial(reduce, mapQueue, sum); + debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + } else { + // we must be done + if ( ! 
done ) throw new IllegalStateException("Inputs empty but not done"); + } } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -267,9 +279,9 @@ public class NanoScheduler { for ( final Future future : mapQueue ) { final MapType value = future.get(); // block until we get the values for this task - if ( TIME_CALLS) reduceTimer.restart(); + if ( TIME_CALLS ) reduceTimer.restart(); sum = reduce.apply(value, sum); - if ( TIME_CALLS) reduceTimer.stop(); + if ( TIME_CALLS ) reduceTimer.stop(); } return sum; @@ -280,21 +292,68 @@ public class NanoScheduler { * * @return a queue of input read in, containing one or more values of InputType read in */ - @Requires("inputReader.hasNext()") - @Ensures("!result.isEmpty()") - private List readInputs(final Iterator inputReader) { + @Requires("inputReader != null") + @Ensures("result != null") + private Pair, Boolean> readInputs(final BlockingQueue inputReader) throws InterruptedException { int n = 0; final List inputs = new LinkedList(); + boolean done = false; - if ( TIME_CALLS) inputTimer.restart(); - while ( inputReader.hasNext() && n < getBufferSize() ) { - final InputType input = inputReader.next(); - inputs.add(input); - n++; + while ( ! done && n < getBufferSize() ) { + final InputDatum input = inputReader.take(); + done = input.isLast(); + if ( ! 
done ) { + inputs.add(input.datum); + n++; + } } - if ( TIME_CALLS) inputTimer.stop(); - return inputs; + return new Pair, Boolean>(inputs, done); + } + + private class InputProducer extends Thread { + final Iterator inputReader; + final BlockingQueue outputQueue; + + public InputProducer(final Iterator inputReader, final BlockingQueue outputQueue) { + this.inputReader = inputReader; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( TIME_CALLS ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + outputQueue.put(new InputDatum(input)); + } + + // add the EOF object so we know we are done + outputQueue.put(new InputDatum()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + } + + private class InputDatum { + final boolean isLast; + final InputType datum; + + private InputDatum(final InputType datum) { + isLast = false; + this.datum = datum; + } + + private InputDatum() { + isLast = true; + this.datum = null; + } + + public boolean isLast() { + return isLast; + } } @Requires({"map != null", "! 
inputs.isEmpty()"}) @@ -326,10 +385,10 @@ public class NanoScheduler { } @Override public MapType call() throws Exception { - if ( TIME_CALLS) mapTimer.restart(); + if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); - if ( TIME_CALLS) mapTimer.stop(); + if ( TIME_CALLS ) mapTimer.stop(); return result; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 3bd006ffe..ddfc3cecd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -126,7 +126,7 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertNotNull(sum); Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); - Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected"); + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. 
Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); } @@ -168,6 +168,7 @@ public class NanoSchedulerUnitTest extends BaseTest { final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); + nanoScheduler.setDebug(true); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); From c822b7c760245064741acf3c2221a299c89d21cb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 18:02:40 -0400 Subject: [PATCH 130/161] Fix long-standing NPE in LMS due to inappropriate timing of initialization --- .../sting/gatk/executive/LinearMicroScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 5bcb16c94..740bcb566 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -60,11 +60,12 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; + + traversalEngine.startTimersIfNecessary(); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; - traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); From 71d9ebcb0d8266152a142b5f9207eec022a7716f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: 
Tue, 4 Sep 2012 18:03:05 -0400 Subject: [PATCH 131/161] Fix bug (introduced by me) that didn't include contig in progress meter --- .../broadinstitute/sting/gatk/traversals/TraversalEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 4422d49ae..8c617e4dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -242,7 +242,7 @@ public abstract class TraversalEngine,Provide else progressPrintFrequency = 10 * 1000; // in milliseconds - final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : Integer.toString(loc.getStart()); + final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart()); logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", posName, nRecords*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); From 1e55475adcce7a9ec5ab8322fe5ed46efe111d1c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 18:07:08 -0400 Subject: [PATCH 132/161] NanoScheduler uses ExecutorService to run input reader thread --- .../utils/nanoScheduler/NanoScheduler.java | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4f9fedce3..89e44ce93 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -51,7 +51,8 @@ public class NanoScheduler { final int bufferSize; final int nThreads; - final 
ExecutorService executor; + final ExecutorService inputExecutor; + final ExecutorService mapExecutor; boolean shutdown = false; boolean debug = false; @@ -75,7 +76,8 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); + this.mapExecutor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); + this.inputExecutor = Executors.newSingleThreadExecutor(); // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -107,10 +109,10 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - if ( executor != null ) { - final List remaining = executor.shutdownNow(); + if ( mapExecutor != null ) { + final List remaining = mapExecutor.shutdownNow(); if ( ! remaining.isEmpty() ) - throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!"); + throw new IllegalStateException("Remaining tasks found in the mapExecutor, unexpected behavior!"); } shutdown = true; @@ -236,8 +238,8 @@ public class NanoScheduler { boolean done = false; final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); - final InputProducer inputProducer = new InputProducer(inputReader, inputQueue); - inputProducer.start(); + + inputExecutor.submit(new InputProducer(inputReader, inputQueue)); while ( ! done ) { try { @@ -247,7 +249,7 @@ public class NanoScheduler { if ( ! 
inputs.isEmpty() ) { // send jobs for map - final Queue> mapQueue = submitMapJobs(map, executor, inputs); + final Queue> mapQueue = submitMapJobs(map, mapExecutor, inputs); // send off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); @@ -311,7 +313,7 @@ public class NanoScheduler { return new Pair, Boolean>(inputs, done); } - private class InputProducer extends Thread { + private class InputProducer implements Runnable { final Iterator inputReader; final BlockingQueue outputQueue; From 9bf1d138d9ba921312b49b00d1627f4feff62c2d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 15:41:52 -0400 Subject: [PATCH 133/161] New GATK argument interface for data and cpu threads -- Closes GSA-515 Nanoscheduler GSA-542 Good interface to nanoScheduler -- Old -nt means dataThreads -- New -cnt (--num_cpu_threads_per_data_thread) gives you n cpu threads for each data thread in the system -- Cleanup logic for handling data and cpu threading in HMS, LMS, and MS -- GATKRunReport reports the total number of threads in use by the GATK, not just the nt value -- Removed the io,cpu tags for nt. Stupid system if you ask me. 
Cleaned up the GenomeAnalysisEngine and ThreadAllocation handling to be totally straightforward now --- .../sting/gatk/GenomeAnalysisEngine.java | 32 ++++---- .../arguments/GATKArgumentCollection.java | 44 ++++++----- .../executive/HierarchicalMicroScheduler.java | 17 ++-- .../gatk/executive/LinearMicroScheduler.java | 9 +-- .../sting/gatk/executive/MicroScheduler.java | 40 ++++++---- .../io/stubs/VariantContextWriterStub.java | 4 +- .../sting/gatk/phonehome/GATKRunReport.java | 2 +- .../resourcemanagement/ThreadAllocation.java | 78 +++++++++++-------- 8 files changed, 123 insertions(+), 103 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 1b4333ce2..fa28b02cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -400,28 +400,22 @@ public class GenomeAnalysisEngine { * Parse out the thread allocation from the given command-line argument. */ private void determineThreadAllocation() { - Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters. 
- Integer numCPUThreads = null; - if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null) - throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("cpu")) - numCPUThreads = Integer.parseInt(tags.getValue("cpu")); - else if(argCollection.numberOfCPUThreads != null) - numCPUThreads = argCollection.numberOfCPUThreads; - - Integer numIOThreads = null; - if(tags.containsKey("io") && argCollection.numberOfIOThreads != null) - throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("io")) - numIOThreads = Integer.parseInt(tags.getValue("io")); - else if(argCollection.numberOfIOThreads != null) - numIOThreads = argCollection.numberOfIOThreads; - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, ! argCollection.disableEfficiencyMonitor); + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + ! argCollection.disableEfficiencyMonitor); } + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 33400bd9e..b9e44d87b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -287,9 +287,32 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - /** How many threads should be allocated to this analysis. */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) - public Integer numberOfThreads = 1; + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per + * data thread, and act as completely data parallel processing, increasing the memory usage of GATK + * by M data threads. Data threads generally scale extremely effectively, up to 24 cores + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false) + public Integer numberOfDataThreads = 1; + + /** + * How many CPU threads should be allocated per data thread? Each CPU thread operates the map + * cycle independently, but may run into earlier scaling problems with IO than data threads. 
Has + * the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. + */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "cnt", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) + @Hidden + public int numberOfIOThreads = 0; /** * By default the GATK monitors its own efficiency, but this can have a itsy-bitsy tiny @@ -299,24 +322,9 @@ public class GATKArgumentCollection { @Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false) public Boolean disableEfficiencyMonitor = false; - /** - * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types. - * TODO: Kill this when I can do a tagged integer in Queue. 
- */ - @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false) - @Hidden - public Integer numberOfCPUThreads = null; - @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) - @Hidden - public Integer numberOfIOThreads = null; - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) public Integer numberOfBAMFileHandles = null; - @Argument(fullName="nanoThreads", shortName = "nanoThreads", doc="NanoThreading", required = false) - @Hidden - public int nanoThreads = 1; - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 9198d210d..f1d2f7b5b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -76,21 +77,21 @@ public class HierarchicalMicroScheduler extends MicroScheduler 
implements Hierar /** * Create a new hierarchical microscheduler to process the given reads and reference. * - * @param walker the walker used to process the dataset. - * @param reads Reads file(s) to process. - * @param reference Reference for driving the traversal. - * @param nThreadsToUse maximum number of threads to use to do the work + * @param walker the walker used to process the dataset. + * @param reads Reads file(s) to process. + * @param reference Reference for driving the traversal. + * @param threadAllocation How should we apply multi-threaded execution? */ protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int nThreadsToUse, - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, nThreadsToUse); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) { + final int nThreadsToUse = threadAllocation.getNumDataThreads(); + if ( threadAllocation.monitorThreadEfficiency() ) { final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); setThreadEfficiencyMonitor(monitoringThreadFactory); this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 740bcb566..ceb4a6f9b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import 
org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -39,13 +40,11 @@ public class LinearMicroScheduler extends MicroScheduler { final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads, // may be > 1 if are nanoScheduling - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, numThreads); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) + if ( threadAllocation.monitorThreadEfficiency() ) setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); - } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 073a46ee3..bc0d5da96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,27 +100,30 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (threadAllocation.getNumCPUThreads() > 1) { + if ( threadAllocation.isRunningInParallelMode() ) + logger.info(String.format("Running the GATK in parallel mode with %d CPU threads for each of %d data threads", + threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); + + if ( threadAllocation.getNumDataThreads() > 1 ) { if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - - if ( walker instanceof ReadWalker ) { - if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); } else { - // TODO -- update test for when nano scheduling only is an option - if ( ! 
(walker instanceof TreeReducible) ) badNT(engine, walker); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof ThreadSafeMapReduce) ) + throw badNT("cnt", engine, walker); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } - private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); } /** @@ -130,24 +133,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads The reads. * @param reference The reference. 
* @param rods the rods to include in the traversal - * @param numThreads the number of threads we are using in the underlying traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal */ protected MicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads) { + final ThreadAllocation threadAllocation) { this.engine = engine; this.reads = reads; this.reference = reference; this.rods = rods; if (walker instanceof ReadWalker) { - traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); + traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseReads(); } else if (walker instanceof LocusWalker) { - // TODO -- refactor to use better interface - traversalEngine = engine.getArguments().nanoThreads > 1 ? new TraverseLociNano(engine.getArguments().nanoThreads) : new TraverseLociLinear(); + traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? 
new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 260a7efda..ee1dc63e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; import java.io.File; @@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub, Var * @return */ public boolean alsoWriteBCFForTest() { - return engine.getArguments().numberOfThreads == 1 && // only works single threaded + return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded ! 
isCompressed() && // for non-compressed outputs getFile() != null && // that are going to disk engine.getArguments().generateShadowBCF; // and we actually want to do it diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 6f3f175a2..51fed470f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -218,7 +218,7 @@ public class GATKRunReport { // if there was an exception, capture it this.mException = e == null ? null : new ExceptionToXML(e); - numThreads = engine.getArguments().numberOfThreads; + numThreads = engine.getTotalNumberOfThreads(); percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index caae55ac5..f958c9db8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.resourcemanagement; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; /** * Models how threads are distributed between various components of the GATK. @@ -33,7 +33,12 @@ public class ThreadAllocation { /** * The number of CPU threads to be used by the GATK. 
*/ - private final int numCPUThreads; + private final int numDataThreads; + + /** + * The number of CPU threads per data thread for GATK processing + */ + private final int numCPUThreadsPerDataThread; /** * Number of threads to devote exclusively to IO. Default is 0. @@ -45,8 +50,12 @@ public class ThreadAllocation { */ private final boolean monitorEfficiency; - public int getNumCPUThreads() { - return numCPUThreads; + public int getNumDataThreads() { + return numDataThreads; + } + + public int getNumCPUThreadsPerDataThread() { + return numCPUThreadsPerDataThread; } public int getNumIOThreads() { @@ -57,47 +66,50 @@ public class ThreadAllocation { return monitorEfficiency; } + /** + * Are we running in parallel mode? + * + * @return true if any parallel processing is enabled + */ + public boolean isRunningInParallelMode() { + return getTotalNumThreads() > 1; + } + + /** + * What is the total number of threads in use by the GATK? + * + * @return the sum of all thread allocations in this object + */ + public int getTotalNumThreads() { + return getNumDataThreads() + getNumCPUThreadsPerDataThread() + getNumIOThreads(); + } + /** * Construct the default thread allocation. */ public ThreadAllocation() { - this(1, null, null, false); + this(1, 1, 0, false); } /** * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). - * @param totalThreads Complete number of threads to allocate. - * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numDataThreads Total number of threads allocated to the traversal. + * @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate * @param numIOThreads Total number of threads allocated exclusively to IO. + * @param monitorEfficiency should we monitor threading efficiency in the GATK? 
*/ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorEfficiency) { - // If no allocation information is present, allocate all threads to CPU - if(numCPUThreads == null && numIOThreads == null) { - this.numCPUThreads = totalThreads; - this.numIOThreads = 0; - } - // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). - else if(numIOThreads == null) { - if(numCPUThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = totalThreads - numCPUThreads; - } - // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). - else if(numCPUThreads == null) { - if(numIOThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); - this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); - this.numIOThreads = numIOThreads; - } - else { - if(numCPUThreads + numIOThreads != totalThreads) - throw new UserException(String.format("Invalid thread allocation. 
User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = numIOThreads; - } + public ThreadAllocation(final int numDataThreads, + final int numCPUThreadsPerDataThread, + final int numIOThreads, + final boolean monitorEfficiency) { + if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads); + if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread); + if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads); + this.numDataThreads = numDataThreads; + this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread; + this.numIOThreads = numIOThreads; this.monitorEfficiency = monitorEfficiency; } } From 225f3a0ebe380ce8283c4adc4e46fcfa91b2155b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:35:00 -0400 Subject: [PATCH 134/161] Update integration test system to allow us to differentiate between testing data and cpu parallelism --- .../org/broadinstitute/sting/WalkerTest.java | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7e38c00f3..660259ca8 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -40,13 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider; - -import java.io.*; - import 
org.testng.Assert; import org.testng.annotations.AfterSuite; import org.testng.annotations.BeforeMethod; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; import java.text.SimpleDateFormat; import java.util.*; @@ -251,20 +251,43 @@ public class WalkerTest extends BaseTest { return false; } - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTest(name, spec, Arrays.asList(1, 4)); + public enum ParallelTestType { + TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH } - protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List parallelThreads) { + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.BOTH); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { String originalArgs = spec.args; Pair, List> results = null; - for ( int nt : parallelThreads ) { + boolean ran1 = false; + for ( int nt : ntThreads ) { String extra = nt == 1 ? 
"" : (" -nt " + nt); + ran1 = ran1 || nt == 1; spec.args = originalArgs + extra; results = executeTest(name + "-nt-" + nt, spec); } + for ( int cnt : cpuThreads ) { + if ( cnt != 1 ) { + String extra = " -cnt " + cnt; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + cnt, spec); + } + } + return results; } From dddf148a595af445d3a9e6ab66bf20a6d8dc3e93 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:35:32 -0400 Subject: [PATCH 135/161] Fixed bug in ThreadAllocation getTotalNumberOfThreads -- It isnt data + cpu its data * cpu threads. --- .../sting/gatk/resourcemanagement/ThreadAllocation.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index f958c9db8..c86f06c25 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -81,7 +81,7 @@ public class ThreadAllocation { * @return the sum of all thread allocations in this object */ public int getTotalNumThreads() { - return getNumDataThreads() + getNumCPUThreadsPerDataThread() + getNumIOThreads(); + return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads(); } /** From c5f1ceaa95d17b9aedd9b2e9a33d7d516fee95b8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:38:21 -0400 Subject: [PATCH 136/161] All read and loci traversals go through NanoScheduler now -- The NanoScheduler is doing a good job at tracking important information like time spent in map/reduce/input etc. 
-- Can be disabled with static boolean in MicroScheduler if we have problems -- See GSA-515 Nanoscheduler GSA-549 Retire TraverseReads and TraverseLoci after testing confirms nano scheduler version in single threaded version is fine --- .../sting/gatk/executive/MicroScheduler.java | 8 +++++--- .../utils/nanoScheduler/NanoScheduler.java | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index bc0d5da96..490f44470 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -59,6 +59,8 @@ import java.util.Collection; /** Shards and schedules data in manageable chunks. */ public abstract class MicroScheduler implements MicroSchedulerMBean { + // TODO -- remove me and retire non nano scheduled versions of traversals + private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -101,7 +103,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { if ( threadAllocation.isRunningInParallelMode() ) - logger.info(String.format("Running the GATK in parallel mode with %d CPU threads for each of %d data threads", + logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); if ( threadAllocation.getNumDataThreads() > 1 ) { @@ -147,11 +149,11 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { this.rods = rods; if 
(walker instanceof ReadWalker) { - traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) : new TraverseReads(); } else if (walker instanceof LocusWalker) { - traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 89e44ce93..ade6dcaf5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -193,6 +193,7 @@ public class NanoScheduler { if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); outsideSchedulerTimer.stop(); + ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { result = executeSingleThreaded(inputReader, map, initialValue, reduce); @@ -214,13 +215,29 @@ public class NanoScheduler { final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; int i = 0; + + // start timer to ensure that both hasNext and next are caught by the timer + if ( TIME_CALLS ) inputTimer.restart(); while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + + // map + if ( TIME_CALLS ) mapTimer.restart(); final MapType mapValue = map.apply(input); + if ( TIME_CALLS ) mapTimer.stop(); + if ( i++ % bufferSize == 0 && progressFunction != null ) progressFunction.progress(input); + + // 
reduce + if ( TIME_CALLS ) reduceTimer.restart(); sum = reduce.apply(mapValue, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + + if ( TIME_CALLS ) inputTimer.restart(); } + return sum; } From e11915aa0aa901d44e73a7d44c2ba1b707e42d21 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 17:37:56 -0400 Subject: [PATCH 137/161] GSA-515 Nanoscheduler GSA-550 ThreadSafeMapReduce shouldn't be super interface of TreeReducible --- .../broadinstitute/sting/gatk/executive/MicroScheduler.java | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/FlagStat.java | 2 +- .../{ThreadSafeMapReduce.java => NanoSchedulable.java} | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/Pileup.java | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/PrintReads.java | 2 +- .../org/broadinstitute/sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 2 +- .../org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java | 3 ++- .../org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java | 3 ++- .../org/broadinstitute/sting/gatk/walkers/qc/CountReads.java | 4 ++-- 11 files changed, 14 insertions(+), 12 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/walkers/{ThreadSafeMapReduce.java => NanoSchedulable.java} (97%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 490f44470..1da712e8a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -116,7 +116,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } else { - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof ThreadSafeMapReduce) ) + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) throw badNT("cnt", engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 14d14aca5..b4ef66aaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { +public class FlagStat extends ReadWalker implements NanoSchedulable { @Output PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java index 1ce469f8c..731ce7e4e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java @@ -27,5 +27,5 @@ package org.broadinstitute.sting.gatk.walkers; * declare that their map function is thread-safe and so multiple * map calls can be run in parallel in the same JVM instance. 
*/ -public interface ThreadSafeMapReduce { +public interface NanoSchedulable { } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 607c83966..a3efea9f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -52,7 +52,7 @@ import java.util.List; * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 4118617fc..37176cbf9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -93,7 +93,7 @@ import java.util.*; @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { +public class PrintReads extends ReadWalker implements NanoSchedulable { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index 8621c0e9d..c950e07e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. */ -public interface TreeReducible extends ThreadSafeMapReduce { +public interface TreeReducible { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 443b493be..43aa85a05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -109,7 +109,7 @@ import java.util.ArrayList; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible { +public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 93928a780..32ceff715 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -125,7 +125,7 @@ import java.util.*; // TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: // TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible { +public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible, NanoSchedulable { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index bd10eab87..cd295f26e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -40,7 +41,7 @@ import java.io.PrintStream; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountLoci extends LocusWalker implements TreeReducible { +public class 
CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { @Output(doc="Write count to this file instead of STDOUT") PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index 9915d617e..ab37a2322 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; @@ -73,7 +74,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>>, NanoSchedulable { @Output public PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 856ea77f5..301fa5b9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -4,9 +4,9 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; 
+import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements ThreadSafeMapReduce { +public class CountReads extends ReadWalker implements NanoSchedulable { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } From 574a8f710b5b193f7a2d4299b5f9222605aa6ff7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 17:40:02 -0400 Subject: [PATCH 138/161] Add static boolean controlled output of individual map call timing to nanoSecond resolution --- .../sting/utils/nanoScheduler/NanoScheduler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index ade6dcaf5..24db0f7dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -47,6 +47,7 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean LOG_MAP_TIMES = false; private final static boolean TIME_CALLS = true; final int bufferSize; @@ -224,7 +225,9 @@ public class NanoScheduler { // map if ( 
TIME_CALLS ) mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); if ( TIME_CALLS ) mapTimer.stop(); if ( i++ % bufferSize == 0 && progressFunction != null ) From 228bac75e48b390e886d9fdbf222978aaad1fc2b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 20:57:49 -0400 Subject: [PATCH 139/161] By default do only NT tests in integration tests --- public/java/test/org/broadinstitute/sting/WalkerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 660259ca8..bcfd00aed 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -265,7 +265,7 @@ public class WalkerTest extends BaseTest { } protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTestParallel(name, spec, ParallelTestType.BOTH); + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); } protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { From 0bd2a872faf1d71911a85c9748e3cfcc426bf6df Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 07:26:01 -0400 Subject: [PATCH 140/161] Done GSA-282: Unindexed traversals crash if a read goes off the end of a contig -- Already fixed in the codebase. Added unindexed bam and integration tests to ensure this is fine going forward. 
--- .../walkers/PileupWalkerIntegrationTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 667b325ed..e16ef3125 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -16,13 +16,27 @@ public class PileupWalkerIntegrationTest extends WalkerTest { executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; @Test public void testSingleReadAligningOffChromosome1() { String gatk_args = "-T Pileup " + " -I " + privateTestDir + "readOffb37contig1.bam" + " -R " + b37KGReference + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); executeTest("Testing single read spanning off chromosome 1", spec); } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } From 1b064805ed31c6532abf7d55d2e641388aad42c0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 21:13:19 -0400 Subject: [PATCH 141/161] Renaming -cnt to -nct for consistency --- .../sting/gatk/arguments/GATKArgumentCollection.java | 2 
+- .../sting/gatk/executive/MicroScheduler.java | 2 +- public/java/test/org/broadinstitute/sting/WalkerTest.java | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b9e44d87b..b8a7334b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -307,7 +307,7 @@ public class GATKArgumentCollection { * the benefit of not requiring X times as much memory per thread as data threads do, but rather * only a constant overhead. */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "cnt", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) public int numberOfCPUThreadsPerDataThread = 1; @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 1da712e8a..46d6b5882 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -117,7 +117,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } else { if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) - throw badNT("cnt", engine, walker); + throw badNT("nct", engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index bcfd00aed..fa9f9e8a7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -280,11 +280,11 @@ public class WalkerTest extends BaseTest { results = executeTest(name + "-nt-" + nt, spec); } - for ( int cnt : cpuThreads ) { - if ( cnt != 1 ) { - String extra = " -cnt " + cnt; + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; spec.args = originalArgs + extra; - results = executeTest(name + "-cnt-" + cnt, spec); + results = executeTest(name + "-cnt-" + nct, spec); } } From 5ab5d8dee8754f0a8b545971b48644226934017a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 22:08:34 -0400 Subject: [PATCH 142/161] Give EfficiencyMonitoringThreadFactoryUnitTest longer to complete its tests --- .../EfficiencyMonitoringThreadFactoryUnitTest.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 6544b9845..d8da274ce 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -34,14 +34,17 @@ import org.testng.annotations.Test; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.*; +import java.util.concurrent.Callable; +import 
java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * Tests for the state monitoring thread factory. */ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 10000; private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object(); From 6df6c1abd538f5616fb624236e0f9cd36a0871ea Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Sep 2012 13:14:18 -0400 Subject: [PATCH 144/161] Fix for PBT to stop NPE when there are no likelihoods present --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index bbd4bf92f..00acf854a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -541,7 +541,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Get a Map of genotype likelihoods. //In case of null, unavailable or no call, all likelihoods are 1/3. 
private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled()){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ EnumMap likelihoods = new EnumMap(GenotypeType.class); likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); likelihoods.put(GenotypeType.HET,1.0/3.0); From cb84a6473f19597d6ab220915fdd102002c0f352 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 24 May 2012 09:17:11 -0400 Subject: [PATCH 145/161] Downsampling: experimental engine integration -Off by default; engine fork isolates new code paths from old code paths, so no integration tests change yet -Experimental implementation is currently BROKEN due to a serious issue involving file spans. No one can/should use the experimental features until I've patched this issue. -There are temporarily two independent versions of LocusIteratorByState. Anyone changing one version should port the change to the other (if possible), and anyone adding unit tests for one version should add the same unit tests for the other (again, if possible). This situation will hopefully be extremely temporary, and last only until the experimental implementation is proven. 
--- .../reducereads/SlidingWindow.java | 2 +- .../sting/gatk/DownsamplingMethod.java | 52 -- .../sting/gatk/GenomeAnalysisEngine.java | 29 +- .../sting/gatk/ReadProperties.java | 1 + .../sting/gatk/WalkerManager.java | 12 +- .../arguments/GATKArgumentCollection.java | 40 +- .../gatk/datasources/providers/LocusView.java | 9 +- .../gatk/datasources/reads/SAMDataSource.java | 90 ++- .../{ => downsampling}/DownsampleType.java | 2 +- .../sting/gatk/downsampling/Downsampler.java | 73 +- .../gatk/downsampling/DownsamplingMethod.java | 153 +++++ .../DownsamplingReadsIterator.java | 47 +- .../downsampling/FractionalDownsampler.java | 42 +- .../FractionalDownsamplerFactory.java | 45 ++ .../downsampling/LevelingDownsampler.java | 212 ++++++ .../PerSampleDownsamplingReadsIterator.java | 202 ++++++ .../downsampling/PositionalDownsampler.java | 259 ------- .../gatk/downsampling/ReadsDownsampler.java | 17 +- .../downsampling/ReadsDownsamplerFactory.java | 37 + .../downsampling/ReservoirDownsampler.java | 37 +- .../ReservoirDownsamplerFactory.java | 45 ++ .../SimplePositionalDownsampler.java | 169 +++++ .../SimplePositionalDownsamplerFactory.java | 45 ++ .../sting/gatk/executive/WindowMaker.java | 9 +- ...tor.java => LegacyDownsampleIterator.java} | 4 +- .../gatk/iterators/LocusIteratorByState.java | 4 +- .../LocusIteratorByStateExperimental.java | 649 ++++++++++++++++++ .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../sting/gatk/walkers/Downsample.java | 2 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../broadinstitute/sting/utils/MathUtils.java | 12 +- .../pileup/AbstractReadBackedPileup.java | 2 + .../sam/ArtificialMultiSampleReadStream.java | 86 +++ .../utils/sam/ArtificialSAMFileReader.java | 27 + .../sting/utils/sam/ArtificialSAMUtils.java | 24 + .../sam/ArtificialSingleSampleReadStream.java | 212 ++++++ ...ificialSingleSampleReadStreamAnalyzer.java | 281 ++++++++ .../reads/DownsamplerBenchmark.java | 5 +- 
.../reads/SAMDataSourceUnitTest.java | 171 ++++- .../DownsamplingReadsIteratorUnitTest.java | 161 +++-- .../FractionalDownsamplerUnitTest.java | 178 +++-- .../LevelingDownsamplerUnitTest.java | 163 +++++ ...mpleDownsamplingReadsIteratorUnitTest.java | 298 ++++++++ .../PositionalDownsamplerUnitTest.java | 357 ---------- ...ificialSingleSampleReadStreamAnalyzer.java | 126 ++++ .../ReservoirDownsamplerUnitTest.java | 129 ++++ .../SimplePositionalDownsamplerUnitTest.java | 330 +++++++++ ...usIteratorByStateExperimentalUnitTest.java | 546 +++++++++++++++ .../VerifyingSamIteratorUnitTest.java | 13 +- ...> LegacyReservoirDownsamplerUnitTest.java} | 2 +- ...ificialSingleSampleReadStreamUnitTest.java | 161 +++++ 52 files changed, 4701 insertions(+), 879 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java rename public/java/src/org/broadinstitute/sting/gatk/{ => downsampling}/DownsampleType.java (75%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java rename 
public/java/src/org/broadinstitute/sting/gatk/iterators/{DownsampleIterator.java => LegacyDownsampleIterator.java} (88%) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename public/java/test/org/broadinstitute/sting/utils/{ReservoirDownsamplerUnitTest.java => LegacyReservoirDownsamplerUnitTest.java} (99%) create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index bdb9ef843..d2fc08c62 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -546,7 +546,7 @@ public class SlidingWindow { FractionalDownsampler downsampler = new FractionalDownsampler(fraction); downsampler.submit(allReads); - return downsampler.consumeDownsampledItems(); + return downsampler.consumeFinalizedItems(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java deleted file mode 100644 index 6d9e79156..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk; - -import org.broadinstitute.sting.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - * - * @author hanna - * @version 0.1 - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null); - - public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) { - // Do some basic sanity checks on the downsampling parameters passed in. - - // Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator. 
- if(type != DownsampleType.NONE && toFraction == null && toCoverage == null) - throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if(toFraction != null && toCoverage != null) - throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // Experimental by sample downsampling does not work with a fraction of reads. - if(type == DownsampleType.BY_SAMPLE && toFraction != null) - throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method"); - - this.type = type; - this.toCoverage = toCoverage; - this.toFraction = toFraction; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index fa28b02cd..3ce8a92b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -441,14 +442,18 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - DownsamplingMethod method; - if(argCollection.getDownsamplingMethod() != null) - method = argCollection.getDownsamplingMethod(); - else 
if(WalkerManager.getDownsamplingMethod(walker) != null) - method = WalkerManager.getDownsamplingMethod(walker); - else - method = GATKArgumentCollection.getDefaultDownsamplingMethod(); - return method; + boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling + if ( useExperimentalDownsampling ) { + throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!"); + } + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + + return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -821,11 +826,13 @@ public class GenomeAnalysisEngine { * @return A data source for the given set of reads. */ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod method = getDownsamplingMethod(); + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); // Synchronize the method back into the collection so that it shows up when // interrogating for the downsample method during command line recreation. 
- setDownsamplingMethod(method); + setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); @@ -843,7 +850,7 @@ public class GenomeAnalysisEngine { argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, argCollection.readBufferSize, - method, + downsamplingMethod, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, readTransformers, diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index b2d4d202d..e1ada93cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index ae59ce438..fbacbddc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import 
org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -304,9 +306,10 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walkerClass The class of the walker to interrogate. + * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass) { + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -314,7 +317,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); } return downsamplingMethod; @@ -333,10 +336,11 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walker The walker to interrogate. + * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. 
*/ - public static DownsamplingMethod getDownsamplingMethod(Walker walker) { - return getDownsamplingMethod(walker.getClass()); + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { + return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b8a7334b3..44817379a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.IntervalBinding; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; @@ -140,15 +140,11 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - /** - * The override mechanism in the GATK, by default, populates the command-line arguments, then - * the defaults from the walker annotations. Unfortunately, walker annotations should be trumped - * by a user explicitly specifying command-line arguments. 
- * TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments. - */ - private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000; - + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) public DownsampleType downsamplingType = null; @@ -158,17 +154,20 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) public Integer downsampleCoverage = null; + @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) + @Hidden + public boolean enableExperimentalDownsampling = false; + /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify * a default downsampling mechanism, return the default. * @return The explicitly specified downsampling mechanism, or the default if none exists. 
*/ public DownsamplingMethod getDownsamplingMethod() { - if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null) + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - if(downsamplingType == null && downsampleCoverage != null) - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null); - return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction); + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); } /** @@ -178,9 +177,11 @@ public class GATKArgumentCollection { public void setDownsamplingMethod(DownsamplingMethod method) { if (method == null) throw new IllegalArgumentException("method is null"); + downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; + enableExperimentalDownsampling = method.useExperimentalDownsampling; } // -------------------------------------------------------------------------------------------------------------- @@ -208,15 +209,6 @@ public class GATKArgumentCollection { @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) public File performanceLog = null; - /** - * Gets the default downsampling method, returned if the user didn't specify any downsampling - * method. - * @return The default downsampling mechanism, or null if none exists. 
- */ - public static DownsamplingMethod getDefaultDownsamplingMethod() { - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); - } - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index a3ce6dd27..cd3403f2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; @@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) + + // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: + if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + } // Indicate that the next operation will need to advance. 
nextLocus = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7d027438b..437813f19 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -30,7 +30,9 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.*; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -152,6 +154,8 @@ public class SAMDataSource { */ private final ThreadAllocation threadAllocation; + private final boolean expandShardsForDownsampling; + /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. @@ -302,6 +306,11 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, defaultBaseQualities); + expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null && + readProperties.getDownsamplingMethod().useExperimentalDownsampling && + readProperties.getDownsamplingMethod().type != DownsampleType.NONE && + readProperties.getDownsamplingMethod().toCoverage != null; + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -457,6 +466,16 @@ public class SAMDataSource { } } + /** + * Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places? 
+ * + * @return true if we are using expanded shards, otherwise false + */ + public boolean usingExpandedShards() { + return expandShardsForDownsampling; + } + + /** * Fill the given buffering shard with reads. * @param shard Shard to fill. @@ -484,6 +503,31 @@ public class SAMDataSource { } } + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( iterator.hasNext() ) { + SAMRecord additionalRead = iterator.next(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + shard.addRead(additionalRead); + noteFilePositionUpdate(positionUpdates, additionalRead); + } + } + // If the reads are sorted in queryname order, ensure that all reads // having the same queryname become part of the same shard. 
if(sortOrder == SAMFileHeader.SortOrder.queryname) { @@ -578,6 +622,7 @@ public class SAMDataSource { iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); + iteratorMap.put(readers.getReader(id), iterator); } @@ -660,20 +705,25 @@ public class SAMDataSource { List readTransformers, byte defaultBaseQualities) { - // *********************************************************************************** // - // * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // - // * (otherwise we will process something that we may end up throwing away) * // - // *********************************************************************************** // + // ************************************************************************************************ // + // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // + // * (otherwise we will process something that we may end up throwing away) * // + // ************************************************************************************************ // - if (downsamplingFraction != null) - wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); + wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + + if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } + + // Use the old fractional downsampler only if we're not using experimental downsampling: + if ( ! 
readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, // verify the read ordering by applying a sort order iterator if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); - - wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + wrappedIterator = new VerifyingSamIterator(wrappedIterator); if (useOriginalBaseQualities || defaultBaseQualities >= 0) // only wrap if we are replacing the original qualities or using a default base quality @@ -688,6 +738,26 @@ public class SAMDataSource { return wrappedIterator; } + protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); + + return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? 
+ new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); + + return new DownsamplingReadsIterator(wrappedIterator, downsampler); + } + + return wrappedIterator; + } + + private class SAMResourcePool { /** * How many entries can be cached in this resource pool? diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java similarity index 75% rename from public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java rename to public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java index 3fabf6e0d..c3d17436a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk; +package org.broadinstitute.sting.gatk.downsampling; /** * Type of downsampling method to invoke. diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index 5fb99b2bc..f5741af4e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -28,49 +28,92 @@ import java.util.Collection; import java.util.List; /** - * The basic downsampler API, with no reads-specific operations + * The basic downsampler API, with no reads-specific operations. + * + * Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle + * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a + * PerSampleDownsamplingReadsIterator. * * @author David Roazen */ public interface Downsampler { - /* - * Submit one item to the downsampler for consideration . 
Some downsamplers will be able to determine + /** + * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine * immediately whether the item survives the downsampling process, while others will need to see * more items before making that determination. + * + * @param item the individual item to submit to the downsampler for consideration */ public void submit( T item ); - /* - * Submit a collection of items to the downsampler for consideration. + /** + * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling + * submit() on each individual item in the collection. + * + * @param items the collection of items to submit to the downsampler for consideration */ public void submit( Collection items ); - /* + /** * Are there items that have survived the downsampling process waiting to be retrieved? + * + * @return true if this downsampler has > 0 finalized items, otherwise false */ - public boolean hasDownsampledItems(); + public boolean hasFinalizedItems(); - /* - * Return (and remove) all items that have survived downsampling and are waiting to be retrieved. + /** + * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. + * + * @return a list of all finalized items this downsampler contains, or an empty list if there are none */ - public List consumeDownsampledItems(); + public List consumeFinalizedItems(); - /* + /** * Are there items stored in this downsampler that it doesn't yet know whether they will * ultimately survive the downsampling process? 
+ * + * @return true if this downsampler has > 0 pending items, otherwise false */ public boolean hasPendingItems(); - /* + /** + * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) + * + * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekFinalized(); + + /** + * Peek at the first pending item stored in this downsampler (or null if there are no pending items) + * + * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekPending(); + + /** + * Returns the number of items discarded (so far) during the downsampling process + * + * @return the number of items that have been submitted to this downsampler and discarded in the process of + * downsampling + */ + public int getNumberOfDiscardedItems(); + + /** * Used to tell the downsampler that no more items will be submitted to it, and that it should * finalize any pending items. */ public void signalEndOfInput(); - /* - * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state - * information. 
+ /** + * Empty the downsampler of all finalized/pending items */ public void clear(); + + /** + * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items + */ + public void reset(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java new file mode 100644 index 000000000..ae1d98ce0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Describes the method for downsampling reads at a given locus. + */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + /** + * Use the new experimental downsampling? + */ + public final boolean useExperimentalDownsampling; + + /** + * Expresses no downsampling applied at all. + */ + public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false); + + /** + * Default type to use if no type is specified + */ + public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; + + /** + * Default target coverage for locus-based traversals + */ + public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; + + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; + this.toCoverage = toCoverage; + this.toFraction = toFraction; + this.useExperimentalDownsampling = useExperimentalDownsampling; + + if ( type == DownsampleType.NONE ) { + toCoverage = null; + toFraction = null; + } + + validate(); + } + + private void validate() { + // Can't leave toFraction and toCoverage null unless type is NONE + if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) + throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if ( toFraction != null && toCoverage != null ) + throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); + + // toCoverage must be > 0 when specified + if ( toCoverage != null && toCoverage <= 0 ) { + throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage"); + } + + // toFraction must be >= 0.0 and <= 1.0 when specified + if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { + throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); + } + + // Some restrictions only exist for the old downsampling implementation: + if ( ! 
useExperimentalDownsampling ) { + // By sample downsampling does not work with a fraction of reads in the old downsampling implementation + if( type == DownsampleType.BY_SAMPLE && toFraction != null ) + throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + } + + // Some restrictions only exist for the new downsampling implementation: + if ( useExperimentalDownsampling ) { + if ( type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); + } + } + } + + public String toString() { + StringBuilder builder = new StringBuilder("Downsampling Settings: "); + + if ( type == DownsampleType.NONE ) { + builder.append("No downsampling"); + } + else { + builder.append(String.format("Method: %s ", type)); + + if ( toCoverage != null ) { + builder.append(String.format("Target Coverage: %d ", toCoverage)); + } + else { + builder.append(String.format("Target Fraction: %.2f ", toFraction)); + } + + if ( useExperimentalDownsampling ) { + builder.append("Using Experimental Downsampling"); + } + } + + return builder.toString(); + } + + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { + return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, + null, useExperimentalDownsampling); + } + else { + return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java index bccc2e946..c8fbc829c 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java @@ -33,7 +33,8 @@ import java.util.NoSuchElementException; /** - * StingSAMIterator wrapper around our generic reads downsampler interface + * StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style + * downsampler interface to a pull model. * * @author David Roazen */ @@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator { private StingSAMIterator nestedSAMIterator; private ReadsDownsampler downsampler; private Collection downsampledReadsCache; - private Iterator downsampledReadsCacheIterator; + private SAMRecord nextRead = null; + private Iterator downsampledReadsCacheIterator = null; + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsampler downsampler through which the reads will be fed + */ public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) { nestedSAMIterator = iter; this.downsampler = downsampler; - fillDownsampledReadsCache(); + + advanceToNextRead(); } public boolean hasNext() { - if ( downsampledReadsCacheIterator.hasNext() ) { - return true; - } - else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) { - return false; - } - - return true; + return nextRead != null; } public SAMRecord next() { - if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { + if ( nextRead == null ) { throw new NoSuchElementException("next() called when there are no more items"); } - return downsampledReadsCacheIterator.next(); + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! 
fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = downsampledReadsCacheIterator.next(); + } + } + + private boolean readyToReleaseReads() { + return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); } private boolean fillDownsampledReadsCache() { - while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { downsampler.submit(nestedSAMIterator.next()); } @@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator { downsampler.signalEndOfInput(); } - downsampledReadsCache = downsampler.consumeDownsampledItems(); + // use returned collection directly rather than make a copy, for speed + downsampledReadsCache = downsampler.consumeFinalizedItems(); downsampledReadsCacheIterator = downsampledReadsCache.iterator(); return downsampledReadsCacheIterator.hasNext(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index d5d529c9f..8901ae525 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -33,7 +33,10 @@ import java.util.Collection; import java.util.List; /** - * Fractional Downsampler: selects a specified fraction of the reads for inclusion + * Fractional Downsampler: selects a specified fraction of the reads for inclusion. + * + * Since the selection is done randomly, the actual fraction of reads retained may be slightly + * more or less than the requested fraction, depending on the total number of reads submitted. 
* * @author David Roazen */ @@ -43,8 +46,16 @@ public class FractionalDownsampler implements ReadsDownsamp private int cutoffForInclusion; + private int numDiscardedItems; + private static final int RANDOM_POOL_SIZE = 10000; + /** + * Construct a FractionalDownsampler + * + * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). + * Actual number of reads preserved may differ randomly. + */ public FractionalDownsampler( double fraction ) { if ( fraction < 0.0 || fraction > 1.0 ) { throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); @@ -52,12 +63,16 @@ public class FractionalDownsampler implements ReadsDownsamp cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); clear(); + reset(); } public void submit( T newRead ) { if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { selectedReads.add(newRead); } + else { + numDiscardedItems++; + } } public void submit( Collection newReads ) { @@ -66,11 +81,12 @@ public class FractionalDownsampler implements ReadsDownsamp } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return selectedReads.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; clear(); return downsampledItems; @@ -80,6 +96,18 @@ public class FractionalDownsampler implements ReadsDownsamp return false; } + public T peekFinalized() { + return selectedReads.isEmpty() ? 
null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } @@ -88,7 +116,15 @@ public class FractionalDownsampler implements ReadsDownsamp selectedReads = new ArrayList(); } + public void reset() { + numDiscardedItems = 0; + } + public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java new file mode 100644 index 000000000..7a7c9e91e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating FractionalDownsamplers on demand + * + * @author David Roazen + */ +public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private double fraction; + + public FractionalDownsamplerFactory( double fraction ) { + this.fraction = fraction; + } + + public ReadsDownsampler newInstance() { + return new FractionalDownsampler(fraction); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java new file mode 100644 index 000000000..73d69140d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.*; + +/** + * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from + * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling + * does not occur until all Lists have been submitted and signalEndOfInput() is called. + * + * The Lists should be LinkedLists for maximum efficiency during item removal, however other + * kinds of Lists are also accepted (albeit at a slight performance penalty). + * + * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, + * the Lists need not contain reads. 
However this downsampler may not be wrapped within one of the + * DownsamplingReadsIterators + * + * @param the List type representing the stacks to be leveled + * @param the type of the elements of each List + * + * @author David Roazen + */ +public class LevelingDownsampler, E> implements Downsampler { + + private int targetSize; + + private List groups; + + private boolean groupsAreFinalized; + + private int numDiscardedItems; + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + */ + public LevelingDownsampler( int targetSize ) { + this.targetSize = targetSize; + clear(); + reset(); + } + + public void submit( T item ) { + groups.add(item); + } + + public void submit( Collection items ){ + groups.addAll(items); + } + + public boolean hasFinalizedItems() { + return groupsAreFinalized && groups.size() > 0; + } + + public List consumeFinalizedItems() { + if ( ! hasFinalizedItems() ) { + return new ArrayList(); + } + + // pass by reference rather than make a copy, for speed + List toReturn = groups; + clear(); + return toReturn; + } + + public boolean hasPendingItems() { + return ! groupsAreFinalized && groups.size() > 0; + } + + public T peekFinalized() { + return hasFinalizedItems() ? groups.get(0) : null; + } + + public T peekPending() { + return hasPendingItems() ? 
groups.get(0) : null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + levelGroups(); + groupsAreFinalized = true; + } + + public void clear() { + groups = new ArrayList(); + groupsAreFinalized = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + private void levelGroups() { + int totalSize = 0; + int[] groupSizes = new int[groups.size()]; + int currentGroupIndex = 0; + + for ( T group : groups ) { + groupSizes[currentGroupIndex] = group.size(); + totalSize += groupSizes[currentGroupIndex]; + currentGroupIndex++; + } + + if ( totalSize <= targetSize ) { + return; // no need to eliminate any items + } + + // We will try to remove exactly this many items, however we will refuse to allow any + // one group to fall below size 1, and so might end up removing fewer items than this + int numItemsToRemove = totalSize - targetSize; + + currentGroupIndex = 0; + int numConsecutiveUmodifiableGroups = 0; + + // Continue until we've either removed all the items we wanted to, or we can't + // remove any more items without violating the constraint that all groups must + // be left with at least one item + while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { + if ( groupSizes[currentGroupIndex] > 1 ) { + groupSizes[currentGroupIndex]--; + numItemsToRemove--; + numConsecutiveUmodifiableGroups = 0; + } + else { + numConsecutiveUmodifiableGroups++; + } + + currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; + } + + // Now we actually go through and reduce each group to its new count as specified in groupSizes + currentGroupIndex = 0; + for ( T group : groups ) { + downsampleOneGroup(group, groupSizes[currentGroupIndex]); + currentGroupIndex++; + } + } + + private void downsampleOneGroup( T group, int numItemsToKeep ) { + if ( numItemsToKeep >= group.size() ) { + return; + } + + numDiscardedItems += group.size() - numItemsToKeep; + + BitSet 
itemsToKeep = new BitSet(group.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + + // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator + if ( group instanceof LinkedList ) { + Iterator iter = group.iterator(); + while ( iter.hasNext() ) { + iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather + // than suffer O(n^2) of item shifting + else { + List keptItems = new ArrayList(numItemsToKeep); + + for ( E item : group ) { + if ( itemsToKeep.get(currentIndex) ) { + keptItems.add(item); + } + currentIndex++; + } + group.clear(); + group.addAll(keptItems); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java new file mode 100644 index 000000000..8b2034460 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordComparator; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.*; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * that downsamples reads for each sample independently, and then re-assembles + * the reads back into a single merged stream. + * + * @author David Roazen + */ +public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsamplerFactory downsamplerFactory; + private Map> perSampleDownsamplers; + private PriorityQueue orderedDownsampledReadsCache; + private SAMRecord nextRead = null; + private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); + private SAMRecord earliestPendingRead = null; + private ReadsDownsampler earliestPendingDownsampler = null; + + // Initial size of our cache of finalized reads + private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; + + // The number of positional changes that can occur in the read stream before all downsamplers + // should be informed of the current position (guards against samples with relatively sparse reads + // getting stuck in a pending state): + private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value 
+ + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsamplerFactory factory used to create new downsamplers as needed + */ + public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { + nestedSAMIterator = iter; + this.downsamplerFactory = downsamplerFactory; + perSampleDownsamplers = new HashMap>(); + orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = orderedDownsampledReadsCache.poll(); + } + } + + private boolean readyToReleaseReads() { + if ( orderedDownsampledReadsCache.isEmpty() ) { + return false; + } + + return earliestPendingRead == null || + readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; + } + + private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { + // If there is no recorded earliest pending read and this downsampler has pending items, + // then this downsampler's first pending item becomes the new earliest pending read: + if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { + earliestPendingRead = currentDownsampler.peekPending(); + earliestPendingDownsampler = currentDownsampler; + } + // In all other cases, we only need to update the earliest pending read when the downsampler + // associated with it experiences a change in its pending reads, since by assuming a sorted + // read stream we're assured that each downsampler's earliest pending read will only 
increase + // in genomic position over time. + // + // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers + // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), + // TODO: but need to verify this empirically. + else if ( currentDownsampler == earliestPendingDownsampler && + (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { + + earliestPendingRead = null; + earliestPendingDownsampler = null; + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasPendingItems() && + (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { + + earliestPendingRead = perSampleDownsampler.peekPending(); + earliestPendingDownsampler = perSampleDownsampler; + } + } + } + } + + private boolean fillDownsampledReadsCache() { + SAMRecord prevRead = null; + int numPositionalChanges = 0; + + // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue + // can be released without violating global sort order + while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { + SAMRecord read = nestedSAMIterator.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); + if ( thisSampleDownsampler == null ) { + thisSampleDownsampler = downsamplerFactory.newInstance(); + perSampleDownsamplers.put(sampleName, thisSampleDownsampler); + } + + thisSampleDownsampler.submit(read); + updateEarliestPendingRead(thisSampleDownsampler); + + if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { + numPositionalChanges++; + } + + // If the number of times we've changed position exceeds a certain threshold, inform all + // downsamplers of the current position in the read stream. This is to prevent downsamplers + // for samples with sparser reads than others from getting stuck too long in a pending state. + if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalNoMoreReadsBefore(read); + updateEarliestPendingRead(perSampleDownsampler); + } + } + + prevRead = read; + } + + if ( ! 
nestedSAMIterator.hasNext() ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalEndOfInput(); + } + earliestPendingRead = null; + earliestPendingDownsampler = null; + } + + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); + } + } + + return readyToReleaseReads(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java deleted file mode 100644 index f29c7728c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions - * - * @author David Roazen - */ -public class PositionalDownsampler implements ReadsDownsampler { - - private int targetCoverage; - - private ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private LinkedList pendingReads; - - private ArrayList finalizedReads; - - public PositionalDownsampler ( int targetCoverage ) { - this.targetCoverage = targetCoverage; - clear(); - } - - public void submit ( T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - updateAndDownsamplePendingReads(); - } - - reservoir.submit(newRead); - updateCurrentPosition(newRead); - } - - public void submit ( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - - public boolean hasDownsampledItems() { - return finalizedReads.size() > 0; - } - - public List consumeDownsampledItems() { - List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - public boolean hasPendingItems() { - return pendingReads.size() > 0; - } - - public void signalEndOfInput() { - updateAndDownsamplePendingReads(); - - for ( PositionalReadGrouping group : pendingReads ) { - group.finalizeAllActiveReads(); - finalizedReads.addAll(group.getFinalizedReads()); - } - - pendingReads.clear(); - } - - public void clear() { - reservoir = new 
ReservoirDownsampler(targetCoverage); - pendingReads = new LinkedList(); - finalizedReads = new ArrayList(); - } - - public boolean requiresCoordinateSortOrder() { - return true; - } - - private void updateCurrentPosition ( T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - } - - private boolean readIsPastCurrentPosition ( T read ) { - return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart; - } - - private void updateAndDownsamplePendingReads() { - finalizeOutOfScopeReads(); - - List oldLocusReads = reservoir.consumeDownsampledItems(); - pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart)); - - downsampleOverlappingGroups(); - } - - private void finalizeOutOfScopeReads() { - Iterator iter = pendingReads.iterator(); - boolean noPrecedingUnfinalizedGroups = true; - - while ( iter.hasNext() ) { - PositionalReadGrouping currentGroup = iter.next(); - currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart); - - if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) { - iter.remove(); - finalizedReads.addAll(currentGroup.getFinalizedReads()); - } - else { - noPrecedingUnfinalizedGroups = false; - } - } - } - - private void downsampleOverlappingGroups() { - int[] groupReadCounts = new int[pendingReads.size()]; - int totalCoverage = 0; - int numActiveGroups = 0; - int currentGroup = 0; - - for ( PositionalReadGrouping group : pendingReads ) { - groupReadCounts[currentGroup] = group.numActiveReads(); - totalCoverage += groupReadCounts[currentGroup]; - - if ( groupReadCounts[currentGroup] > 0 ) { - numActiveGroups++; - } - - currentGroup++; - } - - if ( totalCoverage <= targetCoverage ) { - return; - } - - int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups); - currentGroup = 0; - - while ( numReadsToRemove > 0 ) { - if ( 
groupReadCounts[currentGroup] > 1 ) { - groupReadCounts[currentGroup]--; - numReadsToRemove--; - } - - currentGroup = (currentGroup + 1) % groupReadCounts.length; - } - - currentGroup = 0; - for ( PositionalReadGrouping group : pendingReads ) { - if ( ! group.isFinalized() ) { - group.downsampleActiveReads(groupReadCounts[currentGroup]); - } - currentGroup++; - } - } - - private class PositionalReadGrouping { - private List activeReads; - private List finalizedReads; - - private int contig; - private int alignmentStart; - - public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) { - activeReads = new LinkedList(reads); - finalizedReads = new ArrayList(); - this.contig = contig; - this.alignmentStart = alignmentStart; - } - - public int numActiveReads() { - return activeReads.size(); - } - - public boolean isFinalized() { - return activeReads.size() == 0; - } - - public List getFinalizedReads() { - return finalizedReads; - } - - public void finalizeActiveReadsBeforePosition( int contig, int position ) { - if ( this.contig != contig ) { - finalizeAllActiveReads(); - return; - } - - Iterator iter = activeReads.iterator(); - - while ( iter.hasNext() ) { - T read = iter.next(); - if ( read.getAlignmentEnd() < position ) { - iter.remove(); - finalizedReads.add(read); - } - } - } - - public void finalizeAllActiveReads() { - finalizedReads.addAll(activeReads); - activeReads.clear(); - } - - public void downsampleActiveReads( int numReadsToKeep ) { - if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) { - throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads", - numReadsToKeep, activeReads.size())); - } - - BitSet itemsToKeep = new BitSet(activeReads.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - Iterator iter = activeReads.iterator(); - - while ( 
iter.hasNext() ) { - T read = iter.next(); - - if ( ! itemsToKeep.get(currentIndex) ) { - iter.remove(); - } - - currentIndex++; - } - } - - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java index f78aaf4bf..3ff6f4454 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord; */ public interface ReadsDownsampler extends Downsampler { - /* + /** * Does this downsampler require that reads be fed to it in coordinate order? + * + * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false */ public boolean requiresCoordinateSortOrder(); + + /** + * Tell this downsampler that no more reads located before the provided read (according to + * the sort order of the read stream) will be fed to it. + * + * Allows position-aware downsamplers to finalize pending reads earlier than they would + * otherwise be able to, particularly when doing per-sample downsampling and reads for + * certain samples are sparser than average. 
+ * + * @param read the downsampler will assume that no reads located before this read will ever + * be submitted to it in the future + */ + public void signalNoMoreReadsBefore( T read ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java new file mode 100644 index 000000000..2fa32497b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular + * downsampler, all sharing the same construction parameters. 
+ * + * @author David Roazen + */ +public interface ReadsDownsamplerFactory { + public ReadsDownsampler newInstance(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index cb40c7042..bab4734c4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -48,6 +48,14 @@ public class ReservoirDownsampler implements ReadsDownsampl private int totalReadsSeen; + private int numDiscardedItems; + + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ public ReservoirDownsampler ( int targetSampleSize ) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); @@ -55,6 +63,7 @@ public class ReservoirDownsampler implements ReadsDownsampl this.targetSampleSize = targetSampleSize; clear(); + reset(); } public void submit ( T newRead ) { @@ -68,6 +77,7 @@ public class ReservoirDownsampler implements ReadsDownsampl if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } + numDiscardedItems++; } } @@ -77,11 +87,12 @@ public class ReservoirDownsampler implements ReadsDownsampl } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return reservoir.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = reservoir; clear(); return downsampledItems; @@ -91,16 +102,36 @@ public class ReservoirDownsampler implements ReadsDownsampl return false; } + public T peekFinalized() { + return reservoir.isEmpty() ? 
null : reservoir.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } public void clear() { reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; + totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + } + + public void reset() { + numDiscardedItems = 0; } public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java new file mode 100644 index 000000000..040f0c788 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating ReservoirDownsamplers on demand + * + * @author David Roazen + */ +public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetSampleSize; + + public ReservoirDownsamplerFactory( int targetSampleSize ) { + this.targetSampleSize = targetSampleSize; + } + + public ReadsDownsampler newInstance() { + return new ReservoirDownsampler(targetSampleSize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java new file mode 100644 index 000000000..30affc2b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage + * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. + * + * @author David Roazen + */ +public class SimplePositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private boolean positionEstablished; + + private boolean unmappedReadsReached; + + private ArrayList finalizedReads; + + private int numDiscardedItems; + + /** + * Construct a SimplePositionalDownsampler + * + * @param targetCoverage Maximum number of reads that may share any given alignment start position + */ + public SimplePositionalDownsampler( int targetCoverage ) { + this.targetCoverage = targetCoverage; + reservoir = new ReservoirDownsampler(targetCoverage); + finalizedReads = new ArrayList(); + clear(); + reset(); + } + + public void submit( T newRead ) { + updatePositionalState(newRead); + + if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream + finalizedReads.add(newRead); + } + else { + int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + reservoir.submit(newRead); + numDiscardedItems += 
reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return finalizedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return reservoir.hasFinalizedItems(); + } + + public T peekFinalized() { + return finalizedReads.isEmpty() ? null : finalizedReads.get(0); + } + + public T peekPending() { + return reservoir.peekFinalized(); + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + finalizeReservoir(); + } + + public void clear() { + reservoir.clear(); + reservoir.reset(); + finalizedReads.clear(); + positionEstablished = false; + unmappedReadsReached = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + public void signalNoMoreReadsBefore( T read ) { + updatePositionalState(read); + } + + private void updatePositionalState( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + if ( reservoir.hasFinalizedItems() ) { + finalizeReservoir(); + } + + setCurrentPosition(newRead); + + if ( newRead.getReadUnmappedFlag() ) { + unmappedReadsReached = true; + } + } + } + + private void setCurrentPosition( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + positionEstablished = true; + } + + private boolean readIsPastCurrentPosition( T read ) { + return ! positionEstablished || + read.getReferenceIndex() > currentContigIndex || + read.getAlignmentStart() > currentAlignmentStart || + (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); + } + + private void finalizeReservoir() { + finalizedReads.addAll(reservoir.consumeFinalizedItems()); + reservoir.reset(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java new file mode 100644 index 000000000..fcc18b16b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating SimplePositionalDownsamplers on demand + * + * @author David Roazen + */ +public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetCoverage; + + public SimplePositionalDownsamplerFactory( int targetCoverage ) { + this.targetCoverage = targetCoverage; + } + + public ReadsDownsampler newInstance() { + return new SimplePositionalDownsampler(targetCoverage); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index da11d36dd..6c0dc9769 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -81,7 +82,13 @@ public class WindowMaker implements Iterable, I public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + + // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: + this.sourceIterator = 
sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? + new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + : + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java similarity index 88% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java index 835748ff0..c0de06b49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java @@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Iterator; -public class DownsampleIterator implements StingSAMIterator { +public class LegacyDownsampleIterator implements StingSAMIterator { StingSAMIterator it; int cutoff; SAMRecord next; - public DownsampleIterator(StingSAMIterator it, double fraction) { + public LegacyDownsampleIterator(StingSAMIterator it, double fraction) { this.it = it; cutoff = (int)(fraction * 10000); next = getNextRecord(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 64f914064..46e84798a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; 
import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java new file mode 100755 index 000000000..557cbd009 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByStateExperimental extends LocusIterator { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Used to create new GenomeLocs. 
+ */ + private final GenomeLocParser genomeLocParser; + private final ArrayList samples; + private final ReadStateManager readStates; + + protected static class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + + Cigar cigar = null; + int cigarOffset = -1; + CigarElement curElement = null; + int nCigarElements = 0; + + int cigarElementCounter = -1; // how far are we into a single cigarElement + + // The logical model for generating extended events is as follows: the "record state" implements the traversal + // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This + // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the + // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or + // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from + // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended + // events immediately preceding the current reference base). + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigar = read.getCigar(); + nCigarElements = cigar.numCigarElements(); + + //System.out.printf("Creating a SAMRecordState: %s%n", this); + } + + public SAMRecord getRead() { + return read; + } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + */ + public int getReadOffset() { + return readOffset; + } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? 
+ * + * @return + */ + public int getGenomeOffset() { + return genomeOffset; + } + + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } + + public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { + return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); + } + + public CigarOperator getCurrentCigarOperator() { + return curElement.getOperator(); + } + + public String toString() { + return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); + } + + public CigarElement peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); + } + + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + + public CigarOperator stepForwardOnGenome() { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion + + + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { + cigarOffset++; + if (cigarOffset < nCigarElements) { + curElement = cigar.getCigarElement(cigarOffset); + cigarElementCounter = 0; + // next line: guards against cigar elements of length 0; when new cigar element is retrieved, + // we reenter in order to re-check cigarElementCounter against curElement's length + return stepForwardOnGenome(); + } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. 
If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + + // Reads that contain indels model the genomeOffset as the following base in the reference. Because + // we fall into this else block only when indels end the read, increment genomeOffset such that the + // current offset of this read is the next ref base after the end of the indel. This position will + // model a point on the reference somewhere after the end of the read. + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H: // ignore hard clips + case P: // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case I: // insertion w.r.t. the reference + case S: // soft clip + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + // should be the same as N case + genomeOffset++; + done = true; + break; + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + genomeOffset++; + done = true; + break; + case M: + case EQ: + case X: + readOffset++; + genomeOffset++; + done = true; + break; + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + return done ? 
curElement.getOperator() : stepForwardOnGenome(); + } + } + + //final boolean DEBUG = false; + //final boolean DEBUG2 = false && DEBUG; + private ReadProperties readInfo; + private AlignmentContext nextAlignmentContext; + private boolean performLevelingDownsampling; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + + public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + this.readInfo = readInformation; + this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator); + + this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if (this.samples.isEmpty() && samIterator.hasNext()) { + throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + lazyLoadNextAlignmentContext(); + return (nextAlignmentContext != null); + //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + } + + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public AlignmentContext next() { + lazyLoadNextAlignmentContext(); + if (!hasNext()) + throw new NoSuchElementException("LocusIteratorByState: out of elements."); + AlignmentContext currentAlignmentContext = nextAlignmentContext; + nextAlignmentContext = null; + return currentAlignmentContext; + } + + /** + * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method. + * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. + */ + private void lazyLoadNextAlignmentContext() { + while (nextAlignmentContext == null && readStates.hasNext()) { + readStates.collectPendingReads(); + + final GenomeLoc location = getLocation(); + final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? 
+ boolean hasBeenSampled = false; + + for (final String sample : samples) { + final Iterator iterator = readStates.iterator(sample); + final List pile = new ArrayList(readStates.size(sample)); + + int size = 0; // number of elements in this sample's pileup + int nDeletions = 0; // number of deletions in this sample's pileup + int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); + + int nextElementLength = nextElement.getLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (op == CigarOperator.D) { + // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix + if 
(readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); + size++; + nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + else { + if (!filterBaseInRead(read, location.getStart())) { + String insertedBaseString = null; + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); + size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + } + + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); + } + + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); + } + } + + // fast testing of position + private boolean readIsPastCurrentPosition(SAMRecord read) { + if (readStates.isEmpty()) + return false; + else { + SAMRecordState state = readStates.getFirst(); + SAMRecord ourRead = state.getRead(); + 
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + } + } + + /** + * Generic place to put per-base filters appropriate to LocusIteratorByState + * + * @param rec + * @param pos + * @return + */ + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); + } + + private void updateReadStates() { + for (final String sample : samples) { + Iterator it = readStates.iterator(sample); + while (it.hasNext()) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + protected class ReadStateManager { + private final PeekableIterator iterator; + private final SamplePartitioner samplePartitioner; + private final Map readStatesBySample = new HashMap(); + private int totalReadStates = 0; + + public ReadStateManager(Iterator source) { + this.iterator = new PeekableIterator(source); + + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + } + + samplePartitioner = new SamplePartitioner(); + } + + /** + * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented + * for this iterator; if present, total read states will be decremented. + * + * @param sample The sample. + * @return Iterator over the reads associated with that sample. 
+ */ + public Iterator iterator(final String sample) { + return new Iterator() { + private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecordState next() { + return wrappedIterator.next(); + } + + public void remove() { + wrappedIterator.remove(); + } + }; + } + + public boolean isEmpty() { + return totalReadStates == 0; + } + + /** + * Retrieves the total number of reads in the manager across all samples. + * + * @return Total number of reads over all samples. + */ + public int size() { + return totalReadStates; + } + + /** + * Retrieves the total number of reads in the manager in the given sample. + * + * @param sample The sample. + * @return Total number of reads in the given sample. + */ + public int size(final String sample) { + return readStatesBySample.get(sample).size(); + } + + public SAMRecordState getFirst() { + for (final String sample : samples) { + PerSampleReadStateManager reads = readStatesBySample.get(sample); + if (!reads.isEmpty()) + return reads.peek(); + } + return null; + } + + public boolean hasNext() { + return totalReadStates > 0 || iterator.hasNext(); + } + + public void collectPendingReads() { + if (!iterator.hasNext()) + return; + + if (readStates.size() == 0) { + int firstContigIndex = iterator.peek().getReferenceIndex(); + int firstAlignmentStart = iterator.peek().getAlignmentStart(); + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + samplePartitioner.submitRead(iterator.next()); + } + } else { + // Fast fail in the case that the read is past the current position. 
+ if (readIsPastCurrentPosition(iterator.peek())) + return; + + while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { + samplePartitioner.submitRead(iterator.next()); + } + } + + for (final String sample : samples) { + Collection newReads = samplePartitioner.getReadsForSample(sample); + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + addReadsToSample(statesBySample, newReads); + } + + samplePartitioner.reset(); + } + + /** + * Add reads with the given sample name to the given hanger entry. + * + * @param readStates The list of read states to add this collection of reads. + * @param reads Reads to add. Selected reads will be pulled from this source. + */ + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + if (reads.isEmpty()) + return; + + Collection newReadStates = new LinkedList(); + + for (SAMRecord read : reads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + } + + readStates.addStatesAtNextAlignmentStart(newReadStates); + } + + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performLevelingDownsampling ? 
+ new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; + + public void addStatesAtNextAlignmentStart(Collection states) { + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); + totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + } + + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public SAMRecordState peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + public int size() { + return thisSampleReadStates; + } + + public Iterator iterator() { + return new Iterator() { + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; + + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + public SAMRecordState next() { + if ( currentPositionReadStatesIterator == null || ! 
currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } + } + } + + /** + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; + private long readsSeen = 0; + + public SamplePartitioner() { + readsBySample = new HashMap>(); + + for ( String sample : samples ) { + readsBySample.put(sample, new ArrayList()); + } + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).add(read); + readsSeen++; + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public Collection getReadsForSample(String sampleName) { + if ( ! 
readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for ( Collection perSampleReads : readsBySample.values() ) + perSampleReads.clear(); + readsSeen = 0; + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 3ffe95e8b..9578bba56 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -10,13 +10,11 @@ import java.util.Iterator; * Verifies that the incoming stream of reads is correctly sorted */ public class VerifyingSamIterator implements StingSAMIterator { - private GenomeLocParser genomeLocParser; StingSAMIterator it; SAMRecord last = null; boolean checkOrderP = true; - public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) { - this.genomeLocParser = genomeLocParser; + public VerifyingSamIterator(StingSAMIterator it) { this.it = it; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java index d662b0092..de2cd836c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.walkers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import java.lang.annotation.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index c5b043b7a..44b0d74ca 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 32ceff715..0d1997252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 0c096ea73..759ec1cc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -75,6 +75,17 @@ public class MathUtils { } } + /** + * Get a 
random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( int min, int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). @@ -1655,5 +1666,4 @@ public class MathUtils { return result; } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3d986f666..ed6fc46bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public StingSAMIterator getStingSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return 
StingSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java index adf60b16b..0b5fa391d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java @@ -40,8 +40,11 @@ public class ArtificialSAMFileReader extends SAMFileReader { */ private final List reads; + private SAMFileHeader customHeader = null; + /** * Construct an artificial SAM file reader. + * @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser * @param reads Reads to use as backing data source. */ public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... 
reads) { @@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader { this.reads = Arrays.asList(reads); } + /** + * Construct an artificial SAM file reader with the given SAM file header + * + * @param customHeader Header that should be returned by calls to getFileHeader() on this reader + * @param reads Reads to use as backing data source. + */ + public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... reads ) { + super(createEmptyInputStream(),true); + + this.customHeader = customHeader; + this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary()); + this.reads = Arrays.asList(reads); + } + + + @Override + public SAMFileHeader getFileHeader() { + if ( customHeader != null ) { + return customHeader; + } + + return super.getFileHeader(); + } + /** * @{inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index d0211db07..0859957a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -276,6 +276,30 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. 
what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + /** * create an iterator containing the specified read piles * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..a9480692b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. 
+ * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedStingException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedStingException("Read stream parameters must be >= 0"); + } + + if ( 
(numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedStingException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public StingSAMIterator getStingSAMIterator() { + return StingSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + 
} + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..a4d7c5146 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. + * + * Collects various statistics about the stream of reads it's fed, and validates the stream + * by checking whether the collected statistics match the nominal properties of the stream. + * + * Subclasses are expected to override the validate() method in order to check whether an artificial + * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather + * than merely checking whether the stream matches its original properties. 
+ * + * Usage is simple: + * + * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); + * analyzer.analyze(originalOrTransformedStream); + * analyzer.validate(); // override this method if you want to check whether the stream has been transformed + * // in a certain way relative to the original stream + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStreamAnalyzer { + protected ArtificialSingleSampleReadStream originalStream; + protected SAMRecord lastRead; + protected int totalReads; + protected boolean allSamplesMatch; + protected int numContigs; + protected List stacksPerContig; + protected Integer minReadsPerStack; + protected Integer maxReadsPerStack; + protected Integer minDistanceBetweenStacks; + protected Integer maxDistanceBetweenStacks; + protected Integer minReadLength; + protected Integer maxReadLength; + protected int numUnmappedReads; + + protected int currentContigNumStacks; + protected int currentStackNumReads; + + /** + * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will + * serve as the basis for comparison after the analysis is complete. 
+ * + * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream + * that will be fed to the analyzer is based + */ + public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { + this.originalStream = originalStream; + reset(); + } + + /** + * Reset all read stream statistics collected by this analyzer to prepare for a fresh run + */ + public void reset() { + lastRead = null; + totalReads = 0; + allSamplesMatch = true; + numContigs = 0; + stacksPerContig = new ArrayList(); + minReadsPerStack = null; + maxReadsPerStack = null; + minDistanceBetweenStacks = null; + maxDistanceBetweenStacks = null; + minReadLength = null; + maxReadLength = null; + numUnmappedReads = 0; + currentContigNumStacks = 0; + currentStackNumReads = 0; + } + + /** + * Collect statistics on the stream of reads passed in + * + * @param stream the stream of reads to analyze + */ + public void analyze( Iterable stream ) { + for ( SAMRecord read : stream ) { + update(read); + } + finalizeStats(); + } + + /** + * Validate the stream by checking whether our collected statistics match the properties of the + * original stream. Throws a ReviewedStingException if the stream is invalid. + * + * Override this method if you want to check whether the stream has been transformed in some + * way relative to the original stream. + */ + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads"); + } + if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { + throw new ReviewedStingException("stack had more than the maximum number of reads"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < 
originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } + + public void update( SAMRecord read ) { + if ( read.getReadUnmappedFlag() ) { + numUnmappedReads++; + + if ( numUnmappedReads == 1 && lastRead != null ) { + processContigChange(); + numContigs--; + } + } + else if ( lastRead == null ) { + numContigs = 1; + currentContigNumStacks = 1; + currentStackNumReads = 1; + } + else if ( ! 
read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { + processContigChange(); + } + else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { + processStackChangeWithinContig(read); + } + else { + currentStackNumReads++; + } + + updateReadLength(read.getReadLength()); + allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); + totalReads++; + + lastRead = read; + } + + + private void processContigChange() { + numContigs++; + + stacksPerContig.add(currentContigNumStacks); + currentContigNumStacks = 1; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + } + + private void processStackChangeWithinContig( SAMRecord read ) { + currentContigNumStacks++; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + + updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); + } + + private void updateReadsPerStack( int stackReadCount ) { + if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { + minReadsPerStack = stackReadCount; + } + if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { + maxReadsPerStack = stackReadCount; + } + } + + private void updateDistanceBetweenStacks( int stackDistance ) { + if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { + minDistanceBetweenStacks = stackDistance; + } + if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { + maxDistanceBetweenStacks = stackDistance; + } + } + + private void updateReadLength( int readLength ) { + if ( minReadLength == null || readLength < minReadLength ) { + minReadLength = readLength; + } + if ( maxReadLength == null || readLength > maxReadLength ) { + maxReadLength = readLength; + } + } + + private boolean readHasCorrectSample( SAMRecord read ) { + return originalStream.getReadGroupID().equals(read.getAttribute("RG")); + } + + public void finalizeStats() { + if ( lastRead != null && ! 
lastRead.getReadUnmappedFlag() ) { + stacksPerContig.add(currentContigNumStacks); + updateReadsPerStack(currentStackNumReads); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 5aeb741ec..d2bfabacf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -29,7 +29,7 @@ import net.sf.picard.filter.FilteringIterator; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.Collections; @@ -97,7 +98,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { }, PER_SAMPLE { @Override - DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); } + DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); } }; abstract DownsamplingMethod create(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 730b3f410..9df849940 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -25,36 +25,40 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.testng.Assert; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import 
java.util.List; import static org.testng.Assert.*; /** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 *

    * Class SAMDataSourceUnitTest *

    @@ -66,6 +70,161 @@ public class SAMDataSourceUnitTest extends BaseTest { private IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; + + /*********************************** + * Tests for the fillShard() method + ***********************************/ + + /** + * Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places, + * such as within an alignment start position + */ + private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + + private SAMFileHeader header; + + public SAMDataSourceFillShardBoundaryTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(SAMDataSourceFillShardBoundaryTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(createTestBAM()), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + SAMFileReader.ValidationStringency.SILENT, + null, + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Assert.assertTrue(dataSource.usingExpandedShards()); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + SAMRecord readAtEndOfLastShard 
= null; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + } + + private SAMReaderID createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + File testBAMFile; + try { + testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + testBAMFile.deleteOnExit(); + } + catch ( IOException e ) { + throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. 
%s", this, e.getMessage())); + } + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + return new SAMReaderID(testBAMFile, new Tags()); + } + } + + @DataProvider(name = "SAMDataSourceFillShardTestDataProvider") + public Object[][] createSAMDataSourceFillShardBoundaryTests() { + // Take downsampling out of the equation for these tests -- we are only interested in whether the + // shard boundaries occur at the right places in the read stream, and removing downsampling as a + // factor simplifies that task (note that we still need to provide a specific downsampling method with + // experimental downsampling enabled to trigger the shard expansion behavior, for now) + int downsamplingTargetCoverage = ReadShard.MAX_READS * 10; + + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { + new SAMDataSourceFillShardBoundaryTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + + return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class); + } + + // TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out + @Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false) + public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + + // TODO: the legacy 
tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource + /** * This function does the setup of our parser, before each method call. *

    diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java index b0de78b97..b0a8ff065 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -1,73 +1,138 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Collection; +import java.util.Arrays; -public class DownsamplingReadsIteratorUnitTest { +public class DownsamplingReadsIteratorUnitTest extends BaseTest { - @Test - public void testDownsamplingIteratorWithPositionalDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - Collection reads = new ArrayList(); + public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + this.stream = stream; + this.targetCoverage = targetCoverage; - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord 
previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); } - Assert.assertEquals(count, 1000); + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } } - @Test - public void testDownsamplingIteratorNoEffectiveDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); - Collection reads = new ArrayList(); + // 
Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + GenomeAnalysisEngine.resetRandomGenerator(); - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + // brute force testing! + for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } } - Assert.assertEquals(count, 600); + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); } - private ArrayList 
createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 0f4bae555..3bf1096b1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -1,65 +1,157 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; -public class FractionalDownsamplerUnitTest { +public class FractionalDownsamplerUnitTest extends BaseTest { - @Test - public void test100PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(1.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - List downsampledReads = downsampler.consumeDownsampledItems(); + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); - Assert.assertTrue(downsampledReads.size() == 1000); - } + this.fraction = fraction; + this.totalReads = totalReads; - @Test - public void test0PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.0); - 
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + calculateExpectations(); - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.isEmpty()); - } - - @Test - public void test50PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.5); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); - } - - private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { - List reads = new ArrayList(numReads); - - for ( int i = 1; i <= numReads; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); } - return reads; + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = 
expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && 
downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..2717d014c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to 
deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = 
calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + 
Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b9022900b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 
1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory 
downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : 
header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + GenomeAnalysisEngine.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java deleted file mode 100644 index b1d8e45c9..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,357 +0,0 @@ -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested -public class PositionalDownsamplerUnitTest extends BaseTest { - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 
100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeNonOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, 
header, "foo", 0, 301, 100)); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 1000); - Assert.assertTrue(downsampledStackSizes.get(1) == 1000); - Assert.assertTrue(downsampledStackSizes.get(2) == 1000); - } - - /** - * --- - * --- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackAtBeginning() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + 
downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * --- - * --- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackInMiddle() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + 
downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------ - * ------ - * ------- - * ------- - * --- - * --- - */ - @Test - public void testThreeStacksWithShortStackAtEnd() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ---- - * ------- - * ---- - * ------- - * ------- - */ - @Test - public void testThreePartiallyOverlappingStacks() { - SAMFileHeader header = 
ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - - // TODO: need to examine per-base coverage here - } - - @Test - public void testNoDownsamplingRequired() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - 
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 300); - Assert.assertTrue(downsampledStackSizes.get(1) == 300); - Assert.assertTrue(downsampledStackSizes.get(2) == 300); - } - - @Test - public void testGATKSAMRecordSupport() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() == 10); - } - - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { - ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, 
firstLength); - stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); - return stack; - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } -} - diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..9cbd0db8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, 
subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; + +/** + * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying + * that the downsampling was done correctly without changing the stream in unexpected ways. 
+ * + * @author David Roazen + */ +public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { + private int targetCoverage; + + public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { + super(originalStream); + this.targetCoverage = targetCoverage; + } + + /** + * Overridden validate() method that checks for the effects of positional downsampling in addition to checking + * for whether the original properties of the stream not affected by downsampling have been preserved + */ + @Override + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + + // Check for the effects of positional downsampling: + int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); + int stackMaximumAfterDownsampling = targetCoverage; + + if ( minReadsPerStack < stackMinimumAfterDownsampling ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); + } + if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { + throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( 
minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..75d0448c4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..5dc41b4a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + 
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running 
test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int 
numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + 
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + 
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java new file mode 100644 index 000000000..c148bcf84 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -0,0 +1,546 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileHeader; +import 
net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the experimental version of LocusIteratorByState + */ +public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { + private static SAMFileHeader header; + private LocusIteratorByStateExperimental li; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + } + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + 
} + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() { + return; + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads 
= Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + 
+ @Test + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, readAttributes); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. + Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. 
+ */ + @Test + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); + 
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 
1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), 
Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + + int offset = 0; + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + final int flag = params.flags.get(offset); + Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); + Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + + 
Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); + + offset++; + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByStateExperimental.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + 
// Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we 
get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + 
test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java index 3b5d8d6b7..f0d7f83dc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java @@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -48,7 +46,6 @@ import java.util.List; */ public class VerifyingSamIteratorUnitTest { private SAMFileHeader samFileHeader; - private GenomeLocParser genomeLocParser; @BeforeClass public void init() { @@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest { samFileHeader = new SAMFileHeader(); samFileHeader.setSequenceDictionary(sequenceDictionary); - - genomeLocParser = new GenomeLocParser(sequenceDictionary); } @Test @@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 
position"); @@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest { read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); diff --git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java similarity index 99% rename from 
public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 0f19e2f90..5b052454a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -17,7 +17,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class ReservoirDownsamplerUnitTest { +public class LegacyReservoirDownsamplerUnitTest { private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..74626d031 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,161 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.sting.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d 
distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + GenomeAnalysisEngine.resetRandomGenerator(); + + // brute force testing! 
+ for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + 
header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedStingException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + + logger.warn("Running test: " + 
testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} From 576c7280d9b0ebc9b6f73e89cc394cb7fde23623 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Sep 2012 22:03:18 -0400 Subject: [PATCH 146/161] Extensions to the ErrorThrowing framework for testing purposes --- .../sting/gatk/CommandLineGATK.java | 20 ++++----- .../sting/gatk/walkers/qc/ErrorThrowing.java | 44 +++++++++++++++---- .../sting/utils/exceptions/UserException.java | 6 +++ 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 312d31727..ce57d1a7a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -112,31 +112,31 @@ public class CommandLineGATK extends CommandLineExecutable { } } - protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; private static void checkForMaskedUserErrors(final Throwable t) { final String message = t.getMessage(); if ( message == null ) return; // we know what to do about the common "Too many open files" error - if ( message.indexOf("Too many open files") != -1 ) + if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // 
malformed BAM looks like a SAM file - if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 || - message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || + message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing - if ( message.indexOf("Unable to close index for") != -1 ) + if ( message.contains("Unable to close index for") ) exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); // disk is full - if ( message.indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getMessage())); - if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getCause().getMessage())); + if ( message.contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java index a3df3bc13..12423595b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -45,20 +46,23 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) public String 
exceptionToThrow; + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + // // Template code to allow us to build the walker, doesn't actually do anything // @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } + if ( failMethod == FailMethod.MAP ) + fail(); + return 0; } @Override @@ -68,10 +72,32 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Override public Integer reduce(Integer value, Integer sum) { + if ( failMethod == FailMethod.REDUCE ) + fail(); return value + sum; } public Integer treeReduce(final Integer lhs, final Integer rhs) { + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); return lhs + rhs; } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace") 
) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 47a2f2f1d..faafc611a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -141,6 +141,12 @@ public class UserException extends ReviewedStingException { } } + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); From 9d12935986c4ded5e60274c5d13a2383678ef0e4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 6 Sep 2012 14:33:31 -0400 Subject: [PATCH 147/161] Intermediate commit for new hyper parallel NanoScheduler -- There's a logic bug now but I'll go to squash it... 
--- .../utils/nanoScheduler/NanoScheduler.java | 263 ++++++++++++------ .../utils/threading/NamedThreadFactory.java | 26 ++ .../nanoScheduler/NanoSchedulerUnitTest.java | 6 + 3 files changed, 207 insertions(+), 88 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 24db0f7dc..fe8731d3b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -5,13 +5,11 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.threading.NamedThreadFactory; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Queue; import java.util.concurrent.*; /** @@ -52,7 +50,9 @@ public class NanoScheduler { final int bufferSize; final int nThreads; + final ExecutorService inputExecutor; + final ExecutorService reduceExecutor; final ExecutorService mapExecutor; boolean shutdown = false; boolean debug = false; @@ -77,8 +77,14 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.mapExecutor = nThreads == 1 ? 
null : Executors.newFixedThreadPool(nThreads-1); - this.inputExecutor = Executors.newSingleThreadExecutor(); + + if ( nThreads == 1 ) { + this.mapExecutor = this.inputExecutor = this.reduceExecutor = null; + } else { + this.mapExecutor = Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); + } // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -110,11 +116,9 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - if ( mapExecutor != null ) { - final List remaining = mapExecutor.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new IllegalStateException("Remaining tasks found in the mapExecutor, unexpected behavior!"); - } + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); shutdown = true; if (TIME_CALLS) { @@ -125,6 +129,31 @@ public class NanoScheduler { } } + /** + * Helper function to cleanly shutdown an execution service, checking that the execution + * state is clean when it's done. + * + * @param name a string name for error messages for the executorService we are shutting down + * @param executorService the executorService to shut down + */ + private void shutdownExecutor(final String name, final ExecutorService executorService) { + if ( executorService != null ) { + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); + + final List remaining = executorService.shutdownNow(); + if ( ! 
remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); + } + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. Should be human readable + * @param timer the timer whose elapsed time we will display + */ + @Requires({"label != null", "timer != null"}) private void printTimerInfo(final String label, final SimpleTimer timer) { final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); @@ -140,16 +169,30 @@ public class NanoScheduler { return shutdown; } + /** + * @return are we displaying verbose debugging information about the scheduling? + */ public boolean isDebug() { return debug; } + /** + * Helper function to display a String.formatted message if we are doing verbose debugging + * + * @param format the format argument suitable for String.format + * @param args the arguments for String.format + */ + @Requires("format != null") private void debugPrint(final String format, Object ... args) { if ( isDebug() ) logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); } - + /** + * Turn on/off verbose debugging + * + * @param debug true if we want verbose debugging + */ public void setDebug(boolean debug) { this.debug = debug; } @@ -179,6 +222,9 @@ public class NanoScheduler { * It is safe to call this function repeatedly on a single nanoScheduler, at least until the * shutdown method is called. * + * Note that this function goes through a single threaded fast path if the number of threads + * is 1. 
+ * * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over * @param map the map function from input type -> map type, will be applied in parallel to each input * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results @@ -207,9 +253,11 @@ public class NanoScheduler { } /** - * Simple efficient reference implementation for single threaded execution + * Simple efficient reference implementation for single threaded execution. + * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeSingleThreaded(final Iterator inputReader, final NanoSchedulerMapFunction map, final ReduceType initialValue, @@ -249,88 +297,111 @@ public class NanoScheduler { * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeMultiThreaded(final Iterator inputReader, final NanoSchedulerMapFunction map, final ReduceType initialValue, final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); - ReduceType sum = initialValue; - boolean done = false; + // a completion service that tracks when jobs complete, so we can wait in this thread + // until all of the map jobs are completed, without having to shut down the executor itself + final ExecutorCompletionService mapJobCompletionService = + new ExecutorCompletionService(mapExecutor); + + // a blocking queue that limits the number of input datum to the requested buffer size final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + // a priority queue that stores up to bufferSize * MAP_QUEUE_SCALE_FACTOR elements + // produced by completed map jobs. + final PriorityBlockingQueue mapResultQueue = new PriorityBlockingQueue(bufferSize*100); + + // TODO -- the logic of this blocking queue is wrong! 
We need to wait for map jobs in order, not just + // -- in the order in which they are produced + + // TODO -- map executor must have fixed size map jobs queue + inputExecutor.submit(new InputProducer(inputReader, inputQueue)); + final Future reduceResult = reduceExecutor.submit(new ReducerThread(reduce, initialValue, mapResultQueue)); - while ( ! done ) { - try { - final Pair, Boolean> readResults = readInputs(inputQueue); - final List inputs = readResults.getFirst(); - done = readResults.getSecond(); + try { + int numJobs = 0; + while ( true ) { + // block on input + final InputDatum inputEnqueueWrapped = inputQueue.take(); - if ( ! inputs.isEmpty() ) { - // send jobs for map - final Queue> mapQueue = submitMapJobs(map, mapExecutor, inputs); + if ( ! inputEnqueueWrapped.isLast() ) { + // get the object itself + final InputType input = inputEnqueueWrapped.datum; + + // the next map call has id + 1 + numJobs++; + + // send job for map via the completion service + final CallableMap doMap = new CallableMap(map, numJobs, input, mapResultQueue); + mapJobCompletionService.submit(doMap, numJobs); - // send off the reduce job, and block until we get at least one reduce result - sum = reduceSerial(reduce, mapQueue, sum); debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + if ( progressFunction != null ) // TODO -- don't cycle so often + progressFunction.progress(input); } else { - // we must be done - if ( ! 
done ) throw new IllegalStateException("Inputs empty but not done"); + waitForLastJob(mapJobCompletionService, numJobs); + mapResultQueue.add(new MapResult()); + return reduceResult.get(); // wait for our result of reduce } - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } catch (ExecutionException ex) { - throw new ReviewedStingException("got execution exception", ex); } + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); } - - return sum; - } - - @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceSerial(final NanoSchedulerReduceFunction reduce, - final Queue> mapQueue, - final ReduceType initSum) - throws InterruptedException, ExecutionException { - ReduceType sum = initSum; - - // while mapQueue has something in it to reduce - for ( final Future future : mapQueue ) { - final MapType value = future.get(); // block until we get the values for this task - - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(value, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - } - - return sum; } /** - * Read up to inputBufferSize elements from inputReader - * - * @return a queue of input read in, containing one or more values of InputType read in + * Helper routine that will wait until the last map job finishes running + * by taking numJob values from the executor completion service, using + * the blocking take() call. 
*/ - @Requires("inputReader != null") - @Ensures("result != null") - private Pair, Boolean> readInputs(final BlockingQueue inputReader) throws InterruptedException { - int n = 0; - final List inputs = new LinkedList(); - boolean done = false; + private void waitForLastJob(final ExecutorCompletionService mapJobCompletionService, + final int numJobs ) throws InterruptedException { + for ( int i = 0; i < numJobs; i++ ) + mapJobCompletionService.take(); + } - while ( ! done && n < getBufferSize() ) { - final InputDatum input = inputReader.take(); - done = input.isLast(); - if ( ! done ) { - inputs.add(input.datum); - n++; - } + private class ReducerThread implements Callable { + final NanoSchedulerReduceFunction reduce; + ReduceType sum; + final PriorityBlockingQueue mapResultQueue; + + public ReducerThread(final NanoSchedulerReduceFunction reduce, + final ReduceType sum, + final PriorityBlockingQueue mapResultQueue) { + this.reduce = reduce; + this.sum = sum; + this.mapResultQueue = mapResultQueue; } - return new Pair, Boolean>(inputs, done); + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take(); + //System.out.println("Reduce of map result " + result.id + " with sum " + sum); + if ( result.isLast() ) { + //System.out.println("Saw last! 
" + result.id); + return sum; + } + else { + if ( TIME_CALLS ) reduceTimer.restart(); + sum = reduce.apply(result.datum, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + } + } + } catch (InterruptedException ex) { + //System.out.println("Interrupted"); + throw new ReviewedStingException("got execution exception", ex); + } + } } private class InputProducer implements Runnable { @@ -359,16 +430,16 @@ public class NanoScheduler { } } - private class InputDatum { + private class BlockingDatum { final boolean isLast; - final InputType datum; + final T datum; - private InputDatum(final InputType datum) { + private BlockingDatum(final T datum) { isLast = false; this.datum = datum; } - private InputDatum() { + private BlockingDatum() { isLast = true; this.datum = null; } @@ -378,40 +449,56 @@ public class NanoScheduler { } } - @Requires({"map != null", "! inputs.isEmpty()"}) - private Queue> submitMapJobs(final NanoSchedulerMapFunction map, - final ExecutorService executor, - final List inputs) { - final Queue> mapQueue = new LinkedList>(); - for ( final InputType input : inputs ) { - final CallableMap doMap = new CallableMap(map, input); - final Future future = executor.submit(doMap); - mapQueue.add(future); + private class InputDatum extends BlockingDatum { + private InputDatum(InputType datum) { super(datum); } + private InputDatum() { } + } + + private class MapResult extends BlockingDatum implements Comparable { + final Integer id; + + private MapResult(MapType datum, Integer id) { + super(datum); + this.id = id; } - return mapQueue; + private MapResult() { + this.id = Integer.MAX_VALUE; + } + + @Override + public int compareTo(MapResult o) { + return id.compareTo(o.id); + } } /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable { + private class CallableMap implements Runnable { + final int id; final InputType input; final NanoSchedulerMapFunction map; + final PriorityBlockingQueue 
mapResultQueue; @Requires({"map != null"}) - private CallableMap(final NanoSchedulerMapFunction map, final InputType inputs) { - this.input = inputs; + private CallableMap(final NanoSchedulerMapFunction map, + final int id, + final InputType input, + final PriorityBlockingQueue mapResultQueue) { + this.id = id; + this.input = input; this.map = map; + this.mapResultQueue = mapResultQueue; } - @Override public MapType call() throws Exception { + @Override public void run() { if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS ) mapTimer.stop(); - return result; + mapResultQueue.add(new MapResult(result, id)); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java new file mode 100644 index 000000000..b25375b87 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.utils.threading; + +import java.util.concurrent.ThreadFactory; + +/** + * Thread factor that produces threads with a given name pattern + * + * User: depristo + * Date: 9/5/12 + * Time: 9:22 PM + * + */ +public class NamedThreadFactory implements ThreadFactory { + static int id = 0; + final String format; + + public NamedThreadFactory(String format) { + this.format = format; + String.format(format, id); // test the name + } + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, String.format(format, id++)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index ddfc3cecd..21ac6dcec 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; +import org.apache.log4j.BasicConfigurator; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -165,6 +166,10 @@ public class NanoSchedulerUnitTest extends BaseTest { } public static void main(String [ ] args) { + org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + BasicConfigurator.configure(); + logger.setLevel(org.apache.log4j.Level.DEBUG); + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); @@ -172,5 +177,6 @@ public class NanoSchedulerUnitTest extends BaseTest { final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + nanoScheduler.shutdown(); } } From c5038849581c97b8dfc0bfd35723553ec1ad20c9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 08:57:35 -0400 Subject: [PATCH 148/161] GSA-515 Nanoscheduler GSA-551 / Optimize nanoScheduling performance of UnifiedGenotyper -- I've rewritten the entire NS framework to use a producer / consumer model for input -> map and from map -> reduce. This is allowing us to scale reasonably efficiently up to 4 threads (see figure). Future work on the nano scheduler will be itemized in a separate JIRA entry. -- Restructured the NS code for clarity. Docs everywhere. 
-- This is considered version 1.0 --- .../gatk/traversals/TraverseLociNano.java | 14 +- .../gatk/traversals/TraverseReadsNano.java | 10 +- .../nanoScheduler/BlockingQueueValue.java | 82 ++++++ .../utils/nanoScheduler/FutureValue.java | 45 +++ .../utils/nanoScheduler/InputProducer.java | 62 +++++ .../sting/utils/nanoScheduler/MapResult.java | 36 +++ ...lerMapFunction.java => NSMapFunction.java} | 2 +- ...sFunction.java => NSProgressFunction.java} | 2 +- ...uceFunction.java => NSReduceFunction.java} | 2 +- .../utils/nanoScheduler/NanoScheduler.java | 262 +++++------------- .../utils/nanoScheduler/ReducerThread.java | 64 +++++ .../nanoScheduler/NanoSchedulerUnitTest.java | 8 +- 12 files changed, 383 insertions(+), 206 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerMapFunction.java => NSMapFunction.java} (84%) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerProgressFunction.java => NSProgressFunction.java} (81%) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerReduceFunction.java => NSReduceFunction.java} (87%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 73b73c002..e4e2254d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -8,10 +8,10 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerProgressFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import java.util.Iterator; @@ -153,7 +153,7 @@ public class TraverseLociNano extends TraverseLociBase { * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseLociMap implements NanoSchedulerMapFunction { + private class TraverseLociMap implements NSMapFunction { final LocusWalker walker; private TraverseLociMap(LocusWalker walker) { @@ -174,11 +174,11 @@ public class TraverseLociNano extends TraverseLociBase { } /** - * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseLociReduce implements NanoSchedulerReduceFunction { + private class TraverseLociReduce implements NSReduceFunction { final LocusWalker walker; private TraverseLociReduce(LocusWalker walker) { @@ -195,7 +195,7 @@ public class TraverseLociNano extends TraverseLociBase { } } - private class TraverseLociProgress implements 
NanoSchedulerProgressFunction { + private class TraverseLociProgress implements NSProgressFunction { @Override public void progress(MapData lastProcessedMap) { if (lastProcessedMap.alignmentContext != null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 5679747e1..b3a0a1390 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -35,9 +35,9 @@ import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.LinkedList; @@ -191,7 +191,7 @@ public class TraverseReadsNano extends TraversalEngine, * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseReadsMap implements NanoSchedulerMapFunction { + private class TraverseReadsMap implements NSMapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -211,11 +211,11 @@ public class TraverseReadsNano extends TraversalEngine, } /** - * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to 
each map result, when applicable */ - private class TraverseReadsReduce implements NanoSchedulerReduceFunction { + private class TraverseReadsReduce implements NSReduceFunction { final ReadWalker walker; private TraverseReadsReduce(ReadWalker walker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java new file mode 100644 index 000000000..2daa6c9eb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Invariant; + +/** + * Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object + * + * The only way to tell in a consumer thread that a blocking queue has no more data ever + * coming down the pipe is to pass in a "poison" or EOF object. This class provides + * a generic capacity for that... + * + * The use case looks like this: + * + * BlockingQueue q + * producer: + * while ( x has items ) + * q.put(new BlockingQueueValue(x)) + * q.put(new BlockingQueueValue()) + * + * Consumer: + * while ( true ) + * value = q.take() + * if ( value.isLast() ) + * break + * else + * do something useful with value + * + * + * User: depristo + * Date: 9/6/12 + * Time: 3:08 PM + */ +@Invariant("! 
isLast || value == null") +class BlockingQueueValue { + /** + * True if this is the EOF marker object + */ + final private boolean isLast; + + /** + * Our value, if we aren't the EOF marker + */ + final private T value; + + /** + * Create a new BlockingQueueValue containing a real value, where last is false + * @param value + */ + BlockingQueueValue(final T value) { + isLast = false; + this.value = value; + } + + /** + * Create a new BlockingQueueValue that is the last item + */ + BlockingQueueValue() { + isLast = true; + this.value = null; + } + + /** + * Is this the EOF marker? + * + * @return true if so, else false + */ + public boolean isLast() { + return isLast; + } + + /** + * Get the value held by this BlockingQueueValue + * + * @return the value + * @throws IllegalStateException if this is the last item + */ + public T getValue() { + if ( isLast() ) + throw new IllegalStateException("Cannot get value for last object"); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java new file mode 100644 index 000000000..9508a15aa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +/** + * Create a future that simply returns a given value + * + * The only standard way to create a future in java is via the ExecutorService interface. + * If you have a data structure holding futures of value T, and you want to add a + * value to it for some reason (to add a EOF marker, for instance) you can use this + * class to create a dummy Future that simply returns a value. 
+ * + * @author depristo + * @since 09/12 + */ +class FutureValue implements Future { + final V value; + + FutureValue(final V value) { + this.value = value; + } + + @Override public boolean cancel(boolean mayInterruptIfRunning) { + return true; + } + + @Override public boolean isCancelled() { + return false; + } + + @Override public boolean isDone() { + return true; + } + + @Override public V get() throws InterruptedException, ExecutionException { + return value; + } + + @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { + return get(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java new file mode 100644 index 000000000..29dddbc49 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; +import java.util.concurrent.BlockingQueue; + +/** + * Producer Thread that reads input values from an inputReads and puts them into a BlockingQueue + */ +class InputProducer implements Runnable { + /** + * The iterator we are using to get data from + */ + final Iterator inputReader; + + /** + * Our timer (may be null) that we use to track our input costs + */ + final SimpleTimer inputTimer; + + /** + * Where we put our input values for consumption + */ + final BlockingQueue outputQueue; + + public InputProducer(final Iterator inputReader, + final SimpleTimer inputTimer, + final BlockingQueue outputQueue) { + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); + + this.inputReader 
= inputReader; + this.inputTimer = inputTimer; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( inputTimer != null ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( inputTimer != null ) inputTimer.stop(); + outputQueue.put(new InputValue(input)); + } + + // add the EOF object so our consumer knows we are done in all inputs + outputQueue.put(new InputValue()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + /** + * Helper class that contains a read value suitable for EOF marking in a BlockingQueue + */ + class InputValue extends BlockingQueueValue { + private InputValue(InputType datum) { super(datum); } + private InputValue() { } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..3cc6fa786 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Holds the results of a map job suitable for producer/consumer threading + * via a BlockingQueue + */ +class MapResult extends BlockingQueueValue { + final int jobID; + + /** + * Create a new MapResult with value datum and jod jobID ID + * + * @param datum the value produced by the map job + * @param jobID the id of the map job (for correctness testing) + */ + MapResult(final MapType datum, final int jobID) { + super(datum); + this.jobID = jobID; + if ( jobID < 0 ) throw new IllegalArgumentException("JobID must be >= 0"); + } + + /** + * Create the EOF marker version of MapResult + */ + MapResult() { + super(); + this.jobID = Integer.MAX_VALUE; + } + + /** + * @return the job ID of the map job that produced this MapResult + */ + public int getJobID() { + return jobID; + } +} diff --git 
a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java similarity index 84% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java index ddf4421d2..cc5335051 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface NanoSchedulerMapFunction { +public interface NSMapFunction { /** * Return function on input, returning a value of ResultType * @param input diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java similarity index 81% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java index 8631196a3..8b12c62c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -7,6 +7,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Time: 2:10 PM * To change this template use File | Settings | File Templates. 
*/ -public interface NanoSchedulerProgressFunction { +public interface NSProgressFunction { public void progress(final InputType lastMapInput); } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java index 7e58eeaf9..879a33a1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface NanoSchedulerReduceFunction { +public interface NSReduceFunction { /** * Combine one with sum into a new ReduceType * @param one the result of a map call on an input element diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index fe8731d3b..664fb7b9b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -17,12 +17,12 @@ import java.util.concurrent.*; * * The overall framework works like this * - * nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads) + * nano <- new Nanoschedule(inputBufferSize, numberOfMapElementsToProcessTogether, nThreads) * List[Input] outerData : outerDataLoop ) * result = nano.execute(outerData.iterator(), map, reduce) * - * bufferSize determines how many elements from the input stream are read in one go by the - * nanoscheduler. 
The scheduler may hold up to bufferSize in memory at one time, as well + * inputBufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well * as up to inputBufferSize map results as well. * * numberOfMapElementsToProcessTogether determines how many input elements are processed @@ -48,40 +48,45 @@ public class NanoScheduler { private final static boolean LOG_MAP_TIMES = false; private final static boolean TIME_CALLS = true; - final int bufferSize; - final int nThreads; + private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; + final int inputBufferSize; + final int mapBufferSize; + final int nThreads; final ExecutorService inputExecutor; final ExecutorService reduceExecutor; - final ExecutorService mapExecutor; + final ThreadPoolExecutor mapExecutor; + boolean shutdown = false; boolean debug = false; + private NSProgressFunction progressFunction = null; - private NanoSchedulerProgressFunction progressFunction = null; - - final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); - final SimpleTimer inputTimer = new SimpleTimer("input"); - final SimpleTimer mapTimer = new SimpleTimer("map"); - final SimpleTimer reduceTimer = new SimpleTimer("reduce"); + final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; + final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; + final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; + final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null; /** - * Create a new nanoschedule with the desire characteristics requested by the argument + * Create a new nanoscheduler with the desire characteristics requested by the argument * - * @param bufferSize the number of input elements to read in each scheduling cycle. 
- * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute + * @param inputBufferSize the number of input elements to read in each scheduling cycle. + * @param nThreads the number of threads to use to get work done, in addition to the + * thread calling execute */ - public NanoScheduler(final int bufferSize, - final int nThreads) { - if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); + public NanoScheduler(final int inputBufferSize, final int nThreads) { + if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - this.bufferSize = bufferSize; + this.inputBufferSize = inputBufferSize; + this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR; this.nThreads = nThreads; if ( nThreads == 1 ) { - this.mapExecutor = this.inputExecutor = this.reduceExecutor = null; + this.mapExecutor = null; + this.inputExecutor = this.reduceExecutor = null; } else { - this.mapExecutor = Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); } @@ -104,8 +109,8 @@ public class NanoScheduler { * @return */ @Ensures("result > 0") - public int getBufferSize() { - return bufferSize; + public int getInputBufferSize() { + return inputBufferSize; } /** @@ -116,9 +121,11 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - 
shutdownExecutor("inputExecutor", inputExecutor); - shutdownExecutor("mapExecutor", mapExecutor); - shutdownExecutor("reduceExecutor", reduceExecutor); + if ( nThreads > 1 ) { + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); + } shutdown = true; if (TIME_CALLS) { @@ -136,15 +143,15 @@ public class NanoScheduler { * @param name a string name for error messages for the executorService we are shutting down * @param executorService the executorService to shut down */ + @Requires({"name != null", "executorService != null"}) + @Ensures("executorService.isShutdown()") private void shutdownExecutor(final String name, final ExecutorService executorService) { - if ( executorService != null ) { - if ( executorService.isShutdown() || executorService.isTerminated() ) - throw new IllegalStateException("Executor service " + name + " is already shut down!"); + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); - final List remaining = executorService.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); - } + final List remaining = executorService.shutdownNow(); + if ( ! 
remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); } /** @@ -204,7 +211,7 @@ public class NanoScheduler { * * @param progressFunction a progress function to call, or null if you don't want any progress callback */ - public void setProgressFunction(final NanoSchedulerProgressFunction progressFunction) { + public void setProgressFunction(final NSProgressFunction progressFunction) { this.progressFunction = progressFunction; } @@ -231,9 +238,9 @@ public class NanoScheduler { * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); @@ -259,9 +266,9 @@ public class NanoScheduler { */ @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeSingleThreaded(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { ReduceType sum = initialValue; int i = 0; @@ -278,7 +285,7 @@ public class NanoScheduler { if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); if ( TIME_CALLS ) mapTimer.stop(); - if ( i++ % bufferSize == 0 && progressFunction != null ) + if ( i++ % inputBufferSize == 0 && progressFunction != null ) progressFunction.progress(input); // reduce @@ -299,55 +306,53 @@ public class NanoScheduler { */ @Requires({"inputReader != null", "map != null", "reduce != null"}) 
private ReduceType executeMultiThreaded(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { debugPrint("Executing nanoScheduler"); - // a completion service that tracks when jobs complete, so we can wait in this thread - // until all of the map jobs are completed, without having to shut down the executor itself - final ExecutorCompletionService mapJobCompletionService = - new ExecutorCompletionService(mapExecutor); - // a blocking queue that limits the number of input datum to the requested buffer size - final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + final BlockingQueue.InputValue> inputQueue + = new LinkedBlockingDeque.InputValue>(inputBufferSize); - // a priority queue that stores up to bufferSize * MAP_QUEUE_SCALE_FACTOR elements + // a priority queue that stores up to mapBufferSize elements // produced by completed map jobs. - final PriorityBlockingQueue mapResultQueue = new PriorityBlockingQueue(bufferSize*100); + final BlockingQueue>> mapResultQueue = + new LinkedBlockingDeque>>(mapBufferSize); - // TODO -- the logic of this blocking queue is wrong! 
We need to wait for map jobs in order, not just - // -- in the order in which they are produced + // Start running the input reader thread + inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); - // TODO -- map executor must have fixed size map jobs queue - - inputExecutor.submit(new InputProducer(inputReader, inputQueue)); - final Future reduceResult = reduceExecutor.submit(new ReducerThread(reduce, initialValue, mapResultQueue)); + // Start running the reducer thread + final ReducerThread reducer + = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); + final Future reduceResult = reduceExecutor.submit(reducer); try { int numJobs = 0; + while ( true ) { // block on input - final InputDatum inputEnqueueWrapped = inputQueue.take(); + final InputProducer.InputValue inputEnqueueWrapped = inputQueue.take(); if ( ! inputEnqueueWrapped.isLast() ) { // get the object itself - final InputType input = inputEnqueueWrapped.datum; + final InputType input = inputEnqueueWrapped.getValue(); - // the next map call has id + 1 + // the next map call has jobID + 1 numJobs++; // send job for map via the completion service - final CallableMap doMap = new CallableMap(map, numJobs, input, mapResultQueue); - mapJobCompletionService.submit(doMap, numJobs); + final CallableMap doMap = new CallableMap(map, numJobs, input); + final Future> mapJob = mapExecutor.submit(doMap); + mapResultQueue.put(mapJob); debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) // TODO -- don't cycle so often + if ( numJobs % inputBufferSize == 0 && progressFunction != null ) progressFunction.progress(input); } else { - waitForLastJob(mapJobCompletionService, numJobs); - mapResultQueue.add(new MapResult()); + mapResultQueue.put(new FutureValue>(new MapResult())); return reduceResult.get(); // wait for our result of reduce } } @@ -358,147 +363,30 @@ public class NanoScheduler { } } - /** - * Helper routine that will wait until the last map job 
finishes running - * by taking numJob values from the executor completion service, using - * the blocking take() call. - */ - private void waitForLastJob(final ExecutorCompletionService mapJobCompletionService, - final int numJobs ) throws InterruptedException { - for ( int i = 0; i < numJobs; i++ ) - mapJobCompletionService.take(); - } - - private class ReducerThread implements Callable { - final NanoSchedulerReduceFunction reduce; - ReduceType sum; - final PriorityBlockingQueue mapResultQueue; - - public ReducerThread(final NanoSchedulerReduceFunction reduce, - final ReduceType sum, - final PriorityBlockingQueue mapResultQueue) { - this.reduce = reduce; - this.sum = sum; - this.mapResultQueue = mapResultQueue; - } - - public ReduceType call() { - try { - while ( true ) { - final MapResult result = mapResultQueue.take(); - //System.out.println("Reduce of map result " + result.id + " with sum " + sum); - if ( result.isLast() ) { - //System.out.println("Saw last! " + result.id); - return sum; - } - else { - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(result.datum, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - } - } - } catch (InterruptedException ex) { - //System.out.println("Interrupted"); - throw new ReviewedStingException("got execution exception", ex); - } - } - } - - private class InputProducer implements Runnable { - final Iterator inputReader; - final BlockingQueue outputQueue; - - public InputProducer(final Iterator inputReader, final BlockingQueue outputQueue) { - this.inputReader = inputReader; - this.outputQueue = outputQueue; - } - - public void run() { - try { - while ( inputReader.hasNext() ) { - if ( TIME_CALLS ) inputTimer.restart(); - final InputType input = inputReader.next(); - if ( TIME_CALLS ) inputTimer.stop(); - outputQueue.put(new InputDatum(input)); - } - - // add the EOF object so we know we are done - outputQueue.put(new InputDatum()); - } catch (InterruptedException ex) { - throw new ReviewedStingException("got 
execution exception", ex); - } - } - } - - private class BlockingDatum { - final boolean isLast; - final T datum; - - private BlockingDatum(final T datum) { - isLast = false; - this.datum = datum; - } - - private BlockingDatum() { - isLast = true; - this.datum = null; - } - - public boolean isLast() { - return isLast; - } - } - - - private class InputDatum extends BlockingDatum { - private InputDatum(InputType datum) { super(datum); } - private InputDatum() { } - } - - private class MapResult extends BlockingDatum implements Comparable { - final Integer id; - - private MapResult(MapType datum, Integer id) { - super(datum); - this.id = id; - } - - private MapResult() { - this.id = Integer.MAX_VALUE; - } - - @Override - public int compareTo(MapResult o) { - return id.compareTo(o.id); - } - } - /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Runnable { + private class CallableMap implements Callable> { final int id; final InputType input; - final NanoSchedulerMapFunction map; - final PriorityBlockingQueue mapResultQueue; + final NSMapFunction map; @Requires({"map != null"}) - private CallableMap(final NanoSchedulerMapFunction map, + private CallableMap(final NSMapFunction map, final int id, - final InputType input, - final PriorityBlockingQueue mapResultQueue) { + final InputType input) { this.id = id; this.input = input; this.map = map; - this.mapResultQueue = mapResultQueue; } - @Override public void run() { + @Override + public MapResult call() { if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS ) mapTimer.stop(); - mapResultQueue.add(new MapResult(result, id)); + return new MapResult(result, id); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java new file mode 100644 index 
000000000..bd29799b6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** + * Thread that runs the reduce of the map/reduce. + * + * This thread reads from mapResultsQueue until the poison EOF object arrives. At each + * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the + * queue waits until the mapResultQueue has a value to take. Then, it gets and waits + * until the map result Future has a value. + */ +class ReducerThread implements Callable { + final NSReduceFunction reduce; + final SimpleTimer reduceTimer; + final BlockingQueue>> mapResultQueue; + + ReduceType sum; + int lastJobID = -1; + + public ReducerThread(final NSReduceFunction reduce, + final SimpleTimer reduceTimer, + final ReduceType sum, + final BlockingQueue>> mapResultQueue) { + if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); + + this.reduce = reduce; + this.reduceTimer = reduceTimer; + this.sum = sum; + this.mapResultQueue = mapResultQueue; + } + + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take().get(); + if ( result.isLast() ) { + // we are done, just return sum + return sum; + } + else if ( result.getJobID() < lastJobID ) { + // make sure the map results are coming in order + throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); + } else { + // apply reduce, keeping track of sum + if ( reduceTimer != 
null ) reduceTimer.restart(); + sum = reduce.apply(result.getValue(), sum); + if ( reduceTimer != null ) reduceTimer.stop(); + } + } + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 21ac6dcec..47dcc1d5e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -22,11 +22,11 @@ import java.util.List; public class NanoSchedulerUnitTest extends BaseTest { public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - private static class Map2x implements NanoSchedulerMapFunction { + private static class Map2x implements NSMapFunction { @Override public Integer apply(Integer input) { return input * 2; } } - private static class ReduceSum implements NanoSchedulerReduceFunction { + private static class ReduceSum implements NSReduceFunction { int prevOne = Integer.MIN_VALUE; @Override public Integer apply(Integer one, Integer sum) { @@ -35,7 +35,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - private static class ProgressCallback implements NanoSchedulerProgressFunction { + private static class ProgressCallback implements NSProgressFunction { int callBacks = 0; @Override @@ -120,7 +120,7 @@ public class NanoSchedulerUnitTest extends BaseTest { final ProgressCallback callback = new ProgressCallback(); nanoScheduler.setProgressFunction(callback); - Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); + Assert.assertEquals(nanoScheduler.getInputBufferSize(), test.bufferSize, "inputBufferSize argument"); 
Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); From 8c0e3b1e0cd8c21e473543337c1c8b91fff44f2f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 09:12:47 -0400 Subject: [PATCH 149/161] UnitTests for InputProducer --- .../nanoScheduler/InputProducerUnitTest.java | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java new file mode 100644 index 000000000..0973db8a3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; + +/** + * UnitTests for the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class InputProducerUnitTest extends BaseTest { + @DataProvider(name = "InputProducerTest") + public Object[][] createInputProducerTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + for ( final int queueSize : Arrays.asList(1, 10, 100) ) { + tests.add(new Object[]{ nElements, queueSize }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testSingleThreadedNanoScheduler(final int nElements, final int queueSize) throws InterruptedException { + final List elements = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) elements.add(i); + + final LinkedBlockingDeque.InputValue> readQueue = + new LinkedBlockingDeque.InputValue>(queueSize); + + final InputProducer ip = new InputProducer(elements.iterator(), null, readQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + es.submit(ip); + + int lastValue = -1; + int nRead = 0; + while ( true ) { + final int observedQueueSize = readQueue.size(); + Assert.assertTrue(observedQueueSize <= queueSize, + "Reader is enqueuing more elements " + queueSize + " than allowed " + queueSize); + + final InputProducer.InputValue value = readQueue.take(); + if ( value.isLast() ) { + Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements); + Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!"); + break; + } else { + Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected); + nRead++; + lastValue = value.getValue(); + } + } + } +} From 
bf87de8a252bc566d820cf85cfe7dcc745d8e679 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 09:51:32 -0400 Subject: [PATCH 150/161] UnitTests for ReducerThread and InputProducer -- Uncovered bug in ReducerThread in detecting abnormal case where jobs are coming in out of order --- .../utils/nanoScheduler/ReducerThread.java | 1 + .../nanoScheduler/InputProducerUnitTest.java | 6 +- .../nanoScheduler/ReducerThreadUnitTest.java | 94 +++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java index bd29799b6..506e45453 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -49,6 +49,7 @@ class ReducerThread implements Callable { // make sure the map results are coming in order throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); } else { + lastJobID = result.getJobID(); // apply reduce, keeping track of sum if ( reduceTimer != null ) reduceTimer.restart(); sum = reduce.apply(result.getValue(), sum); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java index 0973db8a3..b3365c13c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -13,7 +13,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingDeque; /** - * UnitTests for the NanoScheduler + * UnitTests for the InputProducer * * 
User: depristo * Date: 8/24/12 @@ -35,7 +35,7 @@ public class InputProducerUnitTest extends BaseTest { } @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) - public void testSingleThreadedNanoScheduler(final int nElements, final int queueSize) throws InterruptedException { + public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException { final List elements = new ArrayList(nElements); for ( int i = 0; i < nElements; i++ ) elements.add(i); @@ -52,7 +52,7 @@ public class InputProducerUnitTest extends BaseTest { while ( true ) { final int observedQueueSize = readQueue.size(); Assert.assertTrue(observedQueueSize <= queueSize, - "Reader is enqueuing more elements " + queueSize + " than allowed " + queueSize); + "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize); final InputProducer.InputValue value = readQueue.take(); if ( value.isLast() ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java new file mode 100644 index 000000000..61d1330bc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.*; + +/** + * UnitTests for the InputProducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ReducerThreadUnitTest extends BaseTest { + @DataProvider(name = "ReducerThreadTest") + public Object[][] createReducerThreadTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + tests.add(new Object[]{ nElements }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testReducerThreadTest(final int nElements) throws Exception { + List values = new ArrayList(nElements); + List jobIDs = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) { + values.add(i); + jobIDs.add(i); + } + + runTests(values, jobIDs); + } + + @Test(enabled = true, timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME, expectedExceptions = ExecutionException.class) + public void testReducerThreadTestByJobOrder() throws Exception { + runTests(Arrays.asList(0, 1, 2), Arrays.asList(1, 3, 2)); + } + + private void runTests( final List mapValues, final List jobIDs) throws Exception { + final LinkedBlockingDeque>> mapResultsQueue = + new LinkedBlockingDeque>>(mapValues.size()+1); + + for ( int i = 0; i < mapValues.size(); i++ ) { + final int value = mapValues.get(i); + final int jobID = jobIDs.get(i); + final MapResult mapResult = new MapResult(value, jobID); + mapResultsQueue.add(new FutureValue>(mapResult)); + } + mapResultsQueue.add(new FutureValue>(new MapResult())); + + final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); + final ReducerThread thread + = new ReducerThread(reduce, null, 0, mapResultsQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + final Future value = es.submit(thread); + value.get(); + + Assert.assertEquals(reduce.nRead, mapValues.size()); + } + + public class ReduceSumTest implements NSReduceFunction { + final LinkedBlockingDeque>> mapResultsQueue; + int nRead = 0; + int lastValue = -1; + + public 
ReduceSumTest(LinkedBlockingDeque>> mapResultsQueue) { + this.mapResultsQueue = mapResultsQueue; + } + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(lastValue < one, "Reduce came in out of order. Prev " + lastValue + " cur " + one); + + Assert.assertTrue(lastValue < one, "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); + nRead++; + lastValue = expected; + + return one + sum; + } + } +} From d62eca5d92bc0761b7824eedc74186cd12e25744 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 10:47:29 -0400 Subject: [PATCH 152/161] Update GATKPerformanceOverTime to measure -nt and -nct --- .../sting/gatk/executive/MicroScheduler.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 46d6b5882..c6ef9acf1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -102,9 +102,15 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. 
*/ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) + if ( threadAllocation.isRunningInParallelMode() ) { + // TODO -- remove me when we fix running NCT within HMS + if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) + throw new UserException("Currently the GATK does not support running CPU threads within data threads, " + + "please specify only one of NT and NCT"); + logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); + } if ( threadAllocation.getNumDataThreads() > 1 ) { if (walker.isReduceByInterval()) From f25bf0f927ea7f36662bfbe0756c6f1c6204581a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 11:03:00 -0400 Subject: [PATCH 153/161] EfficiencyMonitoringThreadFactoryUnitTests thing keeps timing out unnecessary --- .../threading/EfficiencyMonitoringThreadFactoryUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index d8da274ce..7381bebc4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -44,7 +44,7 @@ import java.util.concurrent.TimeUnit; */ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long 
THREAD_TARGET_DURATION_IN_MILLISECOND = 10000; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object(); From 41a8a304a0ffc2b6a6209b8da6b4423d8c91bd22 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 7 Sep 2012 11:27:00 -0400 Subject: [PATCH 154/161] Catch masked OutOfMemory errors as User Errors --- .../src/org/broadinstitute/sting/gatk/CommandLineGATK.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index ce57d1a7a..1b41b85f4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -137,6 +137,10 @@ public class CommandLineGATK extends CommandLineExecutable { exitSystemWithUserError(new UserException.NoSpaceOnDevice()); if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + + // masked out of memory error + if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); } /** From 3dc248a49d705826c40f94b2fdc3aeed38d989da Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 7 Sep 2012 11:41:38 -0400 Subject: [PATCH 155/161] Adding another test --- .../broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java index 12423595b..d3ee4e832 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -94,8 +94,10 @@ 
public class ErrorThrowing extends RodWalker implements TreeRed throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); } else if ( exceptionToThrow.equals("SamError2") ) { throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); - } else if ( exceptionToThrow.equals("NoSpace") ) { + } else if ( exceptionToThrow.equals("NoSpace1") ) { throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else if ( exceptionToThrow.equals("NoSpace2") ) { + throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); } else { throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); } From 3f2a4379af87425c2dcaf2dfa51549154a2ee409 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 6 Sep 2012 15:10:35 -0400 Subject: [PATCH 156/161] Added forum API version stub to base URL for posting GATKDocs This will prevent bugs from occurring when Vanilla make changes to the API as described here: http://vanillaforums.com/blog/api#configuration Based on the bug that broke the website Guide section on 9/6/12, the GATKDocs posting system will probably break in the next release if this is not applied as a bug fix. 
--- .../src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index 1dfc4ecc0..fe5f48a48 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,7 +44,7 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatk.vanillaforums.com/"; + private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { From b1677fc7195abb8c059ac7d0827764cffb2338e2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 7 Sep 2012 14:25:57 -0400 Subject: [PATCH 157/161] Fixed JIRA GSA-520 for Guillermo: when intervals with zero coverage were present, DiagnoseTargets was trying to merge them with the next interval (even if non-overlapping) which would cause problems later on when it checked to make sure that intervals were strictly overlapping. 
--- .../gatk/walkers/diagnostics/targets/DiagnoseTargets.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 112eb278e..cbd3bc950 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -246,6 +246,14 @@ public class DiagnoseTargets extends LocusWalker { */ private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); + + // skip any intervals with no coverage that we have passed + while (interval != null && interval.isBefore(refLocus)) { + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); + } + + // add any intervals that overlap this one while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); intervalListIterator.next(); // discard the interval (we've already added it to the map) From 688fc9fb56741b4351fa319ab3f18dd4ad9d9589 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 9 Sep 2012 10:36:09 -0400 Subject: [PATCH 158/161] Bug fix in HC GenotypingEngine to ensure that all the merged complex events get properly added to the priority list used by VariantContextUtils when combining multiallelic events. 
--- .../haplotypecaller/GenotypingEngine.java | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 9de9b3292..e83cf5d1f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -52,7 +52,11 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } - // This function is the streamlined approach, currently not being used + // WARN + // This function is the streamlined approach, currently not being used by default + // WARN + // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. + // WARN @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, @@ -210,13 +214,9 @@ public class GenotypingEngine { System.out.println( ">> Events = " + h.getEventMap()); } } - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } + final ArrayList priorityList = new ArrayList(); // filled in later, used to merge overlapping events into common reference view - cleanUpSymbolicUnassembledEvents( haplotypes, priorityList ); + cleanUpSymbolicUnassembledEvents( haplotypes ); if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } @@ -236,6 +236,7 
@@ public class GenotypingEngine { final VariantContext vc = eventMap.get(loc); if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { eventsAtThisLoc.add(vc); + priorityList.add(vc.getSource()); } } } else { // we are in GGA mode! @@ -260,6 +261,22 @@ public class GenotypingEngine { // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + // Sanity check the priority list + for( final VariantContext vc : eventsAtThisLoc ) { + if( !priorityList.contains(vc.getSource()) ) { + throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); + } + } + for( final String name : priorityList ) { + boolean found = false; + for( final VariantContext vc : eventsAtThisLoc ) { + if(vc.getSource().equals(name)) { found = true; break; } + } + if( !found ) { + throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. 
Something went wrong in the merging of alleles."); + } + } + // Merge the event to find a common reference representation final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } @@ -299,9 +316,8 @@ public class GenotypingEngine { return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes, final ArrayList priorityList ) { + protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); - final ArrayList stringsToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { if( vc.isSymbolic() ) { @@ -309,7 +325,6 @@ public class GenotypingEngine { for( final VariantContext vc2 : h2.getEventMap().values() ) { if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) { haplotypesToRemove.add(h); - stringsToRemove.add(vc.getSource()); break; } } @@ -318,7 +333,6 @@ public class GenotypingEngine { } } haplotypes.removeAll(haplotypesToRemove); - priorityList.removeAll(stringsToRemove); } protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { From 36913706c0bd2dbdddd119b784a52b310cb37a99 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 9 Sep 2012 13:47:54 -0400 Subject: [PATCH 159/161] Bug fix in HC GenotypingEngine to ensure that all the merged complex events get properly added to the priority list used by VariantContextUtils when combining multiallelic events. 
--- .../gatk/walkers/haplotypecaller/GenotypingEngine.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index e83cf5d1f..192befe67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -188,6 +188,7 @@ public class GenotypingEngine { return returnCalls; } + // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, @@ -214,7 +215,6 @@ public class GenotypingEngine { System.out.println( ">> Events = " + h.getEventMap()); } } - final ArrayList priorityList = new ArrayList(); // filled in later, used to merge overlapping events into common reference view cleanUpSymbolicUnassembledEvents( haplotypes ); if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure @@ -229,7 +229,9 @@ public class GenotypingEngine { // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); + final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view + final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view + if( activeAllelesToGenotype.isEmpty() ) { for( final 
Haplotype h : haplotypes ) { final HashMap eventMap = h.getEventMap(); From d7499e0642519d6e0b56fd74ba684f4de9bbfc91 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 9 Sep 2012 22:17:36 -0400 Subject: [PATCH 160/161] Updating the rank sum test documentation --- .../sting/gatk/walkers/annotator/BaseQualityRankSumTest.java | 2 +- .../sting/gatk/walkers/annotator/ClippingRankSumTest.java | 4 ++++ .../gatk/walkers/annotator/MappingQualityRankSumTest.java | 2 +- .../sting/gatk/walkers/annotator/ReadPosRankSumTest.java | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index dc727fa48..577b1cfdc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -16,7 +16,7 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). - * Note that the base quality rank sum test can not be calculated for homozygous sites. + * Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. 
*/ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index 1fd220f2f..c74f98ca3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -16,6 +16,10 @@ import java.util.*; * Date: 6/28/12 */ +/** + * The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele) + * Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + */ public class ClippingRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("ClippingRankSum"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 6557f3e47..787c9b29b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -17,7 +17,7 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) - * Note that the mapping quality rank sum test can not be calculated for homozygous sites. + * Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. 
*/ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 1ac8ee113..de0ce2ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -20,7 +20,7 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). - * Note that the read position rank sum test can not be calculated for homozygous sites. + * Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. */ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { From ac8a4dfc2d57c4797452d2229bda6ccdcb439763 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 10 Sep 2012 15:04:06 -0400 Subject: [PATCH 161/161] The comprehensive LIBS unit test is now truly comprehensive (or it would be if LIBS wasn't busted). The test can handle a read with any arbitrary legal CIGAR and iterates over the elements/bases in time with the real LIBS, failing if there are any differences. I've left the few hard-coded CIGARs in there for now with a note to move to all possible permutations once we move to fix LIBS (otherwise the tests would fail now). 
--- .../LocusIteratorByStateUnitTest.java | 172 ++++++++++++++---- 1 file changed, 132 insertions(+), 40 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index fbc063ab6..a5ead5665 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; @@ -40,7 +38,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + private LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @@ -262,45 +260,36 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // comprehensive LIBS/PileupElement tests // //////////////////////////////////////////// - private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; - private static final int IS_BEFORE_DELETION_START_FLAG = 2; - private static final int IS_AFTER_DELETED_BASE_FLAG = 4; - private static final int IS_AFTER_DELETION_END_FLAG = 8; - private static final int IS_BEFORE_INSERTION_FLAG = 16; - private static final int IS_AFTER_INSERTION_FLAG = 32; - private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; - private static class 
LIBSTest { final String cigar; final int readLength; - final List offsets; - final List flags; - private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + private LIBSTest(final String cigar, final int readLength) { this.cigar = cigar; this.readLength = readLength; - this.offsets = offsets; - this.flags = flags; } } @DataProvider(name = "LIBSTest") public Object[][] createLIBSTestData() { + + //TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings + return new Object[][]{ - {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, - {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, - {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("1I", 1)}, + {new LIBSTest("10I", 10)}, + {new LIBSTest("2M2I2M", 6)}, + {new LIBSTest("2M2I", 4)}, //TODO -- uncomment these when LIBS is fixed //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, - {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, - {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, - {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + 
//{new LIBSTest("1M2D2M", 3)}, + {new LIBSTest("1S1M", 2)}, + {new LIBSTest("1M1S", 2)}, + {new LIBSTest("1S1M1I", 3)} }; } @@ -315,26 +304,24 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create the iterator by state with the fake reads and fake records li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + final LIBS_position tester = new LIBS_position(read); - int offset = 0; while ( li.hasNext() ) { AlignmentContext alignmentContext = li.next(); ReadBackedPileup p = alignmentContext.getBasePileup(); Assert.assertTrue(p.getNumberOfElements() == 1); PileupElement pe = p.iterator().next(); - final int flag = params.flags.get(offset); - Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); - Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); - Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); - Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + tester.stepForwardOnGenome(); - Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); - - offset++; + Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase); + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); + Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + Assert.assertEquals(pe.getOffset(), 
tester.getCurrentReadOffset()); } } @@ -366,9 +353,7 @@ class FakeCloseableIterator implements CloseableIterator { } @Override - public void close() { - return; - } + public void close() {} @Override public boolean hasNext() { @@ -385,3 +370,110 @@ class FakeCloseableIterator implements CloseableIterator { throw new UnsupportedOperationException("Don't remove!"); } } + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. 
the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar 
cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } +}