From 0cc5c3d799d43886b64239f532e6889381011868 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 19 Dec 2011 15:46:32 -0500 Subject: [PATCH 01/17] General improvements to Queue -- Support for collecting resources info from DRMAA runners -- Disabled the non-standard mem_free argument so that we can actually use our own SGE cluster gsa4 -- NCoresRequest is a testing queue script for this. -- Added two command line arguments: -- multiCoreJerk: don't request multiple cores for jobs with nt > 1. This was the old behavior but it's really not the best way to run parallel jobs. Now with queue if you run nt = 4 the system requests 4 cores on your host. If this flag is thrown, though, it will only request 1 and you'll just use 4, like a jerk -- job_parallel_env: parallel environment named used with SGE to request multicore jobs. Equivalent to -pe job_parallel_env NT for NT > 1 jobs --- .../gatk/GATKExtensionsGenerator.java | 18 +++++++++------ .../sting/queue/QSettings.scala | 7 ++++++ .../queue/engine/drmaa/DrmaaJobRunner.scala | 14 +++++++++++- .../gridengine/GridEngineJobRunner.scala | 20 +++++++++++++++-- .../queue/engine/lsf/Lsf706JobRunner.scala | 22 ++++++++++++++++++- .../queue/function/CommandLineFunction.scala | 10 +++++++++ 6 files changed, 80 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index 67010c4d5..9c40fb976 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -147,7 +147,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { String clpConstructor = String.format("analysisName = \"%s\"%njavaMainClass = \"%s\"%n", clpClassName, clp.getName()); writeClass("org.broadinstitute.sting.queue.function.JavaCommandLineFunction", clpClassName, - false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents); + false, clpConstructor, ArgumentDefinitionField.getArgumentFields(parser,clp), dependents, false); if (clp == CommandLineGATK.class) { for (Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(false).entrySet()) { @@ -169,7 +169,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { } writeClass(GATK_EXTENSIONS_PACKAGE_NAME + "." + clpClassName, walkerName, - isScatter, constructor, argumentFields, dependents); + isScatter, constructor, argumentFields, dependents, true); } catch (Exception e) { throw new ReviewedStingException("Error generating wrappers for walker " + walkerType, e); } @@ -241,8 +241,9 @@ public class GATKExtensionsGenerator extends CommandLineProgram { * @throws IOException If the file cannot be written. */ private void writeClass(String baseClass, String className, boolean isScatter, - String constructor, List argumentFields, Set> dependents) throws IOException { - String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents); + String constructor, List argumentFields, + Set> dependents, boolean isGATKWalker) throws IOException { + String content = getContent(CLASS_TEMPLATE, baseClass, className, constructor, isScatter, "", argumentFields, dependents, isGATKWalker); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -256,7 +257,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private void writeFilter(String className, List argumentFields, Set> dependents) throws IOException { String content = getContent(TRAIT_TEMPLATE, "org.broadinstitute.sting.queue.function.CommandLineFunction", - className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents); + className, "", false, String.format(" + \" -read_filter %s\"", className), argumentFields, dependents, false); writeFile(GATK_EXTENSIONS_PACKAGE_NAME + "." + className, content); } @@ -350,7 +351,8 @@ public class GATKExtensionsGenerator extends CommandLineProgram { */ private static String getContent(String scalaTemplate, String baseClass, String className, String constructor, boolean isScatter, String commandLinePrefix, - List argumentFields, Set> dependents) { + List argumentFields, Set> dependents, + boolean isGATKWalker) { StringBuilder arguments = new StringBuilder(); StringBuilder commandLine = new StringBuilder(commandLinePrefix); @@ -384,8 +386,10 @@ public class GATKExtensionsGenerator extends CommandLineProgram { StringBuffer freezeFieldOverride = new StringBuffer(); for (String freezeField: freezeFields) freezeFieldOverride.append(freezeField); - if (freezeFieldOverride.length() > 0) { + if (freezeFieldOverride.length() > 0 || isGATKWalker) { freezeFieldOverride.insert(0, String.format("override def freezeFieldValues = {%nsuper.freezeFieldValues%n")); + if ( isGATKWalker ) + freezeFieldOverride.append(String.format("if ( num_threads.isDefined ) nCoresRequest = num_threads%n")); freezeFieldOverride.append(String.format("}%n%n")); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 648f9ffef..e8ac26a57 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -62,6 +62,13 @@ class QSettings { @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) var residentRequest: Option[Double] = None + /** The name of the parallel environment (required for SGE, for example) */ + @Argument(fullName="job_parallel_env", shortName="jobParaEnv", doc="An SGE style parallel environment to use for jobs requesting more than 1 core. Equivalent to submitting jobs with -pe ARG nt for jobs with nt > 1", required=false) + var parallelEnvironmentName: String = "smp_pe" // Broad default + + @Argument(fullName="dontRequestMultipleCores", shortName="multiCoreJerk", doc="If provided, Queue will not request multiple processors for jobs using multiple processors. Sometimes you eat the bear, sometimes the bear eats you.", required=false) + var dontRequestMultipleCores: Boolean = false + @Argument(fullName="run_directory", shortName="runDir", doc="Root directory to run functions from.", required=false) var runDirectory = new File(".") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala index 227261912..2aae2fc6b 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -28,8 +28,8 @@ import org.broadinstitute.sting.queue.QException import org.broadinstitute.sting.queue.util.{Logging,Retry} import org.broadinstitute.sting.queue.function.CommandLineFunction import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} -import java.util.Collections import org.ggf.drmaa._ +import java.util.{Date, Collections} /** * Runs jobs using DRMAA. @@ -103,6 +103,18 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex case Session.QUEUED_ACTIVE => returnStatus = RunnerStatus.RUNNING case Session.DONE => val jobInfo: JobInfo = session.wait(jobId, Session.TIMEOUT_NO_WAIT) + + // Update jobInfo + def convertDRMAATime(key: String): Date = { + val v = jobInfo.getResourceUsage.get(key) + if ( v != null ) new Date(v.toString.toDouble.toLong * 1000) else null; + } + if ( jobInfo.getResourceUsage != null ) { + getRunInfo.startTime = convertDRMAATime("start_time") + getRunInfo.doneTime = convertDRMAATime("end_time") + getRunInfo.exechosts = "unknown" + } + if ((jobInfo.hasExited && jobInfo.getExitStatus != 0) || jobInfo.hasSignaled || jobInfo.wasAborted) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 96e3ffd95..fca92a7a1 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -52,13 +52,28 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten nativeSpec += " -q " + function.jobQueue // If the resident set size is requested pass on the memory request - if (function.residentRequest.isDefined) - nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) + // NOTE: 12/20/11: depristo commented this out because mem_free isn't + // such a standard feature in SGE (gsa-engineering queue doesn't support it) + // requiring it can make SGE not so usable. It's dangerous to not enforce + // that we have enough memory to run our jobs, but I'd rather be dangerous + // than not be able to run my jobs at all. +// if (function.residentRequest.isDefined) +// nativeSpec += " -l mem_free=%dM".format(function.residentRequest.map(_ * 1024).get.ceil.toInt) // If the resident set size limit is defined specify the memory limit if (function.residentLimit.isDefined) nativeSpec += " -l h_rss=%dM".format(function.residentLimit.map(_ * 1024).get.ceil.toInt) + // If more than 1 core is requested, set the proper request + // if we aren't being jerks and just stealing cores (previous behavior) + if ( function.nCoresRequest.getOrElse(1) > 1 ) { + if ( function.qSettings.dontRequestMultipleCores ) + logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( + function.jobName, function.nCoresRequest.get)) + else + nativeSpec += " -pe %s %d".format(function.qSettings.parallelEnvironmentName, function.nCoresRequest.get) + } + // Pass on any job resource requests nativeSpec += function.jobResourceRequests.map(" -l " + _).mkString @@ -70,6 +85,7 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten if (priority.isDefined) nativeSpec += " -p " + priority.get + logger.debug("Native spec is: %s".format(nativeSpec)) (nativeSpec + " " + super.functionNativeSpec).trim() } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 323cc63ff..5ef78500c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -56,6 +56,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR private val selectString = new StringBuffer() private val usageString = new StringBuffer() private val requestString = new StringBuffer() + private val spanString = new StringBuffer() /** * Dispatches the function on the LSF cluster. @@ -100,6 +101,23 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR appendRequest("rusage", usageString, ",", "mem=%d".format(memInUnits)) } + // + // Request multiple cores on the same host. If nCoresRequest > 1, and we + // aren't being jerks and stealing cores, set numProcessors and maxNumProcessors + // and the span[host=1] parameters to get us exactly the right number of + // cores on a single host + // + if ( function.nCoresRequest.getOrElse(1) > 1 ) { + if ( function.qSettings.dontRequestMultipleCores ) + logger.warn("Sending multicore job %s to farm without requesting appropriate number of cores (%d)".format( + function.jobName, function.nCoresRequest.get)) + else { + request.numProcessors = function.nCoresRequest.get + request.maxNumProcessors = request.numProcessors + appendRequest("span", spanString, ",", "hosts=1") + } + } + val resReq = getResourceRequest if (resReq.length > 0) { request.resReq = resReq @@ -167,10 +185,12 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR requestString.setLength(0) selectString.setLength(0) usageString.setLength(0) + spanString.setLength(0) requestString.append(function.jobResourceRequests.mkString(" ")) extractSection(requestString, "select", selectString) extractSection(requestString, "rusage", usageString) + extractSection(requestString, "span", spanString) } private def extractSection(requestString: StringBuffer, section: String, sectionString: StringBuffer) { @@ -196,7 +216,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR sectionString.insert(sectionString.length() - 1, separator + request) } - private def getResourceRequest = "%s %s %s".format(selectString, usageString, requestString).trim() + private def getResourceRequest = "%s %s %s %s".format(selectString, usageString, spanString, requestString).trim() } object Lsf706JobRunner extends Logging { diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index b08832f22..167dcb593 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -17,6 +17,9 @@ trait CommandLineFunction extends QFunction with Logging { /** Resident memory request */ var residentRequest: Option[Double] = None + /** the number of SMP cores this job wants */ + var nCoresRequest: Option[Int] = None + /** Job project to run the command */ var jobProject: String = _ @@ -45,6 +48,9 @@ trait CommandLineFunction extends QFunction with Logging { if (commandLineFunction.residentRequest.isEmpty) commandLineFunction.residentRequest = this.residentRequest + if (commandLineFunction.nCoresRequest.isEmpty) + commandLineFunction.nCoresRequest = this.nCoresRequest + if (commandLineFunction.jobProject == null) commandLineFunction.jobProject = this.jobProject @@ -100,6 +106,10 @@ trait CommandLineFunction extends QFunction with Logging { if (residentRequest.isEmpty) residentRequest = qSettings.residentRequest + // the default value is 1 core + if (nCoresRequest.isEmpty) + nCoresRequest = Some(1) + if (residentRequest.isEmpty) residentRequest = memoryLimit From f73ad1c2e27d2418e428eb80d5047c5bf2b2770d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Dec 2011 14:13:55 -0500 Subject: [PATCH 06/17] Bugfix/Rewrite: Algorithm to determine adaptor boundaries The algorithm wasn't accounting for the case where the read is the reverse strand and the insert size is negative. * Fixed and rewrote for more clarity (with Ryan, Mark and Eric). * Restructured the code to handle GATKSAMRecords only * Cleaned up the other structures and functions around it to minimize clutter and potential for error. * Added unit tests for all 4 cases of adaptor boundaries. --- .../gatk/iterators/LocusIteratorByState.java | 7 +- .../sting/utils/sam/ReadUtils.java | 269 +++++++----------- .../utils/{ => sam}/ReadUtilsUnitTest.java | 47 ++- 3 files changed, 155 insertions(+), 168 deletions(-) rename public/java/test/org/broadinstitute/sting/utils/{ => sam}/ReadUtilsUnitTest.java (66%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index ee3ea63eb..75e787e05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -432,7 +431,7 @@ public class LocusIteratorByState extends LocusIterator { while(iterator.hasNext()) { SAMRecordState state = iterator.next(); if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - if ( filterBaseInRead(state.getRead(), location.getStart()) ) { + if ( filterBaseInRead((GATKSAMRecord) state.getRead(), location.getStart()) ) { //discarded_bases++; //printStatus("Adaptor bases", discarded_adaptor_bases); continue; @@ -481,8 +480,8 @@ public class LocusIteratorByState extends LocusIterator { * @param pos * @return */ - private static boolean filterBaseInRead(SAMRecord rec, long pos) { - return ReadUtils.readPairBaseOverlapType(rec, pos) == ReadUtils.OverlapType.IN_ADAPTOR; + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); } private void updateReadStates() { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index a46a7f0bb..aa90ecf6f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -45,85 +45,13 @@ import java.util.*; public class ReadUtils { private ReadUtils() { } - // ---------------------------------------------------------------------------------------------------- - // - // Reduced read utilities - // - // ---------------------------------------------------------------------------------------------------- + private static int DEFAULT_ADAPTOR_SIZE = 100; - // ---------------------------------------------------------------------------------------------------- - // - // General utilities - // - // ---------------------------------------------------------------------------------------------------- - public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { - SAMFileHeader copy = new SAMFileHeader(); - - copy.setSortOrder(toCopy.getSortOrder()); - copy.setGroupOrder(toCopy.getGroupOrder()); - copy.setProgramRecords(toCopy.getProgramRecords()); - copy.setReadGroups(toCopy.getReadGroups()); - copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - - for (Map.Entry e : toCopy.getAttributes()) - copy.setAttribute(e.getKey(), e.getValue()); - - return copy; + public enum ClippingTail { + LEFT_TAIL, + RIGHT_TAIL } - public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { - if (file.endsWith(".bam")) - return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); - return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); - } - - public static boolean isPlatformRead(SAMRecord read, String name) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - // --------------------------------------------------------------------------------------------------------- - // - // utilities for detecting overlapping reads - // - // --------------------------------------------------------------------------------------------------------- - - /** - * Detects read pairs where the reads are so long relative to the over fragment size that they are - * reading into each other's adaptors. - * - * Normally, fragments are sufficiently far apart that reads aren't reading into each other. - * - * |--------------------> first read - * <--------------------| second read - * - * Sometimes, mostly due to lab errors or constraints, fragment library are made too short relative to the - * length of the reads. For example, it's possible to have 76bp PE reads with 125 bp inserts, so that ~25 bp of each - * read overlaps with its mate. - * - * |--------OOOOOOOOOOOO> first read - * first read - * [record in hand] - * s2 - * <-----------------------| - * - * s1, e1, and s2 are all in the record. From isize we can can compute e2 as s1 + isize + 1 - * - * s2 - * |-----------------------> - * s1 e1 - * <-----------------------| [record in hand] - * - * Here we cannot calculate e2 since the record carries s2 and e1 + isize is s2 now! - * - * This makes the following code a little nasty, since we can only detect if a base is in the adaptor, but not - * if it overlaps the read. - * - * @param read - * @param basePos - * @param adaptorLength - * @return - */ - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos, final int adaptorLength) { - OverlapType state = OverlapType.NOT_OVERLAPPING; + public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { + SAMFileHeader copy = new SAMFileHeader(); - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); + copy.setSortOrder(toCopy.getSortOrder()); + copy.setGroupOrder(toCopy.getGroupOrder()); + copy.setProgramRecords(toCopy.getProgramRecords()); + copy.setReadGroups(toCopy.getReadGroups()); + copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - if ( adaptorBoundaries != null ) { // we're not an unmapped pair -- cannot filter out + for (Map.Entry e : toCopy.getAttributes()) + copy.setAttribute(e.getKey(), e.getValue()); - boolean inAdapator = basePos >= adaptorBoundaries.first && basePos <= adaptorBoundaries.second; - - if ( inAdapator ) { - state = OverlapType.IN_ADAPTOR; - //System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", - // read.getReadName(), read.getReadNegativeStrandFlag(), basePos, read.getAlignmentStart(), read.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, read.getInferredInsertSize(), state); - } - } - - return state; + return copy; } - private static Pair getAdaptorBoundaries(SAMRecord read, int adaptorLength) { - int isize = read.getInferredInsertSize(); - if ( isize == 0 ) - return null; // don't worry about unmapped pairs + public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { + if (file.endsWith(".bam")) + return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); + } - int adaptorStart, adaptorEnd; - - if ( read.getReadNegativeStrandFlag() ) { - // we are on the negative strand, so our mate is on the positive strand - int mateStart = read.getMateAlignmentStart(); - adaptorStart = mateStart - adaptorLength - 1; - adaptorEnd = mateStart - 1; - } else { - // we are on the positive strand, so our mate is on the negative strand - int mateEnd = read.getAlignmentStart() + isize - 1; - adaptorStart = mateEnd + 1; - adaptorEnd = mateEnd + adaptorLength; + public static boolean isPlatformRead(SAMRecord read, String name) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); } + return false; + } - return new Pair(adaptorStart, adaptorEnd); + + /** + * is this base inside the adaptor of the read? + * + * There are two cases to treat here: + * + * 1) Read is in the negative strand => Adaptor boundary is on the left tail + * 2) Read is in the positive strand => Adaptor boundary is on the right tail + * + * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) + * + * @param read the read to test + * @param basePos base position in REFERENCE coordinates (not read coordinates) + * @return whether or not the base is in the adaptor + */ + public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { + Integer adaptorBoundary = getAdaptorBoundary(read); + if (adaptorBoundary == null || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + return false; + + return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; } /** + * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to + * the read boundary. If the read is in the positive strand, this is the first base after the end of the + * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the + * beginning of the fragment. * - * @param read original SAM record - * @param adaptorLength length of adaptor sequence - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped + * There are two cases we need to treat here: + * + * 1) Our read is in the reverse strand : + * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * + * 2) Our read is in the forward strand : + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * @param read the read being tested for the adaptor boundary + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the insert size cannot be determined (and is necessary for the calculation). */ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { + protected static Integer getAdaptorBoundary(final SAMRecord read) { + if ( read.getReadUnmappedFlag() ) + return null; // don't worry about unmapped pairs - Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); - GATKSAMRecord result = read; + final int isize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) + int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) - if ( adaptorBoundaries != null ) { - if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) - result = hardClipStartOfRead(read, adaptorBoundaries.second); - else if ( !read.getReadNegativeStrandFlag() && adaptorBoundaries.first <= read.getAlignmentEnd() ) - result = hardClipEndOfRead(read, adaptorBoundaries.first); - } + if ( read.getReadNegativeStrandFlag() ) + adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) + else if (isize > 0) + adaptorBoundary = read.getAlignmentStart() + isize + 1; // case 2 (see header) + else + return null; // this is a case 2 where for some reason we cannot estimate the insert size - return result; + return adaptorBoundary; } // return true if the read needs to be completely clipped @@ -557,7 +494,6 @@ public class ReadUtils { } - private static int DEFAULT_ADAPTOR_SIZE = 100; /** * @@ -565,12 +501,14 @@ public class ReadUtils { * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped */ public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { - return hardClipAdaptorSequence(read, DEFAULT_ADAPTOR_SIZE); + final Integer adaptorBoundary = getAdaptorBoundary(read); + + if (adaptorBoundary == null || !isInsideRead(read, adaptorBoundary)) + return read; + + return (read.getReadNegativeStrandFlag()) ? hardClipStartOfRead(read, adaptorBoundary) : hardClipEndOfRead(read, adaptorBoundary); } - public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos) { - return readPairBaseOverlapType(read, basePos, DEFAULT_ADAPTOR_SIZE); - } public static boolean is454Read(SAMRecord read) { return isPlatformRead(read, "454"); @@ -716,18 +654,6 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } - private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.INSERTION) - return false; - } - return true; - } - - public enum ClippingTail { - LEFT_TAIL, - RIGHT_TAIL - } /** * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in @@ -866,7 +792,24 @@ public class ReadUtils { return comp.compare(read1, read2); } - // TEST UTILITIES + /** + * Is a base inside a read? + * + * @param read the read to evaluate + * @param referenceCoordinate the reference coordinate of the base to test + * @return true if it is inside the read, false otherwise. + */ + public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { + return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); + } + + private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java similarity index 66% rename from public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 630beaece..3878cbfa9 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; @@ -69,4 +69,49 @@ public class ReadUtilsUnitTest extends BaseTest { Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); } + + @Test + public void testGetAdaptorBoundary() { + final byte [] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; + final byte [] quals = {30, 30, 30, 30, 30, 30, 30, 30}; + final String cigar = "8M"; + final int fragmentSize = 10; + final int mateStart = 1000; + final int BEFORE = mateStart - 2; + final int AFTER = mateStart + 2; + int myStart, boundary; + + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setMateAlignmentStart(mateStart); + read.setInferredInsertSize(fragmentSize); + + // Test case 1: positive strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 2: positive strand, second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 3: negative strand, second read + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 4: negative strand, first read + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + boundary = ReadUtils.getAdaptorBoundary(read); + Assert.assertEquals(boundary, mateStart - 1); + + } } From 1c4774c475bb8f3e4a1a02336ea6836363f96c6c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:30:06 -0500 Subject: [PATCH 07/17] Static versions of the hard clipping utilities For simplified access to the hard clipping utilities. No need to create a ReadClipper object if you are not doing multiple complicated clipping operations, just use the static methods. examples: ReadClipper.hardClipLowQualEnds(2); ReadClipper.hardClipAdaptorSequence(); --- .../sting/utils/clipping/ReadClipper.java | 114 +++++++++++++----- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index f19a7a04f..d82472bae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -50,6 +50,44 @@ public class ReadClipper { return read; } + /** + * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. + * + * @param algorithm + * @return + */ + public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { + if (ops == null) + return getRead(); + else { + try { + GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); + for (ClippingOp op : getOps()) { + //check if the clipped read can still be clipped in the range requested + if (op.start < clippedRead.getReadLength()) { + ClippingOp fixedOperation = op; + if (op.stop >= clippedRead.getReadLength()) + fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); + + clippedRead = fixedOperation.apply(algorithm, clippedRead); + } + } + wasClipped = true; + ops.clear(); + if ( clippedRead.isEmpty() ) + return new GATKSAMRecord( clippedRead.getHeader() ); + return clippedRead; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // this should never happen + } + } + } + + + + + // QUICK USE UTILITY FUNCTION + public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { return hardClipByReferenceCoordinates(-1, refStop); } @@ -163,39 +201,13 @@ public class ReadClipper { return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + public GATKSAMRecord hardClipAdaptorSequence () { + final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); + if (adaptorBoundary == null || !ReadUtils.isInsideRead(read, adaptorBoundary)) + return read; - /** - * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. - * - * @param algorithm - * @return - */ - public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { - if (ops == null) - return getRead(); - else { - try { - GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone(); - for (ClippingOp op : getOps()) { - //check if the clipped read can still be clipped in the range requested - if (op.start < clippedRead.getReadLength()) { - ClippingOp fixedOperation = op; - if (op.stop >= clippedRead.getReadLength()) - fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); - - clippedRead = fixedOperation.apply(algorithm, clippedRead); - } - } - wasClipped = true; - ops.clear(); - if ( clippedRead.isEmpty() ) - return new GATKSAMRecord( clippedRead.getHeader() ); - return clippedRead; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // this should never happen - } - } + return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); } public GATKSAMRecord hardClipLeadingInsertions() { @@ -218,4 +230,44 @@ public class ReadClipper { this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); } + + + + // STATIC VERSIONS OF THE QUICK CLIPPING FUNCTIONS + + public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); + } + + public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); + } + + public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { + return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); + } + + public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { + return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); + } + + public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { + return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); + } + + public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipSoftClippedBases(); + } + + public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipAdaptorSequence(); + } + + public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipLeadingInsertions(); + } + + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + return (new ReadClipper(read)).revertSoftClippedBases(); + } } From 07128a2ad2cfe4d2b63a4df947ef7831573bbe1d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:35:33 -0500 Subject: [PATCH 08/17] ReadUtils cleanup * Removed all clipping functionality from ReadUtils (it should all be done using the ReadClipper now) * Cleaned up functionality that wasn't being used or had been superseded by other code (in an effort to reduce multiple unsupported implementations) * Made all meaningful functions public and added better comments/explanation to the headers --- .../sting/gatk/walkers/ClipReadsWalker.java | 5 +- .../gatk/walkers/SplitSamFileWalker.java | 18 +- ...elGenotypeLikelihoodsCalculationModel.java | 3 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../indels/PairHMMIndelErrorModel.java | 3 +- .../sting/utils/sam/ReadUtils.java | 457 ++++-------------- 6 files changed, 112 insertions(+), 376 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index ff3867120..c28e205bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -299,9 +299,8 @@ public class ClipReadsWalker extends ReadWalker readGroups = new ArrayList(); header.setReadGroups(readGroups); @@ -121,4 +121,20 @@ public class SplitSamFileWalker extends ReadWalker e : toCopy.getAttributes()) + copy.setAttribute(e.getKey(), e.getValue()); + + return copy; + } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 653a6f6e7..fe2086d47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; @@ -125,7 +126,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for ( ExtendedEventPileupElement p : indelPileup.toExtendedIterable() ) { //SAMRecord read = p.getRead(); - GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; if(ReadUtils.is454Read(read)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index ba031c497..8f939fc49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -463,7 +463,7 @@ public class IndelRealigner extends ReadWalker { private void emitReadLists() { // pre-merge lists to sort them in preparation for constrained SAMFileWriter readsNotToClean.addAll(readsToClean.getReads()); - ReadUtils.coordinateSortReads(readsNotToClean); + ReadUtils.sortReadsByCoordinate(readsNotToClean); manager.addReads(readsNotToClean, readsActuallyCleaned); readsToClean.clear(); readsNotToClean.clear(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 09968f47e..d893e620e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -32,6 +32,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -409,7 +410,7 @@ public class PairHMMIndelErrorModel { } else { //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); - SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + SAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index aa90ecf6f..0c3433eba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -47,11 +47,34 @@ public class ReadUtils { private static int DEFAULT_ADAPTOR_SIZE = 100; + /** + * A marker to tell which end of the read has been clipped + */ public enum ClippingTail { LEFT_TAIL, RIGHT_TAIL } + /** + * A HashMap of the SAM spec read flag names + * + * Note: This is not being used right now, but can be useful in the future + */ + private static final Map readFlagNames = new HashMap(); + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x40, "FirstOfPair"); + readFlagNames.put(0x80, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } + /** * This enum represents all the different ways in which a read can overlap an interval. * @@ -96,38 +119,22 @@ public class ReadUtils { */ public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} - public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { - SAMFileHeader copy = new SAMFileHeader(); - - copy.setSortOrder(toCopy.getSortOrder()); - copy.setGroupOrder(toCopy.getGroupOrder()); - copy.setProgramRecords(toCopy.getProgramRecords()); - copy.setReadGroups(toCopy.getReadGroups()); - copy.setSequenceDictionary(toCopy.getSequenceDictionary()); - - for (Map.Entry e : toCopy.getAttributes()) - copy.setAttribute(e.getKey(), e.getValue()); - - return copy; - } - + /** + * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular + * SAMFileWriter without compression otherwise. + * + * @param header + * @param presorted + * @param file + * @param compression + * @return a SAMFileWriter with the compression level if it is a bam. + */ public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { if (file.endsWith(".bam")) return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, presorted, new File(file)); } - public static boolean isPlatformRead(SAMRecord read, String name) { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - /** * is this base inside the adaptor of the read? * @@ -175,7 +182,7 @@ public class ReadUtils { * @param read the read being tested for the adaptor boundary * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. NULL if the read is unmapped or the insert size cannot be determined (and is necessary for the calculation). */ - protected static Integer getAdaptorBoundary(final SAMRecord read) { + public static Integer getAdaptorBoundary(final SAMRecord read) { if ( read.getReadUnmappedFlag() ) return null; // don't worry about unmapped pairs @@ -192,363 +199,54 @@ public class ReadUtils { return adaptorBoundary; } - // return true if the read needs to be completely clipped - private static GATKSAMRecord hardClipStartOfRead(GATKSAMRecord oldRec, int stopPosition) { - - if ( stopPosition >= oldRec.getAlignmentEnd() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = (GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping start of read: %50s start=%d adaptorEnd=%d isize=%d %n", - // read.getReadName(), read.getAlignmentStart(), stopPosition, read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToClip = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - if ( currentPos > stopPosition) { - newCigarElements.add(ce); - continue; - } - - int elementLength = ce.getLength(); - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToClip++) { - if ( currentPos > stopPosition ) { - newCigarElements.add(new CigarElement(elementLength - i, CigarOperator.M)); - break; - } - } - break; - case I: - case S: - basesToClip += elementLength; - break; - case D: - case N: - currentPos += elementLength; - break; - case H: - basesAlreadyClipped += elementLength; - case P: - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - int newLength = bases.length - basesToClip; - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, basesToClip, newBases, 0, newLength); - System.arraycopy(quals, basesToClip, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.addFirst(new CigarElement(basesToClip + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the start accordingly - read.setAlignmentStart(stopPosition + 1); - - return read; - } - - private static GATKSAMRecord hardClipEndOfRead(GATKSAMRecord oldRec, int startPosition) { - - if ( startPosition <= oldRec.getAlignmentStart() ) { - // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); - return null; - } - - GATKSAMRecord read; - try { - read = (GATKSAMRecord)oldRec.clone(); - } catch (Exception e) { - return null; - } - - //System.out.printf("Clipping end of read: %50s adaptorStart=%d end=%d isize=%d %n", - // read.getReadName(), startPosition, read.getAlignmentEnd(), read.getInferredInsertSize()); - - Cigar oldCigar = read.getCigar(); - LinkedList newCigarElements = new LinkedList(); - int currentPos = read.getAlignmentStart(); - int basesToKeep = 0; - int basesAlreadyClipped = 0; - - for ( CigarElement ce : oldCigar.getCigarElements() ) { - - int elementLength = ce.getLength(); - - if ( currentPos >= startPosition ) { - if ( ce.getOperator() == CigarOperator.H ) - basesAlreadyClipped += elementLength; - continue; - } - - switch ( ce.getOperator() ) { - case M: - for (int i = 0; i < elementLength; i++, currentPos++, basesToKeep++) { - if ( currentPos == startPosition ) { - newCigarElements.add(new CigarElement(i, CigarOperator.M)); - break; - } - } - - if ( currentPos != startPosition ) - newCigarElements.add(ce); - break; - case I: - case S: - newCigarElements.add(ce); - basesToKeep += elementLength; - break; - case D: - case N: - newCigarElements.add(ce); - currentPos += elementLength; - break; - case H: - case P: - newCigarElements.add(ce); - break; - default: throw new ReviewedStingException("The " + ce.getOperator() + " cigar element is not currently supported"); - } - - } - - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[basesToKeep]; - byte[] newQuals = new byte[basesToKeep]; - System.arraycopy(bases, 0, newBases, 0, basesToKeep); - System.arraycopy(quals, 0, newQuals, 0, basesToKeep); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases - newCigarElements.add(new CigarElement((bases.length - basesToKeep) + basesAlreadyClipped, CigarOperator.H)); - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - - // adjust the stop accordingly - // read.setAlignmentEnd(startPosition - 1); - - return read; - } - /** - * Hard clips away (i.e.g, removes from the read) bases that were previously soft clipped. + * is the read a 454 read ? * - * @param read - * @return + * @param read the read to test + * @return checks the read group tag PL for the default 454 tag */ - @Requires("read != null") - @Ensures("result != null") - public static GATKSAMRecord hardClipSoftClippedBases(GATKSAMRecord read) { - List cigarElts = read.getCigar().getCigarElements(); - - if ( cigarElts.size() == 1 ) // can't be soft clipped, just return - return read; - - int keepStart = 0, keepEnd = read.getReadLength() - 1; - List newCigarElements = new LinkedList(); - - for ( int i = 0; i < cigarElts.size(); i++ ) { - CigarElement ce = cigarElts.get(i); - int l = ce.getLength(); - switch ( ce.getOperator() ) { - case S: - if ( i == 0 ) - keepStart = l; - else - keepEnd = read.getReadLength() - l - 1; - newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); - break; - - default: - newCigarElements.add(ce); - break; - } - } - - // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S - // this will happen if you soft clip a read that has been hard clipped before - // like: 5H20S => 5H20H - List mergedCigarElements = new LinkedList(); - Iterator cigarElementIterator = newCigarElements.iterator(); - CigarOperator currentOperator = null; - int currentOperatorLength = 0; - while (cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - if (currentOperator != cigarElement.getOperator()) { - if (currentOperator != null) - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - currentOperator = cigarElement.getOperator(); - currentOperatorLength = cigarElement.getLength(); - } - else - currentOperatorLength += cigarElement.getLength(); - } - mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - - return hardClipBases(read, keepStart, keepEnd, mergedCigarElements); - } - - /** - * Hard clips out the bases in read, keeping the bases from keepStart to keepEnd, inclusive. Note these - * are offsets, so they are 0 based - * - * @param read - * @param keepStart - * @param keepEnd - * @param newCigarElements - * @return - */ - @Requires({ - "read != null", - "keepStart >= 0", - "keepEnd < read.getReadLength()", - "read.getReadUnmappedFlag() || newCigarElements != null"}) - @Ensures("result != null") - public static GATKSAMRecord hardClipBases(GATKSAMRecord read, int keepStart, int keepEnd, List newCigarElements) { - int newLength = keepEnd - keepStart + 1; - if ( newLength != read.getReadLength() ) { - try { - read = (GATKSAMRecord)read.clone(); - // copy over the unclipped bases - final byte[] bases = read.getReadBases(); - final byte[] quals = read.getBaseQualities(); - byte[] newBases = new byte[newLength]; - byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, keepStart, newBases, 0, newLength); - System.arraycopy(quals, keepStart, newQuals, 0, newLength); - read.setReadBases(newBases); - read.setBaseQualities(newQuals); - - // now add a CIGAR element for the clipped bases, if the read isn't unmapped - if ( ! read.getReadUnmappedFlag() ) { - Cigar newCigar = new Cigar(newCigarElements); - read.setCigar(newCigar); - } - } catch ( CloneNotSupportedException e ) { - throw new ReviewedStingException("WTF, where did clone go?", e); - } - } - - return read; - } - - public static GATKSAMRecord replaceSoftClipsWithMatches(GATKSAMRecord read) { - List newCigarElements = new ArrayList(); - - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.SOFT_CLIP ) - newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - else - newCigarElements.add(ce); - } - - if ( newCigarElements.size() > 1 ) { // - CigarElement first = newCigarElements.get(0); - CigarElement second = newCigarElements.get(1); - if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(1); - } - } - - if ( newCigarElements.size() > 1 ) { // - CigarElement penult = newCigarElements.get(newCigarElements.size()-2); - CigarElement last = newCigarElements.get(newCigarElements.size()-1); - if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { - newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH)); - newCigarElements.remove(newCigarElements.size()-1); - } - } - - read.setCigar(new Cigar(newCigarElements)); - return read; - } - - - - /** - * - * @param read original SAM record - * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped - */ - public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { - final Integer adaptorBoundary = getAdaptorBoundary(read); - - if (adaptorBoundary == null || !isInsideRead(read, adaptorBoundary)) - return read; - - return (read.getReadNegativeStrandFlag()) ? hardClipStartOfRead(read, adaptorBoundary) : hardClipEndOfRead(read, adaptorBoundary); - } - - public static boolean is454Read(SAMRecord read) { return isPlatformRead(read, "454"); } + /** + * is the read a SOLiD read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SOLiD tag + */ public static boolean isSOLiDRead(SAMRecord read) { return isPlatformRead(read, "SOLID"); } + /** + * is the read a SLX read ? + * + * @param read the read to test + * @return checks the read group tag PL for the default SLX tag + */ public static boolean isSLXRead(SAMRecord read) { return isPlatformRead(read, "ILLUMINA"); } - private static final Map readFlagNames - = new HashMap(); - - static { - readFlagNames.put(0x1, "Paired"); - readFlagNames.put(0x2, "Proper"); - readFlagNames.put(0x4, "Unmapped"); - readFlagNames.put(0x8, "MateUnmapped"); - readFlagNames.put(0x10, "Forward"); - //readFlagNames.put(0x20, "MateForward"); - readFlagNames.put(0x40, "FirstOfPair"); - readFlagNames.put(0x80, "SecondOfPair"); - readFlagNames.put(0x100, "NotPrimary"); - readFlagNames.put(0x200, "NON-PF"); - readFlagNames.put(0x400, "Duplicate"); - } - - public static String readFlagsAsString(GATKSAMRecord read) { - String flags = ""; - for (int flag : readFlagNames.keySet()) { - if ((read.getFlags() & flag) != 0) { - flags += readFlagNames.get(flag) + " "; - } + /** + * checks if the read has a platform tag in the readgroup equal to 'name' ? + * + * @param read the read to test + * @param name the platform name to test + * @return whether or not name == PL tag in the read group of read + */ + public static boolean isPlatformRead(SAMRecord read, String name) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); } - return flags; + return false; } + /** * Returns the collections of reads sorted in coordinate order, according to the order defined * in the reads themselves @@ -556,12 +254,19 @@ public class ReadUtils { * @param reads * @return */ - public final static List coordinateSortReads(List reads) { + public final static List sortReadsByCoordinate(List reads) { final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); Collections.sort(reads, comparer); return reads; } + /** + * If a read starts in INSERTION, returns the first element length. + * + * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. + * @param read + * @return the length of the first insertion, or 0 if there is none (see warning). + */ public final static int getFirstInsertionOffset(SAMRecord read) { CigarElement e = read.getCigar().getCigarElement(0); if ( e.getOperator() == CigarOperator.I ) @@ -570,8 +275,15 @@ public class ReadUtils { return 0; } + /** + * If a read ends in INSERTION, returns the last element length. + * + * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. + * @param read + * @return the length of the last insertion, or 0 if there is none (see warning). + */ public final static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength()-1); + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); if ( e.getOperator() == CigarOperator.I ) return e.getLength(); else @@ -622,6 +334,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { int start = read.getUnclippedStart(); @@ -803,7 +516,13 @@ public class ReadUtils { return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); } - private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + /** + * Is this read all insertion? + * + * @param read + * @return whether or not the only element in the cigar string is an Insertion + */ + public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() != CigarOperator.INSERTION) return false; From cadff4024721a22c4694a68d0fb5e369960e441e Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 14:58:18 -0500 Subject: [PATCH 09/17] getRefCoordSoftUnclippedStart and End refactor These functions are methods of the read, and supplement getAlignmentStart() and getUnclippedStart() by calculating the unclipped start counting only soft clips. * Removed from ReadUtils * Added to GATKSAMRecord * Changed name to getSoftStart() and getSoftEnd * Updated third party code accordingly. --- .../sting/utils/sam/GATKSAMRecord.java | 53 +++++++++++++++++-- .../sting/utils/sam/ReadUtils.java | 42 ++------------- .../utils/clipping/ReadClipperUnitTest.java | 9 ++-- 3 files changed, 57 insertions(+), 47 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index d3a52167a..e8df869e6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -24,10 +24,8 @@ package org.broadinstitute.sting.utils.sam; -import net.sf.samtools.BAMRecord; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; +import com.google.java.contract.Ensures; +import net.sf.samtools.*; import org.broadinstitute.sting.utils.NGSPlatform; import java.util.HashMap; @@ -273,5 +271,52 @@ public class GATKSAMRecord extends BAMRecord { setReadGroup(rg); } + /** + * Calculates the reference coordinate for the beginning of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped start of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftStart() { + int start = this.getUnclippedStart(); + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + return start; + } + + /** + * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. + * + * Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. + * + * @return the unclipped end of the read taking soft clips (but not hard clips) into account + */ + @Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"}) + public int getSoftEnd() { + int stop = this.getUnclippedStart(); + + if (ReadUtils.readIsEntirelyInsertion(this)) + return stop; + + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; + } + + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 0c3433eba..24fd48387 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -299,8 +299,8 @@ public class ReadUtils { */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - int sStart = getRefCoordSoftUnclippedStart(read); - int sStop = getRefCoordSoftUnclippedEnd(read); + int sStart = read.getSoftStart(); + int sStop = read.getSoftEnd(); int uStart = read.getUnclippedStart(); int uStop = read.getUnclippedEnd(); @@ -334,40 +334,6 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { - int start = read.getUnclippedStart(); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; - } - return start; - } - - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) - public static int getRefCoordSoftUnclippedEnd(GATKSAMRecord read) { - int stop = read.getUnclippedStart(); - - if (readIsEntirelyInsertion(read)) - return stop; - - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; - } - return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; - } - - /** * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns @@ -407,14 +373,14 @@ public class ReadUtils { * @param refCoord * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) */ - @Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"}) + @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; boolean fallsInsideDeletion = false; - int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases + int goal = refCoord - read.getSoftStart(); // The goal is to move this many reference bases boolean goalReached = refBases == goal; Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index e228762b7..d1d5ddd8f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -90,13 +89,13 @@ public class ReadClipperUnitTest extends BaseTest { int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); for (int i=alnStart; i<=alnEnd; i++) { - if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); if (!clipLeft.isEmpty()) Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); } - if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); @@ -111,7 +110,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side for (int i=alnStart; i<=alnEnd; i++) { GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); if (!clipLeft.isEmpty()) @@ -127,7 +126,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side for (int i=alnStart; i<=alnEnd; i++) { GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. From 731a4634152e682a61d7acd268c3656ae8b2d1c0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 20 Dec 2011 17:44:21 -0500 Subject: [PATCH 10/17] Updated IntegrationTests with new adaptor clipper phew! --- .../walkers/PileupWalkerIntegrationTest.java | 5 ++- .../VariantAnnotatorIntegrationTest.java | 12 +++--- .../CallableLociWalkerIntegrationTest.java | 8 ++-- .../DepthOfCoverageB36IntegrationTest.java | 15 +++---- .../DepthOfCoverageIntegrationTest.java | 40 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 22 +++++----- .../RecalibrationWalkersIntegrationTest.java | 30 +++++++------- .../sting/utils/sam/ReadUtilsUnitTest.java | 14 ------- 8 files changed, 69 insertions(+), 77 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index a01a3f281..8bea5385a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -29,9 +29,12 @@ public class PileupWalkerIntegrationTest extends WalkerTest { String gatk_args = "-T Pileup -I " + validationDataLocation + "OV-0930.normal.chunk.bam " + "-R " + hg18Reference + " -show_indels -o %s"; - String expected_md5="06eedc2e7927650961d99d703f4301a4"; + String expected_md5="da2a02d02abac9de14cc4b187d8595a1"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args,1,Arrays.asList(expected_md5)); executeTest("Testing the extended pileup with indel records included on a small chunk of Ovarian dataset with 20 indels (1 D, 19 I)", spec); + // before Adaptor clipping + // String expected_md5="06eedc2e7927650961d99d703f4301a4"; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 245fda3c7..83c3d3a1e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("fbb656369eaa48153d127bd12db59d8f")); + Arrays.asList("ac5f409856a1b79316469733e62abb91")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("2977bb30c8b84a5f4094fe6090658561")); + Arrays.asList("f9aa7bee5a61ac1a9187d0cf1e8af471")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("42dd979a0a931c18dc9be40308bac321")); + Arrays.asList("6f27fd863b6718d59d2a2d8e2a20bcae")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("0948cd1dba7d61f283cc4cf2a7757d92")); + Arrays.asList("40bbd3d5a2397a007c0e74211fb33433")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("477eac07989593b58bb361f3429c085a")); + Arrays.asList("40622d39072b298440a77ecc794116e7")); executeTest("test exclude annotations", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("062155edec46a8c52243475fbf3a2943")); + Arrays.asList("31faae1bc588d195ff553cf6c47fabfa")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java index 3783525d1..186e5c3b7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java @@ -32,13 +32,13 @@ import java.util.Arrays; public class CallableLociWalkerIntegrationTest extends WalkerTest { final static String commonArgs = "-R " + b36KGReference + " -T CallableLoci -I " + validationDataLocation + "/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s"; - final static String SUMMARY_MD5 = "ed4c255bb78313b8e7982127caf3d6c4"; + final static String SUMMARY_MD5 = "cd597a8dae35c226a2cb110b1c9f32d5"; @Test public void testCallableLociWalkerBed() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("884c9c2d96419d990a708d2bd98fcefa", SUMMARY_MD5)); + Arrays.asList("c86ac1ef404c11d5e5452e020c8f7ce9", SUMMARY_MD5)); executeTest("formatBed", spec); } @@ -46,7 +46,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalkerPerBase() { String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("fb4524f8b3b213060c0c5b85362b5902", SUMMARY_MD5)); + Arrays.asList("d8536a55fe5f6fdb1ee6c9511082fdfd", SUMMARY_MD5)); executeTest("format_state_per_base", spec); } @@ -62,7 +62,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalker3() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("86bd1a5f79356b3656412c4b1c60709a", "6fefb144a60b89c27293ce5ca6e10e6a")); + Arrays.asList("bc966060184bf4605a31da7fe383464e", "d624eda8f6ed14b9251ebeec73e37867")); executeTest("formatBed lots of arguments", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java index 043b2eaf2..f4cd4968b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageB36IntegrationTest.java @@ -61,13 +61,14 @@ public class DepthOfCoverageB36IntegrationTest extends WalkerTest { File baseOutputFile = this.createTempFile("depthofcoveragemapq0",".tmp"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("f39af6ad99520fd4fb27b409ab0344a0",baseOutputFile); - spec.addAuxFile("6b15f5330414b6d4e2f6caea42139fa1", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("cc6640d82077991dde8a2b523935cdff", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("5b6c16a1c667c844882e9dce71454fc4",baseOutputFile); + spec.addAuxFile("fc161ec1b61dc67bc6a5ce36cb2d02c9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("89321bbfb76a4e1edc0905d50503ba1f", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("0fb627234599c258a3fee1b2703e164a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("cb73a0fa0cee50f1fb8f249315d38128", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("347b47ef73fbd4e277704ddbd7834f69", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("4ec920335d4b9573f695c39d62748089", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("4dd16b659065e331ed4bd3ab0dae6c1b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("2be0c18b501f4a3d8c5e5f99738b4713", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("5a26ef61f586f58310812580ce842462", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testMapQ0Only",spec); } @@ -83,7 +84,7 @@ public class DepthOfCoverageB36IntegrationTest extends WalkerTest { File baseOutputFile = this.createTempFile("testManySamples",".tmp"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("c9561b52344536d2b06ab97b0bb1a234",baseOutputFile); + spec.addAuxFile("d73fa1fc492f7dcc1d75056f8c12c92a",baseOutputFile); execute("testLotsOfSamples",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 646fb5e77..9d6638d53 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -55,26 +55,26 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); // now add the expected files that get generated - spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); - spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); - spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); - spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); - spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); - spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); - spec.addAuxFile("2aae346204c5f15517158da8e61a6c16", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); - spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); - spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); - spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); - spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); - + spec.addAuxFile("19e862f7ed3de97f2569803f766b7433", baseOutputFile); + spec.addAuxFile("c64cc5636d4880b80b71169ed1832cd7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); + spec.addAuxFile("1a8ba07a60e55f9fdadc89d00b1f3394", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("0075cead73a901e3a9d07c5d9c2b75f4", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); + spec.addAuxFile("d757be2f953f893e66eff1ef1f0fff4e", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("de08996729c774590d6a4954c906fe84", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("58ad39b100d1f2af7d119f28ba626bfd", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("0b4ce6059e6587ae5a986afbbcc7d783", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); + spec.addAuxFile("adc2b2babcdd72a843878acf2d510ca7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); + spec.addAuxFile("884281c139241c5db3c9f90e8684d084", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); + spec.addAuxFile("b90636cad74ff4f6b9ff9a596e145bd6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_summary")); + spec.addAuxFile("ad540b355ef90c566bebaeabd70026d2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); + spec.addAuxFile("27fe09a02a5b381e0ed633587c0f4b23", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); + spec.addAuxFile("5fcd53b4bd167b5e6d5f92329cf8678e", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); + spec.addAuxFile("7a2a19e54f73a8e07de2f020f1f913dd", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("852a079c5e9e93e7daad31fd6a9f4a49", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); + spec.addAuxFile("0828762842103edfaf115ef4e50809c6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("5c5aeb28419bba1decb17f8a166777f2", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("e5fd6216b3d6a751f3a90677b4e5bf3c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + execute("testBaseOutputNoFiltering",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e049af064..ba33108cf 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a")); + Arrays.asList("66ed60c6c1190754abd8a0a9d1d8d61e")); executeTest("test MultiSample Pilot1", spec); } @@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "d971d392956aea69c3707da64d24485b" ); - e.put( "--output_mode EMIT_ALL_SITES", "21993e71ca1a06a0d1f11d58e3cc26c3" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "42e4ea7878ef8d96215accb3ba4e97b7" ); + e.put( "--output_mode EMIT_ALL_SITES", "e0443c720149647469f2a2f3fb73942f" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -189,7 +189,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); + Arrays.asList("2b2729414ae855d390e7940956745bce")); executeTest(String.format("test multiple technologies"), spec); } @@ -208,7 +208,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); + Arrays.asList("95c6120efb92e5a325a5cec7d77c2dab")); executeTest(String.format("test calling with BAQ"), spec); } @@ -227,7 +227,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("a64d2e65b5927260e4ce0d948760cc5c")); + Arrays.asList("d87ce4b405d4f7926d1c36aee7053975")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -255,7 +255,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("69107157632714150fc068d412e31939")); + Arrays.asList("c5989e5d67d9e5fe8c5c956f12a975da")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -265,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("4ffda07590e06d58ed867ae326d74b2d")); + Arrays.asList("daca0741278de32e507ad367e67753b6")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("45633d905136c86e9d3f90ce613255e5")); + Arrays.asList("0ccc4e876809566510429c64adece2c7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -285,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("8bd5028cf294850b8a95b3c0af23d728")); + Arrays.asList("cff6dd0f4eb1ef0b6fc476da6ffead19")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("814dcd66950635a870602d90a1618cce")); + Arrays.asList("1e2a4aab26e9ab0dae709d33a669e036")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index cbcd5835f..b4b1f7b8e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -31,10 +31,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @DataProvider(name = "cctestdata") public Object[][] createCCTestData() { - new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "5a52b00d9794d27af723bcf93366681e" ); + + new CCTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9469d6b65880abe4e5babc1c1a69889d" ); new CCTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); - new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); - new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "64e9f17a1cf6fc04c1f2717c2d2eca67" ); + new CCTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "36c0c467b6245c2c6c4e9c956443a154" ); + new CCTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "ed15f8bf03bb2ea9b7c26844be829c0d" ); return CCTest.getTests(CCTest.class); } @@ -88,10 +89,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @DataProvider(name = "trtestdata") public Object[][] createTRTestData() { - new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + new TRTest( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "f020725d9f75ad8f1c14bfae056e250f" ); new TRTest( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "d04cf1f6df486e45226ebfbf93a188a5"); - new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" ); - new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "2a37c6001826bfabf87063b1dfcf594f" ); + new TRTest( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b2f4757bc47cf23bd9a09f756c250787" ); + new TRTest( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "313a21a8a88e3460b6e71ec5ffc50f0f" ); + return TRTest.getTests(TRTest.class); } @@ -121,7 +123,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "278846c55d97bd9812b758468a83f559"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "bd8288b1fc7629e2e8c2cf7f65fefa8f"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -145,7 +147,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "f020725d9f75ad8f1c14bfae056e250f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -174,7 +176,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "8379f24cf5312587a1f92c162ecc220f" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1f643bca090478ba68aac88db835a629" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -200,7 +202,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "2ad4c17ac3ed380071137e4e53a398a5" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "613fb2bbe01af8cbe6a188a10c1582ca" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -228,7 +230,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesBED() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b460478d9683e827784e42bc352db8bb"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "b00e99219aeafe2516c6232b7d6a0a00"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -252,7 +254,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9131d96f39badbf9753653f55b148012"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7b92788ce92f49415af3a75a2e4a2b33"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -280,7 +282,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "8993d32df5cb66c7149f59eccbd57f4c" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f34f7141351a5dbf9664c67260f94e96" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -306,7 +308,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "5f913c98ca99754902e9d34f99df468f" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "13c83656567cee9e93bda9874ee80234" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 3878cbfa9..e9269ff48 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -33,20 +33,6 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); } - private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { - SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); - String expectedBases = BASES.substring(expectedStart, expectedStop); - String expectedQuals = QUALS.substring(expectedStart, expectedStop); - Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); - Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); - } - - @Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); } - @Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); } - @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } - @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } - @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } - @Test public void testReducedReads() { Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); From d50f9b98bbad3f37b6ca94e1e8205eb52669aa80 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Tue, 20 Dec 2011 21:42:30 -0500 Subject: [PATCH 11/17] Make sure that the temporary ReadWalker performance improvement hack works well in the binary release, jic GATK 1.4 arrives before I get a Picard patch. --- build.xml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build.xml b/build.xml index 232b074f6..04cb243d4 100644 --- a/build.xml +++ b/build.xml @@ -847,7 +847,6 @@ - @@ -858,6 +857,7 @@ + @@ -1118,6 +1118,11 @@ + + + + + From 3358c132a84105fa1749746ffed8d180f0a0929a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 21 Dec 2011 15:14:05 -0500 Subject: [PATCH 15/17] Updating the MD5s Clipping adaptor boundaries changed the results of CountCovariates which affected the PPP output. a few more loci were visible to locus walkers. --- .../sting/queue/pipeline/PacbioProcessingPipelineTest.scala | 4 ++-- .../pipeline/examples/ExampleCountLociPipelineTest.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 50aa66367..1278b6e16 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.BaseTest class PacbioProcessingPipelineTest { @Test - def testBAM { + def testPacbioProcessingPipeline { val testOut = "exampleBAM.recal.bam" val spec = new PipelineTestSpec spec.name = "pacbioProcessingPipeline" @@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest { " -blasr ", " -test ", " -D " + BaseTest.testDir + "exampleDBSNP.vcf").mkString - spec.fileMD5s += testOut -> "f0adce660b55cb91d5f987f9a145471e" + spec.fileMD5s += testOut -> "cf147e7f56806598371f8d5d6794b852" PipelineTest.executeTest(spec) } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index f657e4be1..e737e52ea 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -39,7 +39,7 @@ class ExampleCountLociPipelineTest { " -R " + BaseTest.testDir + "exampleFASTA.fasta", " -I " + BaseTest.testDir + "exampleBAM.bam", " -o " + testOut).mkString - spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6" + spec.fileMD5s += testOut -> "ade93df31a6150321c1067e749cae9be" PipelineTest.executeTest(spec) } -} +} \ No newline at end of file From 32cdef9682e059ea9536b05fb4a3f37d998aca0d Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 22 Dec 2011 10:38:49 -0500 Subject: [PATCH 17/17] Rename *PerformanceTest test classes to *LargeScaleTest This is in preparation for the installation of the new performance test suite in Bamboo. Note that "ant performancetest" is now "ant largescaletest" --- build.xml | 12 ++++++------ ...Test.java => UnifiedGenotyperLargeScaleTest.java} | 2 +- ...ceTest.java => IndelRealignerLargeScaleTest.java} | 2 +- ...ava => RealignerTargetCreatorLargeScaleTest.java} | 2 +- ....java => RecalibrationWalkersLargeScaleTest.java} | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) rename public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{UnifiedGenotyperPerformanceTest.java => UnifiedGenotyperLargeScaleTest.java} (96%) rename public/java/test/org/broadinstitute/sting/gatk/walkers/indels/{IndelRealignerPerformanceTest.java => IndelRealignerLargeScaleTest.java} (97%) rename public/java/test/org/broadinstitute/sting/gatk/walkers/indels/{RealignerTargetCreatorPerformanceTest.java => RealignerTargetCreatorLargeScaleTest.java} (95%) rename public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/{RecalibrationWalkersPerformanceTest.java => RecalibrationWalkersLargeScaleTest.java} (98%) diff --git a/build.xml b/build.xml index 232b074f6..ce9ab0c4d 100644 --- a/build.xml +++ b/build.xml @@ -993,7 +993,7 @@ - + @@ -1010,13 +1010,13 @@ - - + + - + @@ -1043,8 +1043,8 @@ - - + + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java similarity index 96% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java index 3ff453dab..109088875 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java @@ -5,7 +5,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; -public class UnifiedGenotyperPerformanceTest extends WalkerTest { +public class UnifiedGenotyperLargeScaleTest extends WalkerTest { @Test public void testUnifiedGenotyperWholeGenome() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java similarity index 97% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java index 77675b0f4..4526fc0d7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java @@ -5,7 +5,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; -public class IndelRealignerPerformanceTest extends WalkerTest { +public class IndelRealignerLargeScaleTest extends WalkerTest { @Test public void testHighCoverage() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java similarity index 95% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java index cc37cc191..3203ee100 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java @@ -5,7 +5,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; -public class RealignerTargetCreatorPerformanceTest extends WalkerTest { +public class RealignerTargetCreatorLargeScaleTest extends WalkerTest { @Test public void testRealignerTargetCreator() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java index bccb95795..3f956e3b9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersLargeScaleTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; -public class RecalibrationWalkersPerformanceTest extends WalkerTest { +public class RecalibrationWalkersLargeScaleTest extends WalkerTest { private void testCountCovariatesWholeGenomeRunner(String moreArgs) { WalkerTestSpec spec = new WalkerTestSpec(