From b34e2f733fc206c67c01e725ac0e9ba4f7d99691 Mon Sep 17 00:00:00 2001 From: kshakir Date: Fri, 7 Jan 2011 22:03:36 +0000 Subject: [PATCH] Removed stochasticity from IndelRealigner by random sampling using and seed based on the read list. Updated the Queue scatter/gather for read walkers to include -L unmapped on the last scatter job when intervals aren't specified, and to map it correctly when it is explicitly set. Simplified the build.xml/ivy.xml to fix a bug reported with "ant clean dist test" where the scalac target wasn't found. Now building all scala code at the same time, just like all java code is compiled at the same time. Sped up the build for everyone by uncommenting a small bit of classes so that javac/scalac will not constantly launch trying to build .class files that will never compile. Moved some source files to their expected location so that the .java/.scala -> .class is a one-to-one match, again keeping the compilers from wasting cycles. Used and to skip extracting the help text and generating the GATK Queue extensions when the source files haven't been modified. Fixed a couple errors when the task is run. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4963 348d0f76-0448-11de-a6fe-93d51630548a --- build.xml | 196 +++++++++--------- ivy.xml | 10 +- .../gatk/walkers/indels/IndelRealigner.java | 12 +- .../sting/utils/interval/IntervalUtils.java | 11 +- scala/qscript/chartl/expanded_targets.q | 2 +- scala/qscript/fullCallingPipeline.q | 9 +- scala/src/HelloWorld.scala | 5 - .../gatk/IntervalScatterFunction.scala | 46 +++- .../ipf/intervals/ExpandIntervals.scala | 2 +- .../pipeline/PipelineArgumentCollection.scala | 5 +- .../BaseTransitionTableCalculator.scala | 6 +- .../scala}/IntervalAnnotationWalker.scala | 5 +- .../sting/scala}/ScalaCountLoci.scala | 0 13 files changed, 169 insertions(+), 140 deletions(-) delete mode 100644 scala/src/HelloWorld.scala rename scala/src/{ => org/broadinstitute/sting/scala}/BaseTransitionTableCalculator.scala (98%) rename scala/src/{ => org/broadinstitute/sting/scala}/IntervalAnnotationWalker.scala (98%) rename scala/src/{ => org/broadinstitute/sting/scala}/ScalaCountLoci.scala (100%) diff --git a/build.xml b/build.xml index 71e036527..0ddb9e563 100644 --- a/build.xml +++ b/build.xml @@ -10,8 +10,8 @@ - - + + @@ -73,14 +73,14 @@ - - + + - - + + @@ -105,12 +105,7 @@ - - - - - - + @@ -118,9 +113,9 @@ - + - + @@ -144,31 +139,24 @@ - + - - + + - + - - - - - - - - + @@ -202,7 +190,7 @@ - @@ -232,33 +220,61 @@ + + + + + + + + + + + + + + - + Generating Queue GATK extensions... - + + + + - - - - Building Queue... - - + + + + Building Scala... + + - - + + + + + + + + + + + + + unless="uptodate.extracthelp"> @@ -273,7 +289,7 @@ - + @@ -333,9 +349,17 @@ - + + + + + + + + + - + @@ -381,7 +405,7 @@ - + @@ -416,16 +440,20 @@ inheritAll is false so that 'ant queue' does not accidentally import params if the build was called with 'ant clean oneoffs queue'. Instead this task resets the parameters and is just like running - a fresh 'ant dist -Dqueue.target=core'. + a fresh 'ant dist -Dscala.target=core'. --> - + - - + + + + + + @@ -434,55 +462,30 @@ + - - - - + + + + - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - Building Scala... - - - - - - - - - + + + @@ -493,8 +496,8 @@ - - + + @@ -521,13 +524,13 @@ - + - + - + @@ -599,20 +602,21 @@ + - + - + + - - - - + + + - + diff --git a/ivy.xml b/ivy.xml index 4f8bcbda2..7a9e0467a 100644 --- a/ivy.xml +++ b/ivy.xml @@ -33,20 +33,18 @@ + + - - - - - - + + diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 091831c82..80513cea3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -182,10 +182,6 @@ public class IndelRealigner extends ReadWalker { private final ArrayList readsNotToClean = new ArrayList(); private final IdentityHashMap knownIndelsToTry = new IdentityHashMap(); - // random number generator - private static final long RANDOM_SEED = 1252863495; - private static final Random generator = new Random(RANDOM_SEED); - private static final int MAX_QUAL = 99; // fraction of mismatches that need to no longer mismatch for a column to be considered cleaned @@ -614,9 +610,9 @@ public class IndelRealigner extends ReadWalker { // if there are reads with a single indel in them, add that indel to the list of alternate consenses long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); - // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference + // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference, using totalRawMismatchSum as the random seed if ( !USE_KNOWN_INDELS_ONLY ) - generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex); + generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference, leftmostIndex, totalRawMismatchSum); // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); @@ -867,7 +863,8 @@ public class IndelRealigner extends ReadWalker { private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, final Set altConsensesToPopulate, final byte[] reference, - final int leftmostIndex) { + final int leftmostIndex, + final long randomSeed) { // if we are under the limit, use all reads to generate alternate consenses if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { @@ -879,6 +876,7 @@ public class IndelRealigner extends ReadWalker { // otherwise, choose reads for alternate consenses randomly else { int readsSeen = 0; + Random generator = new Random(randomSeed); while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { int index = generator.nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); diff --git a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index c80ab492a..cd3d603a1 100644 --- a/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -52,7 +52,7 @@ public class IntervalUtils { return null; } // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. - else if(fileOrInterval.trim().toLowerCase().equals("unmapped")) + else if (isUnmapped(fileOrInterval)) rawIntervals.add(GenomeLoc.UNMAPPED); // if it's a file, add items to raw interval list else if (isIntervalFile(fileOrInterval)) { @@ -75,6 +75,15 @@ public class IntervalUtils { return rawIntervals; } + /** + * Returns true if the interval string is the "unmapped" interval + * @param interval Interval to check + * @return true if the interval string is the "unmapped" interval + */ + public static boolean isUnmapped(String interval) { + return (interval != null && interval.trim().toLowerCase().equals("unmapped")); + } + /** * merge two interval lists, using an interval set rule * @param setOne a list of genomeLocs, in order (cannot be NULL) diff --git a/scala/qscript/chartl/expanded_targets.q b/scala/qscript/chartl/expanded_targets.q index ebf18f5b5..813e82758 100755 --- a/scala/qscript/chartl/expanded_targets.q +++ b/scala/qscript/chartl/expanded_targets.q @@ -1,6 +1,6 @@ import org.broadinstitute.sting.commandline.ArgumentCollection import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.library.ipf.ExpandIntervals +import org.broadinstitute.sting.queue.library.ipf.intervals.ExpandIntervals import org.broadinstitute.sting.queue.pipeline.PipelineArgumentCollection import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.utils.text.XReadLines diff --git a/scala/qscript/fullCallingPipeline.q b/scala/qscript/fullCallingPipeline.q index 4b23c1ac7..771ec7363 100755 --- a/scala/qscript/fullCallingPipeline.q +++ b/scala/qscript/fullCallingPipeline.q @@ -101,10 +101,8 @@ class fullCallingPipeline extends QScript { //val seq = qscript.machine //val expKind = qscript.protocol - // get contigs (needed for indel cleaning parallelism) - val contigs = IntervalUtils.distinctContigs( - qscript.pipeline.getProject.getReferenceFile, - List(qscript.pipeline.getProject.getIntervalList.getAbsolutePath)).toList + // get max num contigs for indel cleaning parallelism, plus 1 for -L unmapped + val numContigs = IntervalUtils.distinctContigs(qscript.pipeline.getProject.getReferenceFile).size + 1 for ( sample <- recalibratedSamples ) { val sampleId = sample.getId @@ -134,13 +132,12 @@ class fullCallingPipeline extends QScript { realigner.targetIntervals = targetCreator.out realigner.intervals = Nil realigner.intervalsString = Nil - realigner.scatterCount = num_cleaner_scatter_jobs min contigs.size + realigner.scatterCount = num_cleaner_scatter_jobs min numContigs realigner.rodBind :+= RodBind("dbsnp", dbsnpType, qscript.pipeline.getProject.getDbsnpFile) realigner.rodBind :+= RodBind("indels", "VCF", swapExt(realigner.reference_sequence.getParentFile, realigner.reference_sequence, "fasta", "1kg_pilot_indels.vcf")) // if scatter count is > 1, do standard scatter gather, if not, explicitly set up fix mates if (realigner.scatterCount > 1) { - realigner.intervalsString = contigs realigner.out = cleaned_bam // While gathering run fix mates. realigner.setupScatterFunction = { diff --git a/scala/src/HelloWorld.scala b/scala/src/HelloWorld.scala deleted file mode 100644 index 5c91a6d20..000000000 --- a/scala/src/HelloWorld.scala +++ /dev/null @@ -1,5 +0,0 @@ -object HelloWorld { - def main(args: Array[String]) { - println("Hello, world!") - } -} diff --git a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala index 256aab583..25d3bb699 100644 --- a/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala @@ -2,16 +2,11 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.commandline.ArgumentSource import org.broadinstitute.sting.utils.interval.IntervalUtils -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource import java.io.File -import net.sf.picard.util.IntervalList -import net.sf.samtools.SAMFileHeader import collection.JavaConversions._ -import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser} import org.broadinstitute.sting.queue.util.IOUtils import org.broadinstitute.sting.queue.function.scattergather.{CloneFunction, ScatterGatherableFunction, ScatterFunction} import org.broadinstitute.sting.queue.function.{QFunction, InProcessFunction} -import org.broadinstitute.sting.queue.QException /** * An interval scatter function. @@ -19,11 +14,30 @@ import org.broadinstitute.sting.queue.QException class IntervalScatterFunction extends ScatterFunction with InProcessFunction { var splitByContig = false + /** The total number of clone jobs that will be created. */ + private var scatterCount: Int = _ + + /** The reference sequence for the GATK function. */ private var referenceSequence: File = _ + + /** The runtime field to set for specifying an interval file. */ private var intervalsField: ArgumentSource = _ + + /** The runtime field to set for specifying an interval string. */ private var intervalsStringField: ArgumentSource = _ + + /** The list of interval files ("/path/to/interval.list") or interval strings ("chr1", "chr2") to parse into smaller parts. */ private var intervals: List[String] = Nil + /** Whether the laster scatter job should also include any unmapped reads. */ + private var includeUnmapped: Boolean = _ + + /** + * Checks if the function is scatter gatherable. + * @param originalFunction Function to check. + * @return true if the function is a GATK function with the reference sequence set. + * @throws IllegalArgumentException if -BTI or -BTIMR are set. QScripts should not try to scatter gather with those option set. + */ def isScatterGatherable(originalFunction: ScatterGatherableFunction) = { if (originalFunction.isInstanceOf[CommandLineGATK]) { val gatk = originalFunction.asInstanceOf[CommandLineGATK] @@ -32,18 +46,32 @@ class IntervalScatterFunction extends ScatterFunction with InProcessFunction { } else false } + /** + * Sets the scatter gatherable function. + * @param originalFunction Function to bind. + */ def setScatterGatherable(originalFunction: ScatterGatherableFunction) = { val gatk = originalFunction.asInstanceOf[CommandLineGATK] - this.referenceSequence = gatk.reference_sequence - this.intervals ++= gatk.intervalsString - this.intervals ++= gatk.intervals.map(_.toString) this.intervalsField = QFunction.findField(originalFunction.getClass, "intervals") this.intervalsStringField = QFunction.findField(originalFunction.getClass, "intervalsString") + this.scatterCount = originalFunction.scatterCount + this.referenceSequence = gatk.reference_sequence + if (gatk.intervals.isEmpty && gatk.intervalsString.isEmpty) { + this.intervals ++= IntervalUtils.distinctContigs(this.referenceSequence).toList + this.includeUnmapped = this.splitByContig + } else { + this.intervals ++= gatk.intervals.map(_.toString) + this.intervals ++= gatk.intervalsString.filterNot(interval => IntervalUtils.isUnmapped(interval)) + this.includeUnmapped = gatk.intervalsString.exists(interval => IntervalUtils.isUnmapped(interval)) + } } def initCloneInputs(cloneFunction: CloneFunction, index: Int) = { cloneFunction.setFieldValue(this.intervalsField, List(new File("scatter.intervals"))) - cloneFunction.setFieldValue(this.intervalsStringField, List.empty[String]) + if (index == scatterCount && includeUnmapped) + cloneFunction.setFieldValue(this.intervalsStringField, List("unmapped")) + else + cloneFunction.setFieldValue(this.intervalsStringField, List.empty[String]) } def bindCloneInputs(cloneFunction: CloneFunction, index: Int) = { diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala index 27179e771..57b84e552 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/intervals/ExpandIntervals.scala @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.queue.library.ipf +package org.broadinstitute.sting.queue.library.ipf.intervals import org.broadinstitute.sting.queue.function.InProcessFunction import org.broadinstitute.sting.commandline._ diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala index 8d9fc68e4..0a0136923 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/PipelineArgumentCollection.scala @@ -39,9 +39,8 @@ class PipelineArgumentCollection { @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false) var skip_cleaning = false - @Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 "+ - "cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unreacalibrated:/path/to/unrecal.bam)."+ - "Mutually exclusive with YAML",required=false, shortName="pBams") + @Input(doc="List of samples and bams (in the form sample_id k1:v1,k2:v2 cleaned:/path/to/cleaned.bam,recalibrated:/path/to/recal.bam,unreacalibrated:/path/to/unrecal.bam). Mutually exclusive with YAML", + required=false, shortName="pBams") var projectBams: File = _ @Input(doc="The project name. Mutually exclusive with YAML.", required = false, shortName="pName") diff --git a/scala/src/BaseTransitionTableCalculator.scala b/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala similarity index 98% rename from scala/src/BaseTransitionTableCalculator.scala rename to scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala index ddd3b8548..7fe249beb 100755 --- a/scala/src/BaseTransitionTableCalculator.scala +++ b/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala @@ -1,5 +1,5 @@ -/**package org.broadinstitute.sting.scala - +package org.broadinstitute.sting.scala +/** import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall} import java.io.File import net.sf.samtools.SAMRecord @@ -68,7 +68,7 @@ class TransitionTable() { } } -class BaseTransitionTableCalculator { // extends LocusWalker[Unit,Int] { +**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] { private var MIN_MAPPING_QUALITY = 30 private var MIN_BASE_QUALITY = 20 private var MIN_LOD = 5 diff --git a/scala/src/IntervalAnnotationWalker.scala b/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala similarity index 98% rename from scala/src/IntervalAnnotationWalker.scala rename to scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala index 50f7fafde..fa30fceff 100755 --- a/scala/src/IntervalAnnotationWalker.scala +++ b/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker} import org.broadinstitute.sting.commandline.{Output, Argument} import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc} import collection.mutable.{ListBuffer, HashSet} +import java.lang.Math class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] { @Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb") @@ -109,9 +110,9 @@ class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) { Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) + calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => { if ( b.equals(byteList(1)) ) { - a.dropRight(1) + (a.last ++ b) + a.dropRight(1) :+ (a.last ++ b) } else { - a + b + a :+ b } })) } diff --git a/scala/src/ScalaCountLoci.scala b/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala similarity index 100% rename from scala/src/ScalaCountLoci.scala rename to scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala