Added queue/util/BAMutilities Object [with BAM and VCF parsing utilities], which is now used by my qscripts that robustly split runs by sample
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5214 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8040998c15
commit
8d0f1b75d5
|
|
@ -1,11 +1,8 @@
|
||||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
package oneoffs.fromer
|
||||||
|
|
||||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||||
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
|
|
||||||
import org.broadinstitute.sting.queue.QScript
|
import org.broadinstitute.sting.queue.QScript
|
||||||
import org.apache.commons.io.FilenameUtils
|
import org.broadinstitute.sting.queue.util.BAMutilities
|
||||||
import tools.nsc.io.Process;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType
|
|
||||||
import scala.io.Source._
|
|
||||||
|
|
||||||
class PhaseSamples extends QScript {
|
class PhaseSamples extends QScript {
|
||||||
qscript =>
|
qscript =>
|
||||||
|
|
@ -55,7 +52,7 @@ class PhaseSamples extends QScript {
|
||||||
def script = {
|
def script = {
|
||||||
if (qscript.scatterCount > 0) throw new RuntimeException("scatter/gather currently not implemented")
|
if (qscript.scatterCount > 0) throw new RuntimeException("scatter/gather currently not implemented")
|
||||||
|
|
||||||
val samples: List[String] = getSamples()
|
val samples: List[String] = BAMutilities.getSamplesFromVCF(masterCalls)
|
||||||
Console.out.printf("Samples are %s%n", samples)
|
Console.out.printf("Samples are %s%n", samples)
|
||||||
|
|
||||||
val targets: List[Target] = bamsToTargets(samples, bams)
|
val targets: List[Target] = bamsToTargets(samples, bams)
|
||||||
|
|
@ -71,67 +68,20 @@ class PhaseSamples extends QScript {
|
||||||
}
|
}
|
||||||
|
|
||||||
def bamsToTargets(samples: List[String], bamsIn: File): List[Target] = {
|
def bamsToTargets(samples: List[String], bamsIn: File): List[Target] = {
|
||||||
val bams: List[File] = parseBamsInput(bamsIn)
|
val bams: List[File] = BAMutilities.parseBamsInput(bamsIn)
|
||||||
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBamsForSample(bams)
|
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = BAMutilities.getMapOfBamsForSample(bams)
|
||||||
|
|
||||||
def buildTargets(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match {
|
def buildTargets(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match {
|
||||||
case (Nil, y) =>
|
case (Nil, y) =>
|
||||||
return Nil
|
return Nil
|
||||||
case (subsamples, remaining) =>
|
case (subsamples, remaining) =>
|
||||||
return new Target("group" + count, subsamples, findBamsForSamples(subsamples, sampleToBams)) ::
|
return new Target("group" + count, subsamples, BAMutilities.findBamsForSamples(subsamples, sampleToBams)) ::
|
||||||
buildTargets(remaining, count + 1)
|
buildTargets(remaining, count + 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
return buildTargets(samples, 0)
|
return buildTargets(samples, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
def getMapOfBamsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
|
|
||||||
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
|
|
||||||
|
|
||||||
case x :: y =>
|
|
||||||
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBamsForSample(y)
|
|
||||||
|
|
||||||
val getBamSamplesCommand: String = "samtools view -H " + x.getPath() + " | grep '^@RG' | awk '{for (i = 1; i <= NF; i++) if (substr($i,1,3) == \"SM:\") print substr($i,4)}' | sort | uniq"
|
|
||||||
//println("getBamSamplesCommand: " + getBamSamplesCommand)
|
|
||||||
val bamSamples: List[String] = Process(getBamSamplesCommand).iterator toList
|
|
||||||
|
|
||||||
for (s <- bamSamples) {
|
|
||||||
if (!m.contains(s))
|
|
||||||
m += s -> scala.collection.mutable.Set.empty[File]
|
|
||||||
|
|
||||||
m(s) = m(s) + x
|
|
||||||
}
|
|
||||||
|
|
||||||
return m
|
|
||||||
}
|
|
||||||
|
|
||||||
def findBamsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
|
|
||||||
val l: List[File] = Nil
|
|
||||||
l ++ findBamsForSamplesHelper(samples, sampleToBams)
|
|
||||||
}
|
|
||||||
|
|
||||||
def findBamsForSamplesHelper(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): scala.collection.mutable.Set[File] = samples match {
|
|
||||||
case Nil => scala.collection.mutable.Set.empty[File]
|
|
||||||
|
|
||||||
case x :: y =>
|
|
||||||
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
|
|
||||||
if (sampleToBams.contains(x))
|
|
||||||
bamsForSampleX = sampleToBams(x)
|
|
||||||
return bamsForSampleX ++ findBamsForSamplesHelper(y, sampleToBams)
|
|
||||||
}
|
|
||||||
|
|
||||||
def getSamples(): List[String] = {
|
|
||||||
val getSamplesCommand: String = "cat " + masterCalls.getPath() + " | grep '^#CHROM' | head -1 | awk '{for (i = 10; i <= NF; i++) print $i}'"
|
|
||||||
//println("getSamplesCommand: " + getSamplesCommand)
|
|
||||||
return Process(getSamplesCommand).iterator toList
|
|
||||||
}
|
|
||||||
|
|
||||||
def parseBamsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match {
|
|
||||||
case "bam" => return List(bamsIn)
|
|
||||||
case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList
|
|
||||||
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
|
|
||||||
}
|
|
||||||
|
|
||||||
class PhaseSamples(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.ReadBackedPhasing with CommandLineGATKArgs {
|
class PhaseSamples(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.ReadBackedPhasing with CommandLineGATKArgs {
|
||||||
this.rodBind :+= RodBind("variant", "VCF", qscript.masterCalls)
|
this.rodBind :+= RodBind("variant", "VCF", qscript.masterCalls)
|
||||||
this.out = t.phasedVCFFile
|
this.out = t.phasedVCFFile
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,8 @@ package oneoffs.fromer
|
||||||
|
|
||||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||||
import org.broadinstitute.sting.queue.QScript
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.DownsampleType
|
import org.broadinstitute.sting.gatk.DownsampleType
|
||||||
|
import org.broadinstitute.sting.queue.util.BAMutilities
|
||||||
|
|
||||||
|
|
||||||
class ReadDepthCNVanalysis extends QScript {
|
class ReadDepthCNVanalysis extends QScript {
|
||||||
|
|
@ -24,6 +24,9 @@ class ReadDepthCNVanalysis extends QScript {
|
||||||
@Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
|
@Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false)
|
||||||
var scatterCountInput = 0
|
var scatterCountInput = 0
|
||||||
|
|
||||||
|
@Input(doc = "Samples to phase together. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false)
|
||||||
|
var samplesPerJob = 1
|
||||||
|
|
||||||
@Output(doc = "DoC file to output", shortName = "o", required = true)
|
@Output(doc = "DoC file to output", shortName = "o", required = true)
|
||||||
var outputDoC: File = _
|
var outputDoC: File = _
|
||||||
|
|
||||||
|
|
@ -44,11 +47,40 @@ class ReadDepthCNVanalysis extends QScript {
|
||||||
this.logging_level = "INFO"
|
this.logging_level = "INFO"
|
||||||
}
|
}
|
||||||
|
|
||||||
def script = {
|
// A target has a list of samples and bam files to use for DoC
|
||||||
add(new DepthOfCoverage(bams, outputDoC))
|
class Target(val name: String, val samples: List[String], val bams: List[File]) {
|
||||||
|
def DoC_output = new File(name + "." + outputDoC)
|
||||||
|
|
||||||
|
override def toString(): String = String.format("[Target %s with samples %s against bams %s]", name, samples, bams)
|
||||||
}
|
}
|
||||||
|
|
||||||
class DepthOfCoverage(bam: File, docOutPath: File) extends org.broadinstitute.sting.queue.extensions.gatk.DepthOfCoverage with CommandLineGATKArgs {
|
def script = {
|
||||||
|
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = BAMutilities.getMapOfBamsForSample(BAMutilities.parseBamsInput(bams))
|
||||||
|
val samples: List[String] = sampleToBams.keys.toList
|
||||||
|
Console.out.printf("Samples are %s%n", samples)
|
||||||
|
|
||||||
|
val targets: List[Target] = buildTargets(samples, sampleToBams)
|
||||||
|
|
||||||
|
for (target <- targets) {
|
||||||
|
Console.out.printf("Target is %s%n", target)
|
||||||
|
add(new DoC(target))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def buildTargets(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[Target] = {
|
||||||
|
|
||||||
|
def buildTargetsHelper(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match {
|
||||||
|
case (Nil, y) =>
|
||||||
|
return Nil
|
||||||
|
case (subsamples, remaining) =>
|
||||||
|
return new Target("group" + count, subsamples, BAMutilities.findBamsForSamples(subsamples, sampleToBams)) ::
|
||||||
|
buildTargetsHelper(remaining, count + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
return buildTargetsHelper(samples, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
class DoC(t: Target) extends org.broadinstitute.sting.queue.extensions.gatk.DepthOfCoverage with CommandLineGATKArgs {
|
||||||
this.omitIntervalStatistics = false
|
this.omitIntervalStatistics = false
|
||||||
this.omitDepthOutputAtEachBase = true
|
this.omitDepthOutputAtEachBase = true
|
||||||
this.omitLocusTable = true
|
this.omitLocusTable = true
|
||||||
|
|
@ -56,8 +88,8 @@ class ReadDepthCNVanalysis extends QScript {
|
||||||
this.minBaseQuality = Some(0)
|
this.minBaseQuality = Some(0)
|
||||||
this.minMappingQuality = Some(0)
|
this.minMappingQuality = Some(0)
|
||||||
|
|
||||||
this.out = docOutPath
|
this.out = t.DoC_output
|
||||||
this.input_file :+= bam
|
this.input_file = t.bams
|
||||||
|
|
||||||
this.dcov = Some(MAX_DEPTH)
|
this.dcov = Some(MAX_DEPTH)
|
||||||
this.downsampling_type = Some(DownsampleType.BY_SAMPLE)
|
this.downsampling_type = Some(DownsampleType.BY_SAMPLE)
|
||||||
|
|
@ -68,7 +100,7 @@ class ReadDepthCNVanalysis extends QScript {
|
||||||
|
|
||||||
this.scatterCount = scatterCountInput
|
this.scatterCount = scatterCountInput
|
||||||
|
|
||||||
override def dotString = "DOC: " + bam.getName
|
override def dotString = "DOC: " + t.DoC_output
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
package org.broadinstitute.sting.queue.util
|
||||||
|
|
||||||
|
import tools.nsc.io.Process
|
||||||
|
import java.io.File
|
||||||
|
import org.apache.commons.io.FilenameUtils
|
||||||
|
import scala.io.Source._
|
||||||
|
|
||||||
|
object BAMutilities {
|
||||||
|
|
||||||
|
def getSamplesFromVCF(calls: File): List[String] = {
|
||||||
|
val getSamplesCommand: String = "cat " + calls.getPath() + " | grep '^#CHROM' | head -1 | awk '{for (i = 10; i <= NF; i++) print $i}'"
|
||||||
|
//println("getSamplesCommand: " + getSamplesCommand)
|
||||||
|
return Process(getSamplesCommand).iterator toList
|
||||||
|
}
|
||||||
|
|
||||||
|
def parseBamsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match {
|
||||||
|
case "bam" => return List(bamsIn)
|
||||||
|
case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList
|
||||||
|
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
|
||||||
|
}
|
||||||
|
|
||||||
|
def getMapOfBamsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
|
||||||
|
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
|
||||||
|
|
||||||
|
case x :: y =>
|
||||||
|
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBamsForSample(y)
|
||||||
|
|
||||||
|
val getBamSamplesCommand: String = "samtools view -H " + x.getPath() + " | grep '^@RG' | awk '{for (i = 1; i <= NF; i++) if (substr($i,1,3) == \"SM:\") print substr($i,4)}' | sort | uniq"
|
||||||
|
//println("getBamSamplesCommand: " + getBamSamplesCommand)
|
||||||
|
val bamSamples: List[String] = Process(getBamSamplesCommand).iterator toList
|
||||||
|
|
||||||
|
for (s <- bamSamples) {
|
||||||
|
if (!m.contains(s))
|
||||||
|
m += s -> scala.collection.mutable.Set.empty[File]
|
||||||
|
|
||||||
|
m(s) = m(s) + x
|
||||||
|
}
|
||||||
|
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
def findBamsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
|
||||||
|
val l: List[File] = Nil
|
||||||
|
l ++ findBamsForSamplesHelper(samples, sampleToBams)
|
||||||
|
}
|
||||||
|
|
||||||
|
def findBamsForSamplesHelper(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): scala.collection.mutable.Set[File] = samples match {
|
||||||
|
case Nil => scala.collection.mutable.Set.empty[File]
|
||||||
|
|
||||||
|
case x :: y =>
|
||||||
|
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
|
||||||
|
if (sampleToBams.contains(x))
|
||||||
|
bamsForSampleX = sampleToBams(x)
|
||||||
|
return bamsForSampleX ++ findBamsForSamplesHelper(y, sampleToBams)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue