Thanks to Matt for walking me through a proper version of VCF_BAM_utilities! Feel free to add to it, or use it to get the samples in a VCF file, a BAM file, or a collection of BAM files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5223 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2011-02-09 18:08:27 +00:00
parent b992abb6eb
commit 947cc44854
3 changed files with 72 additions and 63 deletions

View File

@ -2,7 +2,7 @@ package oneoffs.fromer
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.util.BAMutilities
import org.broadinstitute.sting.queue.util.VCF_BAM_utilities
class PhaseSamples extends QScript {
qscript =>
@ -44,15 +44,21 @@ class PhaseSamples extends QScript {
// A target has a list of samples and bam files to use for phasing
class Target(val name: String, val samples: List[String], val bams: List[File]) {
def phasedVCFFile = new File(name + "." + outputPhased)
var prefix: String = outputPhased.getParent()
if (prefix == null)
prefix = ""
else
prefix = prefix + "/"
override def toString(): String = String.format("[Target %s with samples %s against bams %s]", name, samples, bams)
def phasedVCFFile = new File(prefix + name + "." + outputPhased.getName())
override def toString(): String = String.format("[Target %s [%s] with samples %s against bams %s]", name, phasedVCFFile, samples, bams)
}
def script = {
if (qscript.scatterCount > 0) throw new RuntimeException("scatter/gather currently not implemented")
val samples: List[String] = BAMutilities.getSamplesFromVCF(masterCalls)
val samples: List[String] = VCF_BAM_utilities.getSamplesFromVCF(masterCalls)
Console.out.printf("Samples are %s%n", samples)
val targets: List[Target] = bamsToTargets(samples, bams)
@ -68,14 +74,14 @@ class PhaseSamples extends QScript {
}
def bamsToTargets(samples: List[String], bamsIn: File): List[Target] = {
val bams: List[File] = BAMutilities.parseBamsInput(bamsIn)
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = BAMutilities.getMapOfBamsForSample(bams)
val bams: List[File] = VCF_BAM_utilities.parseBAMsInput(bamsIn)
val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(bams)
def buildTargets(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match {
case (Nil, y) =>
return Nil
case (subsamples, remaining) =>
return new Target("group" + count, subsamples, BAMutilities.findBamsForSamples(subsamples, sampleToBams)) ::
return new Target("group" + count, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) ::
buildTargets(remaining, count + 1)
}

View File

@ -1,56 +0,0 @@
package org.broadinstitute.sting.queue.util
import tools.nsc.io.Process
import java.io.File
import org.apache.commons.io.FilenameUtils
import scala.io.Source._
object BAMutilities {
def getSamplesFromVCF(calls: File): List[String] = {
val getSamplesCommand: String = "cat " + calls.getPath() + " | grep '^#CHROM' | head -1 | awk '{for (i = 10; i <= NF; i++) print $i}'"
//println("getSamplesCommand: " + getSamplesCommand)
return Process(getSamplesCommand).iterator toList
}
def parseBamsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match {
case "bam" => return List(bamsIn)
case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
}
def getMapOfBamsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
case x :: y =>
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBamsForSample(y)
val getBamSamplesCommand: String = "samtools view -H " + x.getPath() + " | grep '^@RG' | awk '{for (i = 1; i <= NF; i++) if (substr($i,1,3) == \"SM:\") print substr($i,4)}' | sort | uniq"
//println("getBamSamplesCommand: " + getBamSamplesCommand)
val bamSamples: List[String] = Process(getBamSamplesCommand).iterator toList
for (s <- bamSamples) {
if (!m.contains(s))
m += s -> scala.collection.mutable.Set.empty[File]
m(s) = m(s) + x
}
return m
}
def findBamsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
val l: List[File] = Nil
l ++ findBamsForSamplesHelper(samples, sampleToBams)
}
def findBamsForSamplesHelper(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): scala.collection.mutable.Set[File] = samples match {
case Nil => scala.collection.mutable.Set.empty[File]
case x :: y =>
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
if (sampleToBams.contains(x))
bamsForSampleX = sampleToBams(x)
return bamsForSampleX ++ findBamsForSamplesHelper(y, sampleToBams)
}
}

View File

@ -0,0 +1,59 @@
package org.broadinstitute.sting.queue.util
import java.io.File
import org.apache.commons.io.FilenameUtils
import scala.io.Source._
import net.sf.samtools.SAMFileReader
import org.broad.tribble.source.BasicFeatureSource
import org.broad.tribble.vcf.{VCFHeader, VCFCodec}
import scala.collection.JavaConversions._
object VCF_BAM_utilities {
def getSamplesFromVCF(vcfFile: File): List[String] = {
return BasicFeatureSource.getFeatureSource(vcfFile.getPath(), new VCFCodec()).getHeader().asInstanceOf[VCFHeader].getGenotypeSamples().toList
}
def getSamplesInBAM(bam: File): List[String] = {
return new SAMFileReader(bam).getFileHeader().getReadGroups().toList.map(srgr => srgr.getSample()).toSet.toList
}
def parseBAMsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match {
case "bam" => return List(bamsIn)
case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList
case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list")
}
def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match {
case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]]
case x :: y =>
val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y)
val bamSamples: List[String] = getSamplesInBAM(x)
for (s <- bamSamples) {
if (!m.contains(s))
m += s -> scala.collection.mutable.Set.empty[File]
m(s) = m(s) + x
}
return m
}
def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = {
def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match {
case Nil => scala.collection.mutable.Set.empty[File]
case x :: y =>
var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File]
if (sampleToBams.contains(x))
bamsForSampleX = sampleToBams(x)
return bamsForSampleX ++ findBAMsForSamplesHelper(y)
}
val l: List[File] = Nil
return l ++ findBAMsForSamplesHelper(samples)
}
}