diff --git a/scala/qscript/oneoffs/fromer/PhaseSamples.scala b/scala/qscript/oneoffs/fromer/PhaseSamples.scala index 40d34e855..8f382b230 100644 --- a/scala/qscript/oneoffs/fromer/PhaseSamples.scala +++ b/scala/qscript/oneoffs/fromer/PhaseSamples.scala @@ -2,7 +2,7 @@ package oneoffs.fromer import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.BAMutilities +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities class PhaseSamples extends QScript { qscript => @@ -44,15 +44,21 @@ class PhaseSamples extends QScript { // A target has a list of samples and bam files to use for phasing class Target(val name: String, val samples: List[String], val bams: List[File]) { - def phasedVCFFile = new File(name + "." + outputPhased) + var prefix: String = outputPhased.getParent() + if (prefix == null) + prefix = "" + else + prefix = prefix + "/" - override def toString(): String = String.format("[Target %s with samples %s against bams %s]", name, samples, bams) + def phasedVCFFile = new File(prefix + name + "." + outputPhased.getName()) + + override def toString(): String = String.format("[Target %s [%s] with samples %s against bams %s]", name, phasedVCFFile, samples, bams) } def script = { if (qscript.scatterCount > 0) throw new RuntimeException("scatter/gather currently not implemented") - val samples: List[String] = BAMutilities.getSamplesFromVCF(masterCalls) + val samples: List[String] = VCF_BAM_utilities.getSamplesFromVCF(masterCalls) Console.out.printf("Samples are %s%n", samples) val targets: List[Target] = bamsToTargets(samples, bams) @@ -68,14 +74,14 @@ class PhaseSamples extends QScript { } def bamsToTargets(samples: List[String], bamsIn: File): List[Target] = { - val bams: List[File] = BAMutilities.parseBamsInput(bamsIn) - val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = BAMutilities.getMapOfBamsForSample(bams) + val bams: List[File] = VCF_BAM_utilities.parseBAMsInput(bamsIn) + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(bams) def buildTargets(samples: List[String], count: Int): List[Target] = (samples splitAt samplesPerJob) match { case (Nil, y) => return Nil case (subsamples, remaining) => - return new Target("group" + count, subsamples, BAMutilities.findBamsForSamples(subsamples, sampleToBams)) :: + return new Target("group" + count, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildTargets(remaining, count + 1) } diff --git a/scala/src/org/broadinstitute/sting/queue/util/BAMutilities.scala b/scala/src/org/broadinstitute/sting/queue/util/BAMutilities.scala deleted file mode 100644 index 7a5b9237b..000000000 --- a/scala/src/org/broadinstitute/sting/queue/util/BAMutilities.scala +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.queue.util - -import tools.nsc.io.Process -import java.io.File -import org.apache.commons.io.FilenameUtils -import scala.io.Source._ - -object BAMutilities { - - def getSamplesFromVCF(calls: File): List[String] = { - val getSamplesCommand: String = "cat " + calls.getPath() + " | grep '^#CHROM' | head -1 | awk '{for (i = 10; i <= NF; i++) print $i}'" - //println("getSamplesCommand: " + getSamplesCommand) - return Process(getSamplesCommand).iterator toList - } - - def parseBamsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match { - case "bam" => return List(bamsIn) - case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList - case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list") - } - - def getMapOfBamsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match { - case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] - - case x :: y => - val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBamsForSample(y) - - val getBamSamplesCommand: String = "samtools view -H " + x.getPath() + " | grep '^@RG' | awk '{for (i = 1; i <= NF; i++) if (substr($i,1,3) == \"SM:\") print substr($i,4)}' | sort | uniq" - //println("getBamSamplesCommand: " + getBamSamplesCommand) - val bamSamples: List[String] = Process(getBamSamplesCommand).iterator toList - - for (s <- bamSamples) { - if (!m.contains(s)) - m += s -> scala.collection.mutable.Set.empty[File] - - m(s) = m(s) + x - } - - return m - } - - def findBamsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = { - val l: List[File] = Nil - l ++ findBamsForSamplesHelper(samples, sampleToBams) - } - - def findBamsForSamplesHelper(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): scala.collection.mutable.Set[File] = samples match { - case Nil => scala.collection.mutable.Set.empty[File] - - case x :: y => - var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File] - if (sampleToBams.contains(x)) - bamsForSampleX = sampleToBams(x) - return bamsForSampleX ++ findBamsForSamplesHelper(y, sampleToBams) - } -} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala new file mode 100644 index 000000000..91b7bd874 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import org.apache.commons.io.FilenameUtils +import scala.io.Source._ +import net.sf.samtools.SAMFileReader +import org.broad.tribble.source.BasicFeatureSource +import org.broad.tribble.vcf.{VCFHeader, VCFCodec} +import scala.collection.JavaConversions._ + +object VCF_BAM_utilities { + + def getSamplesFromVCF(vcfFile: File): List[String] = { + return BasicFeatureSource.getFeatureSource(vcfFile.getPath(), new VCFCodec()).getHeader().asInstanceOf[VCFHeader].getGenotypeSamples().toList + } + + def getSamplesInBAM(bam: File): List[String] = { + return new SAMFileReader(bam).getFileHeader().getReadGroups().toList.map(srgr => srgr.getSample()).toSet.toList + } + + def parseBAMsInput(bamsIn: File): List[File] = FilenameUtils.getExtension(bamsIn.getPath) match { + case "bam" => return List(bamsIn) + case "list" => return (for (line <- fromFile(bamsIn).getLines) yield new File(line)).toList + case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list") + } + + def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match { + case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] + + case x :: y => + val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y) + val bamSamples: List[String] = getSamplesInBAM(x) + + for (s <- bamSamples) { + if (!m.contains(s)) + m += s -> scala.collection.mutable.Set.empty[File] + + m(s) = m(s) + x + } + + return m + } + + def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = { + + def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match { + case Nil => scala.collection.mutable.Set.empty[File] + + case x :: y => + var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File] + if (sampleToBams.contains(x)) + bamsForSampleX = sampleToBams(x) + return bamsForSampleX ++ findBAMsForSamplesHelper(y) + } + + val l: List[File] = Nil + return l ++ findBAMsForSamplesHelper(samples) + } +} \ No newline at end of file