gatk-3.8/scala/test/org/broadinstitute/sting/queue/pipeline/FullCallingPipelineTest.scala

173 lines
7.4 KiB
Scala

package org.broadinstitute.sting.queue.pipeline
import org.testng.annotations.{DataProvider, Test}
import collection.JavaConversions._
import java.io.File
import org.broadinstitute.sting.datasources.pipeline.{PipelineSample, PipelineProject, Pipeline}
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.{WalkerTest, BaseTest}
import java.text.SimpleDateFormat
import java.util.Date
class FullCallingPipelineTest extends BaseTest {
def datasets = List(k1gChr20Dataset, k1gExomeDataset)
private final val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/"
val k1gChr20Dataset = {
val dataset = newK1gDataset("Barcoded_1000G_WEx_chr20")
dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"))
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.all.counter.nCalledLoci", 1352)
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.known.counter.nCalledLoci", 1128)
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.novel.counter.nCalledLoci", 224)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.all.titv.tiTvRatio", 3.6621)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.known.titv.tiTvRatio", 3.7395)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.novel.titv.tiTvRatio", 3.3077)
dataset.jobQueue = "hour"
dataset
}
val k1gExomeDataset = {
val dataset = newK1gDataset("Barcoded_1000G_WEx")
dataset.pipeline.getProject.setIntervalList(new File(BaseTest.GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"))
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.all.counter.nCalledLoci", 50936)
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.known.counter.nCalledLoci", 40956)
dataset.validations :+= new IntegerValidation("eval.dbsnp.all.called.novel.counter.nCalledLoci", 9980)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.all.titv.tiTvRatio", 3.2724)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.known.titv.tiTvRatio", 3.3349)
dataset.validations :+= new DoubleValidation("eval.dbsnp.all.called.novel.titv.tiTvRatio", 3.0340)
dataset.jobQueue = "gsa"
dataset.bigMemQueue = "gsa"
dataset
}
def newK1gDataset(projectName: String) = {
val project = new PipelineProject
project.setName(projectName)
project.setReferenceFile(new File(BaseTest.hg19Reference))
project.setDbsnpFile(new File(BaseTest.b37dbSNP129))
val squid = "C426"
val ids = List(
"NA19651","NA19655","NA19669","NA19834","HG01440",
"NA12342","NA12748","NA19649","NA19652","NA19654")
var samples = List.empty[PipelineSample]
for (id <- ids) {
val sample = new PipelineSample
sample.setId(projectName + "_" + id)
sample.setBamFiles(Map("recalibrated" -> new File("/seq/picard_aggregation/%1$s/%2$s/v6/%2$s.bam".format(squid,id))))
sample.setTags(Map("SQUIDProject" -> squid, "CollaboratorID" -> id))
samples :+= sample
}
val pipeline = new Pipeline
pipeline.setProject(project)
pipeline.setSamples(samples)
val dataset = new PipelineDataset
dataset.pipeline = pipeline
dataset.refseq = BaseTest.hg19Refseq
dataset.targetTiTv = "3.0"
dataset
}
@DataProvider(name="datasets")//, parallel=true)
final def convertDatasets: Array[Array[AnyRef]] =
datasets.map(dataset => Array(dataset.asInstanceOf[AnyRef])).toArray
@Test(dataProvider="datasets")
def testFullCallingPipeline(dataset: PipelineDataset) = {
val projectName = dataset.pipeline.getProject.getName
val testName = "fullCallingPipeline-" + projectName
val yamlFile = writeTempYaml(dataset.pipeline)
var cleanType = "cleaned"
// Run the pipeline with the expected inputs.
var pipelineCommand = ("-retry 1 -S scala/qscript/fullCallingPipeline.q" +
" -jobProject %s -Y %s -refseqTable %s" +
" --gatkjar %s/dist/GenomeAnalysisTK.jar")
.format(projectName, yamlFile, dataset.refseq, new File(".").getAbsolutePath)
if (!dataset.runIndelRealigner) {
pipelineCommand += " -skipCleaning"
cleanType = "uncleaned"
}
if (dataset.jobQueue != null)
pipelineCommand += " -jobQueue " + dataset.jobQueue
if (dataset.bigMemQueue != null)
pipelineCommand += " -bigMemQueue " + dataset.bigMemQueue
// Run the test, at least checking if the command compiles
PipelineTest.executeTest(testName, pipelineCommand, null)
// If actually running, evaluate the output validating the expressions.
if (PipelineTest.run) {
// path where the pipeline should have output the handfiltered vcf
val handFilteredVcf = PipelineTest.runDir(testName) + "SnpCalls/%s.%s.annotated.handfiltered.vcf".format(projectName, cleanType)
// eval modules to record in the validation directory
val evalModules = List("CompOverlap", "CountFunctionalClasses", "CountVariants", "SimpleMetricsBySample", "TiTvVariantEvaluator")
// write the report to the shared validation data location
val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss")
val reportLocation = "%s%s/validation.%s.eval".format(validationReportsDataLocation, testName, formatter.format(new Date))
new File(reportLocation).getParentFile.mkdirs
// Run variant eval generating the report and validating the pipeline vcfs.
var walkerCommand = ("-T VariantEval -R %s -D %s -B:eval,VCF %s" +
" -E %s -reportType R -reportLocation %s -L %s")
.format(
dataset.pipeline.getProject.getReferenceFile, dataset.pipeline.getProject.getDbsnpFile, handFilteredVcf,
evalModules.mkString(" -E "), reportLocation, dataset.pipeline.getProject.getIntervalList)
for (validation <- dataset.validations) {
walkerCommand += " -summary %s".format(validation.metric)
walkerCommand += " -validate '%1$s >= %2$s' -validate '%1$s <= %3$s'".format(
validation.metric, validation.min, validation.max)
}
WalkerTest.executeTest("fullCallingPipelineValidate-" + projectName, walkerCommand, null)
}
}
class PipelineDataset(
var pipeline: Pipeline = null,
var refseq: String = null,
var targetTiTv: String = null,
var validations: List[PipelineValidation] = Nil,
var jobQueue: String = null,
var bigMemQueue: String = null,
var runIndelRealigner: Boolean = false) {
override def toString = pipeline.getProject.getName
}
class PipelineValidation(val metric: String, val min: String, val max: String) {
}
class IntegerValidation(metric: String, target: Int)
extends PipelineValidation(metric,
(target * .99).floor.toInt.toString, (target * 1.01).ceil.toInt.toString) {
}
class DoubleValidation(metric: String, target: Double)
extends PipelineValidation(metric,
"%.2f".format((target * 99).floor / 100), "%.2f".format((target * 101).ceil / 100)) {
}
private def writeTempYaml(pipeline: Pipeline) = {
val tempFile = File.createTempFile(pipeline.getProject.getName + "-", ".yaml")
tempFile.deleteOnExit
YamlUtils.dump(pipeline, tempFile)
tempFile
}
}