From d3e2352072d0797bef54cf1c336ad8a1d554767b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 7 Jan 2013 14:49:57 -0500 Subject: [PATCH] Moved processing pipelines to private These pipelines were supposed to serve as an example for the community, they were written a long-long-long time ago and are being used today by users as the 'best practice pipeline'. Unless we decide we want to support and maintain an example best-practices pipeline, I'm moving these to private. --- .../qscripts/DataProcessingPipeline.scala | 496 ------------------ .../MethodsDevelopmentCallingPipeline.scala | 373 ------------- .../qscripts/PacbioProcessingPipeline.scala | 188 ------- 3 files changed, 1057 deletions(-) delete mode 100755 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala delete mode 100755 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala delete mode 100755 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala deleted file mode 100755 index 165e6a4e9..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ /dev/null @@ -1,496 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.extensions.picard._ -import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel -import org.broadinstitute.sting.utils.baq.BAQ.CalculationMode - -import collection.JavaConversions._ -import net.sf.samtools.SAMFileReader -import net.sf.samtools.SAMFileHeader.SortOrder - -import org.broadinstitute.sting.queue.util.QScriptUtils -import org.broadinstitute.sting.queue.function.ListWriterFunction -import org.broadinstitute.sting.commandline.Hidden -import org.broadinstitute.sting.commandline - -class DataProcessingPipeline extends QScript { - qscript => - - /**************************************************************************** - * Required Parameters - ****************************************************************************/ - - - @Input(doc="input BAM file - or list of BAM files", fullName="input", shortName="i", required=true) - var input: File = _ - - @Input(doc="Reference fasta file", fullName="reference", shortName="R", required=true) - var reference: File = _ - - @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true) - var dbSNP: Seq[File] = Seq() - - /**************************************************************************** - * Optional Parameters - ****************************************************************************/ - - @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false) - var indels: Seq[File] = Seq() - - @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) - var bwaPath: File = _ - - @Argument(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) - var projectName: String = "project" - - @Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false) - var outputDir: String = "" - - @Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false) - var intervalString: String = "" - - @Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false) - var intervals: File = _ - - @Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false) - var cleaningModel: String = "USE_READS" - - @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false) - var useBWAse: Boolean = false - - @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) - var useBWApe: Boolean = false - - @Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false) - var useBWAsw: Boolean = false - - @Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) - var bwaThreads: Int = 1 - - @Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false) - var validation: Boolean = false - - - /**************************************************************************** - * Hidden Parameters - ****************************************************************************/ - @Hidden - @Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false) - var nContigs: Int = -1 - - @Hidden - @Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) - var defaultPlatform: String = "" - - @Hidden - @Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) - var testMode: Boolean = false - - - /**************************************************************************** - * Global Variables - ****************************************************************************/ - - val queueLogDir: String = ".qlog/" // Gracefully hide Queue's output - - var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS - - - - - /**************************************************************************** - * Helper classes and methods - ****************************************************************************/ - - class ReadGroup (val id: String, - val lb: String, - val pl: String, - val pu: String, - val sm: String, - val cn: String, - val ds: String) - {} - - // Utility function to merge all bam files of similar samples. Generates one BAM file per sample. - // It uses the sample information on the header of the input BAM files. - // - // Because the realignment only happens after these scripts are executed, in case you are using - // bwa realignment, this function will operate over the original bam files and output over the - // (to be realigned) bam files. - def createSampleFiles(bamFiles: Seq[File], realignedBamFiles: Seq[File]): Map[String, Seq[File]] = { - - // Creating a table with SAMPLE information from each input BAM file - val sampleTable = scala.collection.mutable.Map.empty[String, Seq[File]] - val realignedIterator = realignedBamFiles.iterator - for (bam <- bamFiles) { - val rBam = realignedIterator.next() // advance to next element in the realignedBam list so they're in sync. - - val samReader = new SAMFileReader(bam) - val header = samReader.getFileHeader - val readGroups = header.getReadGroups - - // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file - // with PrintReads to separate the samples. Tell user to do it himself! - assert(!QScriptUtils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam) - - // Fill out the sample table with the readgroups in this file - for (rg <- readGroups) { - val sample = rg.getSample - if (!sampleTable.contains(sample)) - sampleTable(sample) = Seq(rBam) - else if ( !sampleTable(sample).contains(rBam)) - sampleTable(sample) :+= rBam - } - } - sampleTable.toMap - } - - // Rebuilds the Read Group string to give BWA - def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) { - val readGroups = samReader.getFileHeader.getReadGroups - var index: Int = readGroups.length - for (rg <- readGroups) { - val intermediateInBam: File = if (index == readGroups.length) { inBam } else { swapExt(outBam, ".bam", index+1 + "-rg.bam") } - val intermediateOutBam: File = if (index > 1) {swapExt(outBam, ".bam", index + "-rg.bam") } else { outBam} - val readGroup = new ReadGroup(rg.getReadGroupId, rg.getLibrary, rg.getPlatform, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription) - add(addReadGroup(intermediateInBam, intermediateOutBam, readGroup)) - index = index - 1 - } - } - - // Takes a list of processed BAM files and realign them using the BWA option requested (bwase or bwape). - // Returns a list of realigned BAM files. - def performAlignment(bams: Seq[File]): Seq[File] = { - var realignedBams: Seq[File] = Seq() - var index = 1 - for (bam <- bams) { - // first revert the BAM file to the original qualities - val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") - val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") - val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") - val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam") - val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam") - - if (useBWAse) { - val revertedBAM = revertBAM(bam, true) - add(bwa_aln_se(revertedBAM, saiFile1), - bwa_sam_se(revertedBAM, saiFile1, realignedSamFile)) - } - else if (useBWApe) { - val revertedBAM = revertBAM(bam, true) - add(bwa_aln_pe(revertedBAM, saiFile1, 1), - bwa_aln_pe(revertedBAM, saiFile2, 2), - bwa_sam_pe(revertedBAM, saiFile1, saiFile2, realignedSamFile)) - } - else if (useBWAsw) { - val revertedBAM = revertBAM(bam, false) - val fastQ = swapExt(revertedBAM, ".bam", ".fq") - add(convertToFastQ(revertedBAM, fastQ), - bwa_sw(fastQ, realignedSamFile)) - } - add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate)) - addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam)) - realignedBams :+= rgRealignedBamFile - index = index + 1 - } - realignedBams - } - - def getIndelCleaningModel: ConsensusDeterminationModel = { - if (cleaningModel == "KNOWNS_ONLY") - ConsensusDeterminationModel.KNOWNS_ONLY - else if (cleaningModel == "USE_SW") - ConsensusDeterminationModel.USE_SW - else - ConsensusDeterminationModel.USE_READS - } - - def revertBams(bams: Seq[File], removeAlignmentInformation: Boolean): Seq[File] = { - var revertedBAMList: Seq[File] = Seq() - for (bam <- bams) - revertedBAMList :+= revertBAM(bam, removeAlignmentInformation) - revertedBAMList - } - - def revertBAM(bam: File, removeAlignmentInformation: Boolean): File = { - val revertedBAM = swapExt(bam, ".bam", ".reverted.bam") - add(revert(bam, revertedBAM, removeAlignmentInformation)) - revertedBAM - } - - /**************************************************************************** - * Main script - ****************************************************************************/ - - - def script() { - // final output list of processed bam files - var cohortList: Seq[File] = Seq() - - // sets the model for the Indel Realigner - cleanModelEnum = getIndelCleaningModel - - // keep a record of the number of contigs in the first bam file in the list - val bams = QScriptUtils.createSeqFromFile(input) - if (nContigs < 0) - nContigs = QScriptUtils.getNumberOfContigs(bams(0)) - - val realignedBAMs = if (useBWApe || useBWAse || useBWAsw) {performAlignment(bams)} else {revertBams(bams, false)} - - // generate a BAM file per sample joining all per lane files if necessary - val sampleBAMFiles: Map[String, Seq[File]] = createSampleFiles(bams, realignedBAMs) - - // if this is a 'knowns only' indel realignment run, do it only once for all samples. - val globalIntervals = new File(outputDir + projectName + ".intervals") - if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(null, globalIntervals)) - - // put each sample through the pipeline - for ((sample, bamList) <- sampleBAMFiles) { - - // BAM files generated by the pipeline - val bam = new File(qscript.projectName + "." + sample + ".bam") - val cleanedBam = swapExt(bam, ".bam", ".clean.bam") - val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam") - val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam") - - // Accessory files - val targetIntervals = if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")} - val metricsFile = swapExt(bam, ".bam", ".metrics") - val preRecalFile = swapExt(bam, ".bam", ".pre_recal.table") - val postRecalFile = swapExt(bam, ".bam", ".post_recal.table") - val preOutPath = swapExt(bam, ".bam", ".pre") - val postOutPath = swapExt(bam, ".bam", ".post") - val preValidateLog = swapExt(bam, ".bam", ".pre.validation") - val postValidateLog = swapExt(bam, ".bam", ".post.validation") - - - // Validation is an optional step for the BAM file generated after - // alignment and the final bam file of the pipeline. - if (validation) { - for (sampleFile <- bamList) - add(validate(sampleFile, preValidateLog), - validate(recalBam, postValidateLog)) - } - - if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) - add(target(bamList, targetIntervals)) - - add(clean(bamList, targetIntervals, cleanedBam), - dedup(cleanedBam, dedupedBam, metricsFile), - cov(dedupedBam, preRecalFile), - recal(dedupedBam, preRecalFile, recalBam), - cov(recalBam, postRecalFile)) - - - cohortList :+= recalBam - } - - // output a BAM list with all the processed per sample files - val cohortFile = new File(qscript.outputDir + qscript.projectName + ".cohort.list") - add(writeList(cohortList, cohortFile)) - } - - - - /**************************************************************************** - * Classes (GATK Walkers) - ****************************************************************************/ - - - - // General arguments to non-GATK tools - trait ExternalCommonArgs extends CommandLineFunction { - this.memoryLimit = 4 - this.isIntermediate = true - } - - // General arguments to GATK walkers - trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { - this.reference_sequence = qscript.reference - } - - trait SAMargs extends PicardBamFunction with ExternalCommonArgs { - this.maxRecordsInRam = 100000 - } - - case class target (inBams: Seq[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { - if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY) - this.input_file = inBams - this.out = outIntervals - this.mismatchFraction = 0.0 - this.known ++= qscript.dbSNP - if (indels != null) - this.known ++= qscript.indels - this.scatterCount = nContigs - this.analysisName = queueLogDir + outIntervals + ".target" - this.jobName = queueLogDir + outIntervals + ".target" - } - - case class clean (inBams: Seq[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs { - this.input_file = inBams - this.targetIntervals = tIntervals - this.out = outBam - this.known ++= qscript.dbSNP - if (qscript.indels != null) - this.known ++= qscript.indels - this.consensusDeterminationModel = cleanModelEnum - this.compress = 0 - this.noPGTag = qscript.testMode; - this.scatterCount = nContigs - this.analysisName = queueLogDir + outBam + ".clean" - this.jobName = queueLogDir + outBam + ".clean" - } - - case class cov (inBam: File, outRecalFile: File) extends BaseRecalibrator with CommandLineGATKArgs { - this.knownSites ++= qscript.dbSNP - this.covariate ++= Seq("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate") - this.input_file :+= inBam - this.disable_indel_quals = true - this.out = outRecalFile - if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform - if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) - else if (qscript.intervals != null) this.intervals :+= qscript.intervals - this.scatterCount = nContigs - this.analysisName = queueLogDir + outRecalFile + ".covariates" - this.jobName = queueLogDir + outRecalFile + ".covariates" - } - - case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs { - this.input_file :+= inBam - this.BQSR = inRecalFile - this.baq = CalculationMode.CALCULATE_AS_NECESSARY - this.out = outBam - if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) - else if (qscript.intervals != null) this.intervals :+= qscript.intervals - this.scatterCount = nContigs - this.isIntermediate = false - this.analysisName = queueLogDir + outBam + ".recalibration" - this.jobName = queueLogDir + outBam + ".recalibration" - } - - - - /**************************************************************************** - * Classes (non-GATK programs) - ****************************************************************************/ - - - case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs { - this.input :+= inBam - this.output = outBam - this.metrics = metricsFile - this.memoryLimit = 16 - this.analysisName = queueLogDir + outBam + ".dedup" - this.jobName = queueLogDir + outBam + ".dedup" - } - - case class joinBams (inBams: Seq[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { - this.input = inBams - this.output = outBam - this.analysisName = queueLogDir + outBam + ".joinBams" - this.jobName = queueLogDir + outBam + ".joinBams" - } - - case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs { - this.input :+= inSam - this.output = outBam - this.sortOrder = sortOrderP - this.analysisName = queueLogDir + outBam + ".sortSam" - this.jobName = queueLogDir + outBam + ".sortSam" - } - - case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs { - this.input :+= inBam - this.output = outLog - this.REFERENCE_SEQUENCE = qscript.reference - this.isIntermediate = false - this.analysisName = queueLogDir + outLog + ".validate" - this.jobName = queueLogDir + outLog + ".validate" - } - - - case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs { - this.input :+= inBam - this.output = outBam - this.RGID = readGroup.id - this.RGCN = readGroup.cn - this.RGDS = readGroup.ds - this.RGLB = readGroup.lb - this.RGPL = readGroup.pl - this.RGPU = readGroup.pu - this.RGSM = readGroup.sm - this.analysisName = queueLogDir + outBam + ".rg" - this.jobName = queueLogDir + outBam + ".rg" - } - - case class revert (inBam: File, outBam: File, removeAlignmentInfo: Boolean) extends RevertSam with ExternalCommonArgs { - this.output = outBam - this.input :+= inBam - this.removeAlignmentInformation = removeAlignmentInfo; - this.sortOrder = if (removeAlignmentInfo) {SortOrder.queryname} else {SortOrder.coordinate} - this.analysisName = queueLogDir + outBam + "revert" - this.jobName = queueLogDir + outBam + ".revert" - } - - case class convertToFastQ (inBam: File, outFQ: File) extends SamToFastq with ExternalCommonArgs { - this.input :+= inBam - this.fastq = outFQ - this.analysisName = queueLogDir + outFQ + "convert_to_fastq" - this.jobName = queueLogDir + outFQ + ".convert_to_fastq" - } - - case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs { - @Input(doc="bam file to be aligned") var bam = inBam - @Output(doc="output sai file") var sai = outSai - def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai - this.analysisName = queueLogDir + outSai + ".bwa_aln_se" - this.jobName = queueLogDir + outSai + ".bwa_aln_se" - } - - case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with ExternalCommonArgs { - @Input(doc="bam file to be aligned") var bam = inBam - @Output(doc="output sai file for 1st mating pair") var sai = outSai1 - def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai - this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1" - this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" - } - - case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs { - @Input(doc="bam file to be aligned") var bam = inBam - @Input(doc="bwa alignment index file") var sai = inSai - @Output(doc="output aligned bam file") var alignedBam = outBam - def commandLine = bwaPath + " samse " + reference + " " + sai + " " + bam + " > " + alignedBam - this.memoryLimit = 6 - this.analysisName = queueLogDir + outBam + ".bwa_sam_se" - this.jobName = queueLogDir + outBam + ".bwa_sam_se" - } - - case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with ExternalCommonArgs { - @Input(doc="bam file to be aligned") var bam = inBam - @Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1 - @Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2 - @Output(doc="output aligned bam file") var alignedBam = outBam - def commandLine = bwaPath + " sampe " + reference + " " + sai1 + " " + sai2 + " " + bam + " " + bam + " > " + alignedBam - this.memoryLimit = 6 - this.analysisName = queueLogDir + outBam + ".bwa_sam_pe" - this.jobName = queueLogDir + outBam + ".bwa_sam_pe" - } - - case class bwa_sw (inFastQ: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs { - @Input(doc="fastq file to be aligned") var fq = inFastQ - @Output(doc="output bam file") var bam = outBam - def commandLine = bwaPath + " bwasw -t " + bwaThreads + " " + reference + " " + fq + " > " + bam - this.analysisName = queueLogDir + outBam + ".bwasw" - this.jobName = queueLogDir + outBam + ".bwasw" - } - - case class writeList(inBams: Seq[File], outBamList: File) extends ListWriterFunction { - this.inputFiles = inBams - this.listFile = outBamList - this.analysisName = queueLogDir + outBamList + ".bamList" - this.jobName = queueLogDir + outBamList + ".bamList" - } -} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala deleted file mode 100755 index b860358ca..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ /dev/null @@ -1,373 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.extensions.gatk._ -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.gatk.phonehome.GATKRunReport - -class MethodsDevelopmentCallingPipeline extends QScript { - qscript => - - @Argument(shortName="outputDir", doc="output directory", required=false) - var outputDir: String = "./" - - @Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false) - var skipCalling: Boolean = false - - @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) - var datasets: List[String] = Nil - - @Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false) - var runGoldStandard: Boolean = false - - @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) - var noBAQ: Boolean = false - - @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false) - var noIndels: Boolean = false - - @Argument(shortName="mbq", doc="The minimum Phred-Scaled quality score threshold to be considered a good base.", required=false) - var minimumBaseQuality: Int = -1 - - @Argument(shortName="deletions", doc="Maximum deletion fraction allowed at a site to call a genotype.", required=false) - var deletions: Double = -1 - - @Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false) - var samples: List[String] = Nil - - class Target( - val baseName: String, - val reference: File, - val dbsnpFile: String, - val hapmapFile: String, - val maskFile: String, - val bamList: File, - val goldStandard_VCF: File, - val intervals: String, - val indelTranchTarget: Double, - val snpTrancheTarget: Double, - val isLowpass: Boolean, - val isExome: Boolean, - val nSamples: Int) { - val name = qscript.outputDir + baseName - val clusterFile = new File(name + ".clusters") - val rawSnpVCF = new File(name + ".raw.vcf") - val rawIndelVCF = new File(name + ".raw.indel.vcf") - val filteredIndelVCF = new File(name + ".filtered.indel.vcf") - val recalibratedSnpVCF = new File(name + ".snp.recalibrated.vcf") - val recalibratedIndelVCF = new File(name + ".indel.recalibrated.vcf") - val tranchesSnpFile = new File(name + ".snp.tranches") - val tranchesIndelFile = new File(name + ".indel.tranches") - val vqsrSnpRscript = name + ".snp.vqsr.r" - val vqsrIndelRscript = name + ".indel.vqsr.r" - val recalSnpFile = new File(name + ".snp.tranches.recal") - val recalIndelFile = new File(name + ".indel.tranches.recal") - val goldStandardRecalibratedVCF = new File(name + "goldStandard.recalibrated.vcf") - val goldStandardTranchesFile = new File(name + "goldStandard.tranches") - val goldStandardRecalFile = new File(name + "goldStandard.tranches.recal") - val evalFile = new File(name + ".snp.eval") - val evalIndelFile = new File(name + ".indel.eval") - val goldStandardName = qscript.outputDir + "goldStandard/" + baseName - val goldStandardClusterFile = new File(goldStandardName + ".clusters") - } - - val b37_decoy = new File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta") - val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") - val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") - val dbSNP_hg18_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_hg18.rod" // Special case for NA12878 collections that can't use 132 because they are part of it. - val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132.b36.excluding_sites_after_129.vcf" - val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it. - val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf" - val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf" - val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" - val training_hapmap_b37 = "/humgen/1kg/processing/pipeline_test_bams/hapmap3.3_training_chr20.vcf" - val omni_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b36.vcf" - val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf" - val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" - val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" - val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf" - val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf" - val projectConsensus_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf" - val indelGoldStandardCallset = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf" - - val lowPass: Boolean = true - val exome: Boolean = true - val indels: Boolean = true - - val queueLogDir = ".qlog/" - - // BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references - - val targetDataSets: Map[String, Target] = Map( - "NA12878_gold" -> new Target("NA12878.goldStandard", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/data/goldStandard.list"), - new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** There is no gold standard for the gold standard ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, lowPass, !exome, 391), - "NA12878_wgs_b37" -> new Target("NA12878.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), - "NA12878_wgs_decoy" -> new Target("NA12878.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), - "NA12878_wgs_hg18" -> new Target("NA12878.HiSeq.WGS.hg18", hg18, dbSNP_hg18_129, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 90.0, 99.0, !lowPass, !exome, 1), - "NA12878_wex_b37" -> new Target("NA12878.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/seq/picard_aggregation/C339/NA12878/v3/NA12878.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), - "NA12878_wex_hg18" -> new Target("NA12878.HiSeq.WEx.hg18", hg18, dbSNP_hg18_129, hapmap_hg18, - "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), - new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), - "NA12878_wex_decoy" -> new Target("NA12878.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.NA12878.clean.dedup.recal.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1), - "CEUTrio_wex_b37" -> new Target("CEUTrio.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3), - "CEUTrio_wgs_b37" -> new Target("CEUTrio.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3), - "CEUTrio_wex_decoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3), - "CEUTrio_wgs_decoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"), - new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3), - "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), - new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1), - "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 79), - "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people - new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 99.0, !lowPass, exome, 96), - "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, - new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from - new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 90.0, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality - "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, - new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), - new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 363) - ) - - - def script = { - - // Selects the datasets in the -dataset argument and adds them to targets. - var targets: List[Target] = List() - if (!datasets.isEmpty) - for (ds <- datasets) - targets ::= targetDataSets(ds) - else // If -dataset is not specified, all datasets are used. - for (targetDS <- targetDataSets.valuesIterator) - targets ::= targetDS - - val goldStandard = true - for (target <- targets) { - if( !skipCalling ) { - if (!noIndels) add(new indelCall(target), new indelRecal(target), new indelCut(target), new indelEvaluation(target)) - add(new snpCall(target)) - add(new snpRecal(target, !goldStandard)) - add(new snpCut(target, !goldStandard)) - add(new snpEvaluation(target)) - } - if ( runGoldStandard ) { - add(new snpRecal(target, goldStandard)) - add(new snpCut(target, goldStandard)) - } - } - } - - - trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { - logging_level = "INFO"; - memoryLimit = 4; - phone_home = GATKRunReport.PhoneHomeOption.NO_ET - } - - def bai(bam: File) = new File(bam + ".bai") - - // 1.) Unified Genotyper Base - class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.scatterCount = 140 - this.nt = 2 - this.dcov = if ( t.isLowpass ) { 50 } else { 250 } - this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } - this.input_file :+= t.bamList - this.D = new File(t.dbsnpFile) - } - - // 1a.) Call SNPs with UG - class snpCall (t: Target) extends GenotyperBase(t) { - if (minimumBaseQuality >= 0) - this.min_base_quality_score = minimumBaseQuality - if (qscript.deletions >= 0) - this.max_deletion_fraction = qscript.deletions - this.out = t.rawSnpVCF - this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP - this.baq = if (noBAQ || t.isExome) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY} - this.analysisName = t.name + "_UGs" - this.jobName = queueLogDir + t.name + ".snpcall" - } - - // 1b.) Call Indels with UG - class indelCall (t: Target) extends GenotyperBase(t) { - this.memoryLimit = 6 - this.out = t.rawIndelVCF - this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL - this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF - this.analysisName = t.name + "_UGi" - this.jobName = queueLogDir + t.name + ".indelcall" - } - - // 2.) Hard Filtering for indels - class indelFilter (t: Target) extends VariantFiltration with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 2 - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.scatterCount = 10 - this.V = t.rawIndelVCF - this.out = t.filteredIndelVCF - this.filterName ++= List("IndelQD", "IndelReadPosRankSum", "IndelFS") - this.filterExpression ++= List("QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0") - if (t.nSamples >= 10) { - this.filterName ++= List("IndelInbreedingCoeff") - this.filterExpression ++= List("InbreedingCoeff < -0.8") - } - this.analysisName = t.name + "_VF" - this.jobName = queueLogDir + t.name + ".indelfilter" - } - - class VQSRBase(t: Target) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS { - this.nt = 2 - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.allPoly = true - this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") - } - - class snpRecal(t: Target, goldStandard: Boolean) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS { - this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } ) - this.resource :+= new TaggedFile( t.hapmapFile, "known=false,training=true,truth=true,prior=15.0" ) - this.resource :+= new TaggedFile( omni_b37, "known=false,training=true,truth=true,prior=12.0" ) - this.resource :+= new TaggedFile( training_1000G, "known=false,training=true,prior=10.0" ) - this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) - this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) - this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") - if(t.nSamples >= 10) - this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate - if(!t.isExome) - this.use_annotation ++= List("DP") - else { // exome specific parameters - this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" ) - this.mG = 6 - if(t.nSamples <= 3) { // very few exome samples means very few variants - this.mG = 4 - this.percentBad = 0.04 - } - } - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile } - this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile } - this.rscript_file = t.vqsrSnpRscript - this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - this.analysisName = t.name + "_VQSRs" - this.jobName = queueLogDir + t.name + ".snprecal" - } - - class indelRecal(t: Target) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS { - this.input :+= t.rawIndelVCF - this.resource :+= new TaggedFile(indelGoldStandardCallset, "known=false,training=true,truth=true,prior=12.0" ) - this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" ) - this.use_annotation ++= List("QD", "HaplotypeScore", "ReadPosRankSum", "FS") - if(t.nSamples >= 10) - this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate - this.tranches_file = t.tranchesIndelFile - this.recal_file = t.recalIndelFile - this.rscript_file = t.vqsrIndelRscript - this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - this.analysisName = t.name + "_VQSRi" - this.jobName = queueLogDir + t.name + ".indelrecal" - } - - - // 4.) Apply the recalibration table to the appropriate tranches - class applyVQSRBase (t: Target) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 6 - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - } - - class snpCut (t: Target, goldStandard: Boolean) extends applyVQSRBase(t) { - this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } ) - this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile} - this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile } - this.ts_filter_level = t.snpTrancheTarget - this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP - this.out = t.recalibratedSnpVCF - this.analysisName = t.name + "_AVQSRs" - this.jobName = queueLogDir + t.name + ".snpcut" - } - - class indelCut (t: Target) extends applyVQSRBase(t) { - this.input :+= t.rawIndelVCF - this.tranches_file = t.tranchesIndelFile - this.recal_file = t.recalIndelFile - this.ts_filter_level = t.indelTranchTarget - this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL - this.out = t.recalibratedIndelVCF - this.analysisName = t.name + "_AVQSRi" - this.jobName = queueLogDir + t.name + ".indelcut" - } - - - // 5.) Variant Evaluation Base(OPTIONAL) - class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 3 - this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" ) - this.D = new File(t.dbsnpFile) - this.reference_sequence = t.reference - this.intervalsString ++= List(t.intervals) - this.sample = samples - } - - // 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf - class snpEvaluation(t: Target) extends EvalBase(t) { - if (t.reference == b37 || t.reference == hg19) this.comp :+= new TaggedFile( omni_b37, "omni" ) - this.eval :+= t.recalibratedSnpVCF - this.out = t.evalFile - this.analysisName = t.name + "_VEs" - this.jobName = queueLogDir + t.name + ".snpeval" - } - - // 5b.) Indel Evaluation (OPTIONAL) - class indelEvaluation(t: Target) extends EvalBase(t) { - this.eval :+= t.recalibratedIndelVCF - this.comp :+= new TaggedFile(indelGoldStandardCallset, "indelGS" ) - this.noEV = true - this.evalModule = List("CompOverlap", "CountVariants", "TiTvVariantEvaluator", "ValidationReport", "IndelStatistics") - this.out = t.evalIndelFile - this.analysisName = t.name + "_VEi" - this.jobName = queueLogDir + queueLogDir + t.name + ".indeleval" - } -} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala deleted file mode 100755 index ef73840b3..000000000 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ /dev/null @@ -1,188 +0,0 @@ -package org.broadinstitute.sting.queue.qscripts - -import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.util.QScriptUtils -import net.sf.samtools.SAMFileHeader.SortOrder -import org.broadinstitute.sting.utils.exceptions.UserException -import org.broadinstitute.sting.commandline.Hidden -import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups} -import org.broadinstitute.sting.queue.extensions.gatk._ - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: 4/20/11 - * Time: 16:29 PM - */ - - -class PacbioProcessingPipeline extends QScript { - - @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true) - var input: File = _ - - @Input(doc="Reference fasta file", shortName="R", required=true) - var reference: File = _ - - @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) - var dbSNP: File = _ - - @Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) - var threads: Int = 0 - - @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) - var sample: String = "NA" - - @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) - var bwaPath: File = _ - - @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) - var BLASR_BAM: Boolean = false - - @Hidden - @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) - var dbq: Int = 20 - - @Hidden - @Argument(shortName="bwastring", required=false) - var bwastring: String = "" - - @Hidden - @Argument(shortName = "test", fullName = "test_mode", required = false) - var testMode: Boolean = false - - val queueLogDir: String = ".qlog/" - - def script() { - - val fileList: Seq[File] = QScriptUtils.createSeqFromFile(input) - - for (file: File <- fileList) { - - var USE_BWA: Boolean = false - var resetQuals: Boolean = true - - if (file.endsWith(".fasta") || file.endsWith(".fq") || file.endsWith(".fastq")) { - if (bwaPath == null) { - throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA"); - } - USE_BWA = true - if (file.endsWith(".fq") || file.endsWith(".fastq")) - resetQuals = false - } - - // FASTA -> BAM steps - val alignedSam: File = file.getName + ".aligned.sam" - val sortedBam: File = swapExt(alignedSam, ".sam", ".bam") - val rgBam: File = swapExt(file, ".bam", ".rg.bam") - - val bamBase = if (USE_BWA) {rgBam} else {file} - - // BAM Steps - val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam") - val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.table") - val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.table") - val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam") - val path1: String = recalBam + ".before" - val path2: String = recalBam + ".after" - - if (USE_BWA) { - add(align(file, alignedSam), - sortSam(alignedSam, sortedBam), - addReadGroup(sortedBam, rgBam, sample)) - } - - else if (BLASR_BAM) { - val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam") - add(reorder(bamBase, reorderedBAM), - changeMQ(reorderedBAM, mqBAM)) - } - - val bam = if (BLASR_BAM) {mqBAM} else {bamBase} - - add(cov(bam, recalFile1, resetQuals), - recal(bam, recalFile1, recalBam), - cov(recalBam, recalFile2, false)) - } - } - - - // General arguments to non-GATK tools - trait ExternalCommonArgs extends CommandLineFunction { - this.memoryLimit = 4 - this.isIntermediate = true - } - - trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { - this.reference_sequence = reference - } - - - case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs { - def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam - this.memoryLimit = 8 - this.analysisName = queueLogDir + outSam + ".bwa_sam_se" - this.jobName = queueLogDir + outSam + ".bwa_sam_se" - } - - case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs { - this.input = List(inSam) - this.output = outBam - this.memoryLimit = 8 - this.sortOrder = SortOrder.coordinate - this.analysisName = queueLogDir + outBam + ".sortSam" - this.jobName = queueLogDir + outBam + ".sortSam" - } - - case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs { - this.input = List(inSam) - this.output = outSam - this.sortReference = reference - } - - case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs { - this.input_file :+= inBam - this.out = outBam - this.read_filter :+= "ReassignMappingQuality" - } - - case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs { - @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") - this.input = List(inBam) - this.output = outBam - this.RGID = "1" - this.RGCN = "BI" - this.RGPL = "PacBio_RS" - this.RGSM = sample - this.RGLB = "default_library" - this.RGPU = "default_pu" - this.analysisName = queueLogDir + outBam + ".rg" - this.jobName = queueLogDir + outBam + ".rg" - } - - case class cov (inBam: File, outRecalFile: File, resetQuals: Boolean) extends BaseRecalibrator with CommandLineGATKArgs { - if (resetQuals) - this.DBQ = dbq - this.knownSites :+= dbSNP - this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate") - this.input_file :+= inBam - this.disable_indel_quals = true - this.out = outRecalFile - this.analysisName = queueLogDir + outRecalFile + ".covariates" - this.jobName = queueLogDir + outRecalFile + ".covariates" - this.scatterCount = threads - this.read_filter :+= "BadCigar" - } - - case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs { - this.DBQ = dbq - this.input_file :+= inBam - this.BQSR = inRecalFile - this.out = outBam - this.isIntermediate = false - this.analysisName = queueLogDir + outBam + ".recalibration" - this.jobName = queueLogDir + outBam + ".recalibration" - this.read_filter :+= "BadCigar" - this.scatterCount = threads - } -}