Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
8bbb9e1cc2
|
|
@ -1,496 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.picard._
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
|
||||
import org.broadinstitute.sting.utils.baq.BAQ.CalculationMode
|
||||
|
||||
import collection.JavaConversions._
|
||||
import net.sf.samtools.SAMFileReader
|
||||
import net.sf.samtools.SAMFileHeader.SortOrder
|
||||
|
||||
import org.broadinstitute.sting.queue.util.QScriptUtils
|
||||
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.commandline
|
||||
|
||||
class DataProcessingPipeline extends QScript {
|
||||
qscript =>
|
||||
|
||||
/****************************************************************************
 * Required Parameters
 ****************************************************************************/

@Input(doc="input BAM file - or list of BAM files", fullName="input", shortName="i", required=true)
var input: File = _

@Input(doc="Reference fasta file", fullName="reference", shortName="R", required=true)
var reference: File = _

@Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true)
var dbSNP: Seq[File] = Seq()

/****************************************************************************
 * Optional Parameters
 ****************************************************************************/

@Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false)
var indels: Seq[File] = Seq()

@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _

@Argument(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
var projectName: String = "project"

@Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
var outputDir: String = ""

@Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
var intervalString: String = ""

@Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false)
var intervals: File = _

@Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
var cleaningModel: String = "USE_READS"

@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
var useBWAse: Boolean = false

@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false

@Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
var useBWAsw: Boolean = false

@Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1

@Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
var validation: Boolean = false

/****************************************************************************
 * Hidden Parameters
 ****************************************************************************/

@Hidden
@Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
var nContigs: Int = -1

@Hidden
@Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
var defaultPlatform: String = ""

@Hidden
@Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
var testMode: Boolean = false

/****************************************************************************
 * Global Variables
 ****************************************************************************/

val queueLogDir: String = ".qlog/" // Gracefully hide Queue's output

// String option `cleaningModel` converted to its enum form in script() via getIndelCleaningModel.
var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS
|
||||
|
||||
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* Helper classes and methods
|
||||
****************************************************************************/
|
||||
|
||||
// Plain value holder for the BAM @RG header fields that get handed to
// AddOrReplaceReadGroups: id, library, platform, platform unit, sample,
// sequencing center, description.
class ReadGroup (val id: String, val lb: String, val pl: String, val pu: String,
                 val sm: String, val cn: String, val ds: String)
|
||||
|
||||
// Utility function to merge all bam files of similar samples. Generates one BAM file per sample.
// It uses the sample information on the header of the input BAM files.
//
// Because the realignment only happens after these scripts are executed, in case you are using
// bwa realignment, this function will operate over the original bam files and output over the
// (to be realigned) bam files.
def createSampleFiles(bamFiles: Seq[File], realignedBamFiles: Seq[File]): Map[String, Seq[File]] = {

  // sample name -> realigned BAM files that carry reads for that sample
  val sampleTable = scala.collection.mutable.Map.empty[String, Seq[File]]

  // Walk the two lists in lock step: the original BAM supplies the header,
  // while the (to be) realigned BAM is the file actually recorded in the table.
  for ((bam, rBam) <- bamFiles.zip(realignedBamFiles)) {
    val samReader = new SAMFileReader(bam)
    try {
      val readGroups = samReader.getFileHeader.getReadGroups

      // only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
      // with PrintReads to separate the samples. Tell user to do it himself!
      assert(!QScriptUtils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)

      // Fill out the sample table with the readgroups in this file
      for (rg <- readGroups) {
        val sample = rg.getSample
        if (!sampleTable.contains(sample))
          sampleTable(sample) = Seq(rBam)
        else if (!sampleTable(sample).contains(rBam))
          sampleTable(sample) :+= rBam
      }
    } finally {
      // BUG FIX: the reader was never closed, leaking one open file handle per input BAM.
      samReader.close()
    }
  }
  sampleTable.toMap
}
|
||||
|
||||
// Rebuilds the read groups on a freshly-aligned BAM by chaining one
// AddOrReplaceReadGroups job per read group found in the original header:
// the first job reads inBam, each subsequent job consumes the previous job's
// intermediate output, and the final job writes outBam.
def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) {
  val readGroups = samReader.getFileHeader.getReadGroups
  var remaining: Int = readGroups.length
  for (rg <- readGroups) {
    // first step starts from inBam; later steps pick up the previous intermediate
    val stepInput: File = if (remaining == readGroups.length) { inBam } else { swapExt(outBam, ".bam", remaining + 1 + "-rg.bam") }
    // last step writes the final outBam; earlier steps write a numbered intermediate
    val stepOutput: File = if (remaining > 1) { swapExt(outBam, ".bam", remaining + "-rg.bam") } else { outBam }
    val group = new ReadGroup(rg.getReadGroupId, rg.getLibrary, rg.getPlatform, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription)
    add(addReadGroup(stepInput, stepOutput, group))
    remaining -= 1
  }
}
|
||||
|
||||
// Takes a list of processed BAM files and realigns them using the BWA option
// requested (bwase, bwape or bwasw). Returns a list of realigned BAM files.
// NOTE(review): callers must have exactly one of useBWAse/useBWApe/useBWAsw
// set, otherwise realignedSamFile is never produced before sortSam consumes it
// (script() guarantees this today).
def performAlignment(bams: Seq[File]): Seq[File] = {
  var alignedOutputs: Seq[File] = Seq()
  var laneIndex = 1
  for (bam <- bams) {
    // per-lane intermediate file names
    val saiFile1 = swapExt(bam, ".bam", "." + laneIndex + ".1.sai")
    val saiFile2 = swapExt(bam, ".bam", "." + laneIndex + ".2.sai")
    val realignedSamFile = swapExt(bam, ".bam", "." + laneIndex + ".realigned.sam")
    val realignedBamFile = swapExt(bam, ".bam", "." + laneIndex + ".realigned.bam")
    val rgRealignedBamFile = swapExt(bam, ".bam", "." + laneIndex + ".realigned.rg.bam")

    if (useBWAse) {
      // single-ended: one aln pass, then samse
      val revertedBAM = revertBAM(bam, true)
      add(bwa_aln_se(revertedBAM, saiFile1),
          bwa_sam_se(revertedBAM, saiFile1, realignedSamFile))
    }
    else if (useBWApe) {
      // pair-ended: one aln pass per mate, then sampe
      val revertedBAM = revertBAM(bam, true)
      add(bwa_aln_pe(revertedBAM, saiFile1, 1),
          bwa_aln_pe(revertedBAM, saiFile2, 2),
          bwa_sam_pe(revertedBAM, saiFile1, saiFile2, realignedSamFile))
    }
    else if (useBWAsw) {
      // bwasw wants FASTQ input, so alignment info is kept on revert and the BAM is converted
      val revertedBAM = revertBAM(bam, false)
      val fastQ = swapExt(revertedBAM, ".bam", ".fq")
      add(convertToFastQ(revertedBAM, fastQ),
          bwa_sw(fastQ, realignedSamFile))
    }

    // coordinate-sort the aligned SAM and restore the original read groups
    add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
    addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))

    alignedOutputs :+= rgRealignedBamFile
    laneIndex += 1
  }
  alignedOutputs
}
|
||||
|
||||
// Translates the user-supplied cleaning-model string into the IndelRealigner
// enum; anything other than KNOWNS_ONLY / USE_SW falls back to USE_READS.
def getIndelCleaningModel: ConsensusDeterminationModel = cleaningModel match {
  case "KNOWNS_ONLY" => ConsensusDeterminationModel.KNOWNS_ONLY
  case "USE_SW"      => ConsensusDeterminationModel.USE_SW
  case _             => ConsensusDeterminationModel.USE_READS
}
|
||||
|
||||
// Queues a revert job for every BAM in the list and returns the reverted file names.
def revertBams(bams: Seq[File], removeAlignmentInformation: Boolean): Seq[File] =
  bams.map(bam => revertBAM(bam, removeAlignmentInformation))
|
||||
|
||||
// Queues a single RevertSam job (optionally stripping alignment information)
// and returns the name of the reverted BAM it will produce.
def revertBAM(bam: File, removeAlignmentInformation: Boolean): File = {
  val reverted = swapExt(bam, ".bam", ".reverted.bam")
  add(revert(bam, reverted, removeAlignmentInformation))
  reverted
}
|
||||
|
||||
/****************************************************************************
 * Main script
 ****************************************************************************/

def script() {
  // final output list of processed bam files
  var cohortList: Seq[File] = Seq()

  // sets the model for the Indel Realigner
  cleanModelEnum = getIndelCleaningModel

  // expand the input (single BAM or .list of BAMs) and record the number of
  // contigs of the first bam so scatter/gather can be sized to it
  val bams = QScriptUtils.createSeqFromFile(input)
  if (nContigs < 0)
    nContigs = QScriptUtils.getNumberOfContigs(bams(0))

  // either fully realign with BWA, or just revert qualities keeping the alignment
  val realignedBAMs = if (useBWApe || useBWAse || useBWAsw) {performAlignment(bams)} else {revertBams(bams, false)}

  // generate a BAM file per sample joining all per lane files if necessary
  val sampleBAMFiles: Map[String, Seq[File]] = createSampleFiles(bams, realignedBAMs)

  // if this is a 'knowns only' indel realignment run, do it only once for all samples.
  // BUG FIX: this check (and the two below) compared the String field `cleaningModel`
  // against the enum value, which in Scala is always false, so KNOWNS_ONLY mode was
  // silently ignored. Compare the converted `cleanModelEnum` instead.
  val globalIntervals = new File(outputDir + projectName + ".intervals")
  if (cleanModelEnum == ConsensusDeterminationModel.KNOWNS_ONLY)
    add(target(null, globalIntervals))

  // put each sample through the pipeline: clean -> dedup -> recalibrate
  for ((sample, bamList) <- sampleBAMFiles) {

    // BAM files generated by the pipeline
    val bam = new File(qscript.projectName + "." + sample + ".bam")
    val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
    val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
    val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")

    // Accessory files
    val targetIntervals = if (cleanModelEnum == ConsensusDeterminationModel.KNOWNS_ONLY) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
    val metricsFile = swapExt(bam, ".bam", ".metrics")
    val preRecalFile = swapExt(bam, ".bam", ".pre_recal.table")
    val postRecalFile = swapExt(bam, ".bam", ".post_recal.table")
    val preValidateLog = swapExt(bam, ".bam", ".pre.validation")
    val postValidateLog = swapExt(bam, ".bam", ".post.validation")

    // Validation is an optional step for the BAM file generated after
    // alignment and the final bam file of the pipeline.
    if (validation) {
      for (sampleFile <- bamList)
        add(validate(sampleFile, preValidateLog),
            validate(recalBam, postValidateLog))
    }

    // per-sample target creation is skipped in knowns-only mode (done globally above)
    if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
      add(target(bamList, targetIntervals))

    add(clean(bamList, targetIntervals, cleanedBam),
        dedup(cleanedBam, dedupedBam, metricsFile),
        cov(dedupedBam, preRecalFile),
        recal(dedupedBam, preRecalFile, recalBam),
        cov(recalBam, postRecalFile))

    cohortList :+= recalBam
  }

  // output a BAM list with all the processed per sample files
  val cohortFile = new File(qscript.outputDir + qscript.projectName + ".cohort.list")
  add(writeList(cohortList, cohortFile))
}
|
||||
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* Classes (GATK Walkers)
|
||||
****************************************************************************/
|
||||
|
||||
|
||||
|
||||
// General arguments to non-GATK tools
trait ExternalCommonArgs extends CommandLineFunction {
  this.memoryLimit = 4
  this.isIntermediate = true // intermediate outputs may be cleaned up once downstream jobs finish
}

// General arguments to GATK walkers: everything shares the pipeline reference
trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
  this.reference_sequence = qscript.reference
}

// Common settings for Picard BAM tools
trait SAMargs extends PicardBamFunction with ExternalCommonArgs {
  this.maxRecordsInRam = 100000
}
|
||||
|
||||
// RealignerTargetCreator job: finds the intervals that IndelRealigner should clean.
case class target (inBams: Seq[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
  // in knowns-only mode no BAMs are passed in; targets come from the known sites alone
  if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
    this.input_file = inBams
  this.out = outIntervals
  this.mismatchFraction = 0.0
  this.known ++= qscript.dbSNP
  if (indels != null)
    this.known ++= qscript.indels
  this.scatterCount = nContigs
  private val tag = queueLogDir + outIntervals + ".target"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// IndelRealigner job: locally realigns reads around the target intervals.
case class clean (inBams: Seq[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
  this.input_file = inBams
  this.targetIntervals = tIntervals
  this.out = outBam
  this.known ++= qscript.dbSNP
  if (qscript.indels != null)
    this.known ++= qscript.indels
  this.consensusDeterminationModel = cleanModelEnum
  this.compress = 0                 // no compression for this intermediate BAM
  this.noPGTag = qscript.testMode   // skip the @PG tag in test mode
  this.scatterCount = nContigs
  private val tag = queueLogDir + outBam + ".clean"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// BaseRecalibrator job: builds the base-quality recalibration table for a BAM.
case class cov (inBam: File, outRecalFile: File) extends BaseRecalibrator with CommandLineGATKArgs {
  this.knownSites ++= qscript.dbSNP
  this.covariate ++= Seq("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate")
  this.input_file :+= inBam
  this.disable_indel_quals = true
  this.out = outRecalFile
  if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform
  // the explicit -L interval string takes precedence over an intervals file
  if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString)
  else if (qscript.intervals != null) this.intervals :+= qscript.intervals
  this.scatterCount = nContigs
  private val tag = queueLogDir + outRecalFile + ".covariates"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// PrintReads job: applies the recalibration table (BQSR) and writes the final BAM.
case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
  this.input_file :+= inBam
  this.BQSR = inRecalFile
  this.baq = CalculationMode.CALCULATE_AS_NECESSARY
  this.out = outBam
  // the explicit -L interval string takes precedence over an intervals file
  if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString)
  else if (qscript.intervals != null) this.intervals :+= qscript.intervals
  this.scatterCount = nContigs
  this.isIntermediate = false // this is a pipeline deliverable: keep it on disk
  private val tag = queueLogDir + outBam + ".recalibration"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* Classes (non-GATK programs)
|
||||
****************************************************************************/
|
||||
|
||||
|
||||
// Picard MarkDuplicates job; duplication metrics are written to metricsFile.
case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs {
  this.input :+= inBam
  this.output = outBam
  this.metrics = metricsFile
  this.memoryLimit = 16 // raised above the trait default of 4
  private val tag = queueLogDir + outBam + ".dedup"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Picard MergeSamFiles job: joins several BAMs into one.
case class joinBams (inBams: Seq[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs {
  this.input = inBams
  this.output = outBam
  private val tag = queueLogDir + outBam + ".joinBams"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Picard SortSam job: sorts a SAM/BAM into the requested order, emitting BAM.
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs {
  this.input :+= inSam
  this.output = outBam
  this.sortOrder = sortOrderP
  private val tag = queueLogDir + outBam + ".sortSam"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Picard ValidateSamFile job; the validation report is a final artifact, so it is kept.
case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs {
  this.input :+= inBam
  this.output = outLog
  this.REFERENCE_SEQUENCE = qscript.reference
  this.isIntermediate = false
  private val tag = queueLogDir + outLog + ".validate"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
|
||||
// Picard AddOrReplaceReadGroups job: stamps one read group's @RG fields onto the BAM.
case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs {
  this.input :+= inBam
  this.output = outBam
  this.RGID = readGroup.id
  this.RGCN = readGroup.cn
  this.RGDS = readGroup.ds
  this.RGLB = readGroup.lb
  this.RGPL = readGroup.pl
  this.RGPU = readGroup.pu
  this.RGSM = readGroup.sm
  private val tag = queueLogDir + outBam + ".rg"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Picard RevertSam job: restores original qualities and (optionally) strips alignment info.
case class revert (inBam: File, outBam: File, removeAlignmentInfo: Boolean) extends RevertSam with ExternalCommonArgs {
  this.output = outBam
  this.input :+= inBam
  this.removeAlignmentInformation = removeAlignmentInfo
  // query-name order is needed when the BAM will be re-fed to BWA; otherwise keep coordinate order
  this.sortOrder = if (removeAlignmentInfo) {SortOrder.queryname} else {SortOrder.coordinate}
  // BUG FIX: analysisName used "revert" without the '.' separator, inconsistent
  // with jobName below and with every other job class in this file.
  this.analysisName = queueLogDir + outBam + ".revert"
  this.jobName = queueLogDir + outBam + ".revert"
}
|
||||
|
||||
// Picard SamToFastq job: converts a BAM into FASTQ (input format for bwasw).
case class convertToFastQ (inBam: File, outFQ: File) extends SamToFastq with ExternalCommonArgs {
  this.input :+= inBam
  this.fastq = outFQ
  // BUG FIX: analysisName used "convert_to_fastq" without the '.' separator,
  // inconsistent with jobName below and with every other job class in this file.
  this.analysisName = queueLogDir + outFQ + ".convert_to_fastq"
  this.jobName = queueLogDir + outFQ + ".convert_to_fastq"
}
|
||||
|
||||
// Single-ended `bwa aln` step: produces the .sai index for one BAM.
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs {
  @Input(doc="bam file to be aligned") var bam = inBam
  @Output(doc="output sai file") var sai = outSai
  def commandLine = Seq(bwaPath, " aln -t ", bwaThreads, " -q 5 ", reference, " -b ", bam, " > ", sai).mkString
  private val tag = queueLogDir + outSai + ".bwa_aln_se"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Pair-ended `bwa aln` step: index selects mate 1 or 2 (-b1 / -b2).
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with ExternalCommonArgs {
  @Input(doc="bam file to be aligned") var bam = inBam
  @Output(doc="output sai file for 1st mating pair") var sai = outSai1
  def commandLine = Seq(bwaPath, " aln -t ", bwaThreads, " -q 5 ", reference, " -b", index, " ", bam, " > ", sai).mkString
  private val tag = queueLogDir + outSai1 + ".bwa_aln_pe1"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// `bwa samse` step: turns the single-ended .sai into an aligned SAM.
case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
  @Input(doc="bam file to be aligned") var bam = inBam
  @Input(doc="bwa alignment index file") var sai = inSai
  @Output(doc="output aligned bam file") var alignedBam = outBam
  def commandLine = Seq(bwaPath, " samse ", reference, " ", sai, " ", bam, " > ", alignedBam).mkString
  this.memoryLimit = 6
  private val tag = queueLogDir + outBam + ".bwa_sam_se"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// `bwa sampe` step: combines both mates' .sai files into an aligned SAM.
// NOTE(review): the same BAM is passed twice (once per mate), matching the
// BAM-input mode of bwa sampe as originally written — confirm against the bwa docs.
case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
  @Input(doc="bam file to be aligned") var bam = inBam
  @Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1
  @Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2
  @Output(doc="output aligned bam file") var alignedBam = outBam
  def commandLine = Seq(bwaPath, " sampe ", reference, " ", sai1, " ", sai2, " ", bam, " ", bam, " > ", alignedBam).mkString
  this.memoryLimit = 6
  private val tag = queueLogDir + outBam + ".bwa_sam_pe"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// `bwa bwasw` step: long-read Smith-Waterman alignment from FASTQ input.
case class bwa_sw (inFastQ: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
  @Input(doc="fastq file to be aligned") var fq = inFastQ
  @Output(doc="output bam file") var bam = outBam
  def commandLine = Seq(bwaPath, " bwasw -t ", bwaThreads, " ", reference, " ", fq, " > ", bam).mkString
  private val tag = queueLogDir + outBam + ".bwasw"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
|
||||
// Writes the cohort's processed BAM paths into a .list file.
case class writeList(inBams: Seq[File], outBamList: File) extends ListWriterFunction {
  this.inputFiles = inBams
  this.listFile = outBamList
  private val tag = queueLogDir + outBamList + ".bamList"
  this.analysisName = tag
  this.jobName = tag
}
|
||||
}
|
||||
|
|
@ -1,373 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
|
||||
|
||||
class MethodsDevelopmentCallingPipeline extends QScript {
|
||||
qscript =>
|
||||
|
||||
// --- command-line options ---

@Argument(shortName="outputDir", doc="output directory", required=false)
var outputDir: String = "./"

@Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
var skipCalling: Boolean = false

@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil

@Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false)
var runGoldStandard: Boolean = false

@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
var noBAQ: Boolean = false

@Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false)
var noIndels: Boolean = false

@Argument(shortName="mbq", doc="The minimum Phred-Scaled quality score threshold to be considered a good base.", required=false)
var minimumBaseQuality: Int = -1 // -1 means "not set": the walker default applies

@Argument(shortName="deletions", doc="Maximum deletion fraction allowed at a site to call a genotype.", required=false)
var deletions: Double = -1 // -1 means "not set": the walker default applies

@Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false)
var samples: List[String] = Nil
|
||||
|
||||
// Bundles every per-dataset input and derived output path used by the calling
// pipeline: raw and recalibrated VCFs, VQSR tranches/recal/Rscript files,
// VariantEval outputs, and the gold-standard comparison files.
class Target(
          val baseName: String,
          val reference: File,
          val dbsnpFile: String,
          val hapmapFile: String,
          val maskFile: String,
          val bamList: File,
          val goldStandard_VCF: File,
          val intervals: String,
          val indelTranchTarget: Double,
          val snpTrancheTarget: Double,
          val isLowpass: Boolean,
          val isExome: Boolean,
          val nSamples: Int) {
  // all derived files share this prefix under the pipeline output directory
  val name = qscript.outputDir + baseName
  val clusterFile = new File(name + ".clusters")
  // calling outputs
  val rawSnpVCF = new File(name + ".raw.vcf")
  val rawIndelVCF = new File(name + ".raw.indel.vcf")
  val filteredIndelVCF = new File(name + ".filtered.indel.vcf")
  val recalibratedSnpVCF = new File(name + ".snp.recalibrated.vcf")
  val recalibratedIndelVCF = new File(name + ".indel.recalibrated.vcf")
  // VQSR artifacts
  val tranchesSnpFile = new File(name + ".snp.tranches")
  val tranchesIndelFile = new File(name + ".indel.tranches")
  val vqsrSnpRscript = name + ".snp.vqsr.r"
  val vqsrIndelRscript = name + ".indel.vqsr.r"
  val recalSnpFile = new File(name + ".snp.tranches.recal")
  val recalIndelFile = new File(name + ".indel.tranches.recal")
  // gold-standard comparison run
  // NOTE(review): these three names lack a '.' before "goldStandard" (e.g.
  // "<base>goldStandard.recalibrated.vcf") — kept as-is to preserve behavior.
  val goldStandardRecalibratedVCF = new File(name + "goldStandard.recalibrated.vcf")
  val goldStandardTranchesFile = new File(name + "goldStandard.tranches")
  val goldStandardRecalFile = new File(name + "goldStandard.tranches.recal")
  // VariantEval outputs
  val evalFile = new File(name + ".snp.eval")
  val evalIndelFile = new File(name + ".indel.eval")
  val goldStandardName = qscript.outputDir + "goldStandard/" + baseName
  val goldStandardClusterFile = new File(goldStandardName + ".clusters")
}
|
||||
|
||||
// --- reference genomes ---
val b37_decoy = new File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta")
val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

// --- dbSNP resources ---
val dbSNP_hg18_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_hg18.rod" // Special case for NA12878 collections that can't use 132 because they are part of it.
val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132.b36.excluding_sites_after_129.vcf"
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it.

// --- HapMap / Omni training sites ---
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
val training_hapmap_b37 = "/humgen/1kg/processing/pipeline_test_bams/hapmap3.3_training_chr20.vcf"
val omni_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b36.vcf"
val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf"

// --- indel masks and 1000G training/consensus call sets ---
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf"
val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf"
val projectConsensus_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf"
val indelGoldStandardCallset = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf"

// readability flags used (negated or not) in the dataset table below
val lowPass: Boolean = true
val exome: Boolean = true
val indels: Boolean = true

val queueLogDir = ".qlog/"
|
||||
|
||||
// BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references
|
||||
|
||||
// Catalog of evaluation datasets selectable via the -dataset argument.
// NOTE(review): the two hg18 entries pass the indel mask as a raw path string
// (no indelMask_* val exists for hg18) — confirm this matches Target's
// expected parameter order.
val targetDataSets: Map[String, Target] = Map(
  "NA12878_gold" -> new Target("NA12878.goldStandard", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/data/goldStandard.list"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** There is no gold standard for the gold standard **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, lowPass, !exome, 391),
  "NA12878_wgs_b37" -> new Target("NA12878.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
  "NA12878_wgs_decoy" -> new Target("NA12878.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
  "NA12878_wgs_hg18" -> new Target("NA12878.HiSeq.WGS.hg18", hg18, dbSNP_hg18_129, hapmap_hg18,
    "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
    new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 90.0, 99.0, !lowPass, !exome, 1),
  "NA12878_wex_b37" -> new Target("NA12878.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/seq/picard_aggregation/C339/NA12878/v3/NA12878.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
  "NA12878_wex_hg18" -> new Target("NA12878.HiSeq.WEx.hg18", hg18, dbSNP_hg18_129, hapmap_hg18,
    "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
    new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
  "NA12878_wex_decoy" -> new Target("NA12878.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.NA12878.clean.dedup.recal.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
  "CEUTrio_wex_b37" -> new Target("CEUTrio.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3),
  "CEUTrio_wgs_b37" -> new Target("CEUTrio.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3),
  "CEUTrio_wex_decoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3),
  "CEUTrio_wgs_decoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3),
  "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
    new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
  "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
    new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 79),
  "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
    new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 99.0, !lowPass, exome, 96),
  "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
    new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
    new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 90.0, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality
  "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
    new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
    new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
    "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 363)
)
|
||||
|
||||
|
||||
// Entry point: resolve the requested targets and enqueue one
// call -> recalibrate -> cut -> evaluate chain per target.
def script = {
  // Selects the datasets in the -dataset argument and adds them to targets.
  // Targets are *prepended* (::=), so they run in reverse of the order given.
  var targets: List[Target] = List()
  if (!datasets.isEmpty)
    for (ds <- datasets)
      targets ::= targetDataSets(ds)
  else // If -dataset is not specified, all datasets are used.
    for (targetDS <- targetDataSets.valuesIterator)
      targets ::= targetDS

  val goldStandard = true
  for (target <- targets) {
    if( !skipCalling ) {
      // Indel chain: call -> VQSR model -> apply recalibration -> evaluate (skipped with -noIndels).
      if (!noIndels) add(new indelCall(target), new indelRecal(target), new indelCut(target), new indelEvaluation(target))
      // SNP chain on the raw UnifiedGenotyper calls.
      add(new snpCall(target))
      add(new snpRecal(target, !goldStandard))
      add(new snpCut(target, !goldStandard))
      add(new snpEvaluation(target))
    }
    // Optionally also run VQSR over the target's gold-standard VCF itself.
    if ( runGoldStandard ) {
      add(new snpRecal(target, goldStandard))
      add(new snpCut(target, goldStandard))
    }
  }
}
|
||||
|
||||
|
||||
/** Baseline settings shared by every GATK job in this pipeline:
  * INFO-level logging, a 4 GB memory cap, and phone-home disabled. */
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
  this.logging_level = "INFO"
  this.memoryLimit = 4
  this.phone_home = GATKRunReport.PhoneHomeOption.NO_ET
}
|
||||
|
||||
/** Path of the index file (.bai) that accompanies the given BAM. */
def bai(bam: File): File = new File(s"$bam.bai")
||||
|
||||
// 1.) Unified Genotyper Base
// Shared UnifiedGenotyper configuration for both the SNP and indel callers:
// reference, intervals, scatter width, downsampling and calling thresholds.
class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  // Scatter/gather across 140 jobs, 2 data threads each.
  this.scatterCount = 140
  this.nt = 2
  // Low-pass targets get lighter downsampling and more permissive confidence thresholds.
  this.dcov = if ( t.isLowpass ) { 50 } else { 250 }
  this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
  this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
  this.input_file :+= t.bamList
  this.D = new File(t.dbsnpFile)
}
|
||||
|
||||
// 1a.) Call SNPs with UG
// Produces the target's raw SNP VCF. BAQ is disabled for exomes or when -noBAQ is set.
class snpCall (t: Target) extends GenotyperBase(t) {
  // Command-line overrides; negative values mean "keep the walker default".
  if (minimumBaseQuality >= 0)
    this.min_base_quality_score = minimumBaseQuality
  if (qscript.deletions >= 0)
    this.max_deletion_fraction = qscript.deletions
  this.out = t.rawSnpVCF
  this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP
  this.baq = if (noBAQ || t.isExome) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY}
  this.analysisName = t.name + "_UGs"
  this.jobName = queueLogDir + t.name + ".snpcall"
}
|
||||
|
||||
// 1b.) Call Indels with UG
// Produces the target's raw indel VCF. BAQ is always off for indel calling;
// memory is raised to 6 GB over the 4 GB universal default.
class indelCall (t: Target) extends GenotyperBase(t) {
  this.memoryLimit = 6
  this.out = t.rawIndelVCF
  this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL
  this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF
  this.analysisName = t.name + "_UGi"
  this.jobName = queueLogDir + t.name + ".indelcall"
}
|
||||
|
||||
// 2.) Hard Filtering for indels
// Applies fixed-threshold filters (QD, ReadPosRankSum, FS) to the raw indel
// calls; adds an InbreedingCoeff filter when the cohort is large enough.
// NOTE(review): nothing in this chunk adds indelFilter to the job graph —
// confirm whether it is still wired in elsewhere or is dead code.
class indelFilter (t: Target) extends VariantFiltration with UNIVERSAL_GATK_ARGS {
  this.memoryLimit = 2
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  this.scatterCount = 10
  this.V = t.rawIndelVCF
  this.out = t.filteredIndelVCF
  this.filterName ++= List("IndelQD", "IndelReadPosRankSum", "IndelFS")
  this.filterExpression ++= List("QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0")
  // InbreedingCoeff is only meaningful with >= 10 samples.
  if (t.nSamples >= 10) {
    this.filterName ++= List("IndelInbreedingCoeff")
    this.filterExpression ++= List("InbreedingCoeff < -0.8")
  }
  this.analysisName = t.name + "_VF"
  this.jobName = queueLogDir + t.name + ".indelfilter"
}
|
||||
|
||||
// 3.) Variant Quality Score Recalibration base.
// Shared VariantRecalibrator setup: reference, intervals, and the tranche
// truth-sensitivity grid evaluated by snpRecal/indelRecal.
class VQSRBase(t: Target) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS {
  this.nt = 2
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  this.allPoly = true
  this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
}
|
||||
|
||||
// 3a.) Build the VQSR recalibration model for SNPs. When goldStandard is set,
// the model is trained on the target's gold-standard VCF instead of the raw calls.
// NOTE(review): UNIVERSAL_GATK_ARGS is already mixed in via VQSRBase, so
// re-mixing it here is redundant (harmless under trait linearization).
class snpRecal(t: Target, goldStandard: Boolean) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS {
  this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } )
  // Training/truth resources, each tagged with its prior likelihood.
  this.resource :+= new TaggedFile( t.hapmapFile, "known=false,training=true,truth=true,prior=15.0" )
  this.resource :+= new TaggedFile( omni_b37, "known=false,training=true,truth=true,prior=12.0" )
  this.resource :+= new TaggedFile( training_1000G, "known=false,training=true,prior=10.0" )
  this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
  this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
  this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
  if(t.nSamples >= 10)
    this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate
  if(!t.isExome)
    this.use_annotation ++= List("DP")
  else { // exome specific parameters
    this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" )
    this.mG = 6
    if(t.nSamples <= 3) { // very few exome samples means very few variants
      this.mG = 4
      this.percentBad = 0.04
    }
  }
  // Output locations differ for the gold-standard run vs. the normal run.
  this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile }
  this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile }
  this.rscript_file = t.vqsrSnpRscript
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
  this.analysisName = t.name + "_VQSRs"
  this.jobName = queueLogDir + t.name + ".snprecal"
}
|
||||
|
||||
// 3b.) Build the VQSR recalibration model for indels, trained on the
// Mills/1000G gold-standard callset.
// NOTE(review): UNIVERSAL_GATK_ARGS is already mixed in via VQSRBase;
// re-mixing it here is redundant (harmless).
class indelRecal(t: Target) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS {
  this.input :+= t.rawIndelVCF
  this.resource :+= new TaggedFile(indelGoldStandardCallset, "known=false,training=true,truth=true,prior=12.0" )
  this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
  this.use_annotation ++= List("QD", "HaplotypeScore", "ReadPosRankSum", "FS")
  if(t.nSamples >= 10)
    this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate
  this.tranches_file = t.tranchesIndelFile
  this.recal_file = t.recalIndelFile
  this.rscript_file = t.vqsrIndelRscript
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
  this.analysisName = t.name + "_VQSRi"
  this.jobName = queueLogDir + t.name + ".indelrecal"
}
|
||||
|
||||
|
||||
// 4.) Apply the recalibration table to the appropriate tranches
// Shared ApplyRecalibration setup (reference + intervals) for snpCut/indelCut.
class applyVQSRBase (t: Target) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS {
  this.memoryLimit = 6
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
}
|
||||
|
||||
// 4a.) Apply the SNP recalibration at the target's truth-sensitivity cut.
// Input/tranches/recal files switch between the gold-standard and normal runs.
class snpCut (t: Target, goldStandard: Boolean) extends applyVQSRBase(t) {
  this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } )
  this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile}
  this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile }
  this.ts_filter_level = t.snpTrancheTarget
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
  this.out = t.recalibratedSnpVCF
  this.analysisName = t.name + "_AVQSRs"
  this.jobName = queueLogDir + t.name + ".snpcut"
}
|
||||
|
||||
// 4b.) Apply the indel recalibration at the target's truth-sensitivity cut.
class indelCut (t: Target) extends applyVQSRBase(t) {
  this.input :+= t.rawIndelVCF
  this.tranches_file = t.tranchesIndelFile
  this.recal_file = t.recalIndelFile
  this.ts_filter_level = t.indelTranchTarget
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
  this.out = t.recalibratedIndelVCF
  this.analysisName = t.name + "_AVQSRi"
  this.jobName = queueLogDir + t.name + ".indelcut"
}
|
||||
|
||||
|
||||
// 5.) Variant Evaluation Base(OPTIONAL)
// Shared VariantEval setup: HapMap comp track, dbSNP, reference, intervals.
class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS {
  this.memoryLimit = 3
  this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" )
  this.D = new File(t.dbsnpFile)
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  // `samples` comes from the enclosing QScript (declared outside this chunk).
  this.sample = samples
}
|
||||
|
||||
// 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf
class snpEvaluation(t: Target) extends EvalBase(t) {
  // The Omni comp track only exists for b37-family references.
  if (t.reference == b37 || t.reference == hg19) this.comp :+= new TaggedFile( omni_b37, "omni" )
  this.eval :+= t.recalibratedSnpVCF
  this.out = t.evalFile
  this.analysisName = t.name + "_VEs"
  this.jobName = queueLogDir + t.name + ".snpeval"
}
|
||||
|
||||
// 5b.) Indel Evaluation (OPTIONAL)
// Evaluates the recalibrated (VQSR-cut) indel callset against the
// Mills/1000G gold-standard indels.
class indelEvaluation(t: Target) extends EvalBase(t) {
  this.eval :+= t.recalibratedIndelVCF
  this.comp :+= new TaggedFile(indelGoldStandardCallset, "indelGS" )
  // Replace the default eval modules with an indel-focused set.
  this.noEV = true
  this.evalModule = List("CompOverlap", "CountVariants", "TiTvVariantEvaluator", "ValidationReport", "IndelStatistics")
  this.out = t.evalIndelFile
  this.analysisName = t.name + "_VEi"
  // BUG FIX: queueLogDir was concatenated twice here, yielding a job name of
  // ".qlog/.qlog/<name>.indeleval"; every other job prefixes it exactly once.
  this.jobName = queueLogDir + t.name + ".indeleval"
}
|
||||
}
|
||||
|
|
@ -1,188 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.util.QScriptUtils
|
||||
import net.sf.samtools.SAMFileHeader.SortOrder
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups}
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 4/20/11
|
||||
 * Time: 16:29
|
||||
*/
|
||||
|
||||
|
||||
/**
 * Queue pipeline for processing PacBio data: optionally aligns FASTA/FASTQ
 * input with BWA (bwasw), fixes read groups / mapping qualities, then runs
 * two-pass base quality score recalibration (BQSR) on the resulting BAM.
 */
class PacbioProcessingPipeline extends QScript {

  @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
  var input: File = _

  @Input(doc="Reference fasta file", shortName="R", required=true)
  var reference: File = _

  @Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
  var dbSNP: File = _

  @Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
  var threads: Int = 0

  @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
  var sample: String = "NA"

  @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
  var bwaPath: File = _

  @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
  var BLASR_BAM: Boolean = false

  @Hidden
  @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
  var dbq: Int = 20

  // NOTE(review): bwastring and testMode are never referenced in this script
  // body — confirm whether they are vestigial.
  @Hidden
  @Argument(shortName="bwastring", required=false)
  var bwastring: String = ""

  @Hidden
  @Argument(shortName = "test", fullName = "test_mode", required = false)
  var testMode: Boolean = false

  // Directory prefix used for job/analysis names.
  val queueLogDir: String = ".qlog/"

  // Per input file: decide whether alignment is needed, wire up the
  // FASTA/FASTQ->BAM steps (or BLASR reorder/MQ fix), then two-pass BQSR.
  def script() {

    val fileList: Seq[File] = QScriptUtils.createSeqFromFile(input)

    for (file: File <- fileList) {

      var USE_BWA: Boolean = false
      // FASTA input has no base qualities, so a default quality is injected
      // during the first recalibration pass; FASTQ/BAM keep their own.
      var resetQuals: Boolean = true

      // NOTE(review): endsWith on File presumably resolves via an implicit
      // File->String view supplied by the Queue DSL — confirm.
      if (file.endsWith(".fasta") || file.endsWith(".fq") || file.endsWith(".fastq")) {
        if (bwaPath == null) {
          throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
        }
        USE_BWA = true
        if (file.endsWith(".fq") || file.endsWith(".fastq"))
          resetQuals = false
      }

      // FASTA -> BAM steps
      val alignedSam: File = file.getName + ".aligned.sam"
      val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
      val rgBam: File = swapExt(file, ".bam", ".rg.bam")

      // Base BAM for the downstream steps: the read-grouped BAM when we
      // aligned ourselves, otherwise the input file as-is.
      val bamBase = if (USE_BWA) {rgBam} else {file}

      // BAM Steps
      val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam")
      val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.table")
      val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.table")
      val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")
      // NOTE(review): path1/path2 are computed but never used below — confirm
      // whether an analyze-covariates step was removed.
      val path1: String = recalBam + ".before"
      val path2: String = recalBam + ".after"

      if (USE_BWA) {
        add(align(file, alignedSam),
            sortSam(alignedSam, sortedBam),
            addReadGroup(sortedBam, rgBam, sample))
      }
      else if (BLASR_BAM) {
        // BLASR BAMs need contig reordering and a mapping-quality rewrite
        // before they can be recalibrated.
        val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam")
        add(reorder(bamBase, reorderedBAM),
            changeMQ(reorderedBAM, mqBAM))
      }

      val bam = if (BLASR_BAM) {mqBAM} else {bamBase}

      // Two-pass BQSR: build table, recalibrate, then re-tabulate on the
      // recalibrated BAM (second pass never resets qualities).
      add(cov(bam, recalFile1, resetQuals),
          recal(bam, recalFile1, recalBam),
          cov(recalBam, recalFile2, false))
    }
  }

  // General arguments to non-GATK tools
  trait ExternalCommonArgs extends CommandLineFunction {
    this.memoryLimit = 4
    // Intermediate outputs are deleted once downstream jobs finish.
    this.isIntermediate = true
  }

  trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
    this.reference_sequence = reference
  }

  // Align FASTA/FASTQ input with bwa bwasw, writing a SAM file.
  case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam
    this.memoryLimit = 8
    this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
    this.jobName = queueLogDir + outSam + ".bwa_sam_se"
  }

  // Coordinate-sort the aligned SAM into a BAM (Picard SortSam).
  case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs {
    this.input = List(inSam)
    this.output = outBam
    this.memoryLimit = 8
    this.sortOrder = SortOrder.coordinate
    this.analysisName = queueLogDir + outBam + ".sortSam"
    this.jobName = queueLogDir + outBam + ".sortSam"
  }

  // Reorder BAM contigs to match the reference dictionary (Picard ReorderSam).
  case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs {
    this.input = List(inSam)
    this.output = outSam
    this.sortReference = reference
  }

  // Rewrite mapping qualities via the ReassignMappingQuality read filter
  // (needed for BLASR-produced BAMs).
  case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
    this.input_file :+= inBam
    this.out = outBam
    this.read_filter :+= "ReassignMappingQuality"
  }

  // Stamp a fixed PacBio read group onto the BAM (Picard AddOrReplaceReadGroups).
  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
    this.input = List(inBam)
    this.output = outBam
    this.RGID = "1"
    this.RGCN = "BI"
    this.RGPL = "PacBio_RS"
    this.RGSM = sample
    this.RGLB = "default_library"
    this.RGPU = "default_pu"
    this.analysisName = queueLogDir + outBam + ".rg"
    this.jobName = queueLogDir + outBam + ".rg"
  }

  // Build a BQSR covariates table; optionally substitute the default base
  // quality (dbq) when the input carries none (FASTA-derived reads).
  case class cov (inBam: File, outRecalFile: File, resetQuals: Boolean) extends BaseRecalibrator with CommandLineGATKArgs {
    if (resetQuals)
      this.DBQ = dbq
    this.knownSites :+= dbSNP
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate")
    this.input_file :+= inBam
    this.disable_indel_quals = true
    this.out = outRecalFile
    this.analysisName = queueLogDir + outRecalFile + ".covariates"
    this.jobName = queueLogDir + outRecalFile + ".covariates"
    this.scatterCount = threads
    this.read_filter :+= "BadCigar"
  }

  // Apply the BQSR table with PrintReads; this output is the pipeline's
  // final deliverable, so it is NOT marked intermediate.
  case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
    this.DBQ = dbq
    this.input_file :+= inBam
    this.BQSR = inRecalFile
    this.out = outBam
    this.isIntermediate = false
    this.analysisName = queueLogDir + outBam + ".recalibration"
    this.jobName = queueLogDir + outBam + ".recalibration"
    this.read_filter :+= "BadCigar"
    this.scatterCount = threads
  }
}
|
||||
Loading…
Reference in New Issue