Merge branch 'master' of github.com:broadinstitute/gsa-unstable

This commit is contained in:
Eric Banks 2013-01-07 15:00:35 -05:00
commit 1a4b112865
3 changed files with 0 additions and 1057 deletions

View File

@ -1,496 +0,0 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.picard._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.utils.baq.BAQ.CalculationMode
import collection.JavaConversions._
import net.sf.samtools.SAMFileReader
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.util.QScriptUtils
import org.broadinstitute.sting.queue.function.ListWriterFunction
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.commandline
/**
 * Queue pipeline that takes one or more BAM files and produces per-sample
 * cleaned / deduped / recalibrated BAMs, optionally re-mapping the reads with
 * BWA (single-end, pair-end or BWA-SW) first.
 */
class DataProcessingPipeline extends QScript {
qscript =>
/****************************************************************************
* Required Parameters
****************************************************************************/
@Input(doc="input BAM file - or list of BAM files", fullName="input", shortName="i", required=true)
var input: File = _
@Input(doc="Reference fasta file", fullName="reference", shortName="R", required=true)
var reference: File = _
@Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true)
var dbSNP: Seq[File] = Seq()
/****************************************************************************
* Optional Parameters
****************************************************************************/
@Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false)
var indels: Seq[File] = Seq()
@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Argument(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
var projectName: String = "project"
@Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
var outputDir: String = ""
@Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
var intervalString: String = ""
@Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false)
var intervals: File = _
// Parsed into a ConsensusDeterminationModel by getIndelCleaningModel; any
// unrecognized value silently falls back to USE_READS.
@Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
var cleaningModel: String = "USE_READS"
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
var useBWAse: Boolean = false
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false
@Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
var useBWAsw: Boolean = false
@Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
@Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
var validation: Boolean = false
/****************************************************************************
* Hidden Parameters
****************************************************************************/
@Hidden
@Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
var nContigs: Int = -1
@Hidden
@Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
var defaultPlatform: String = ""
@Hidden
@Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
var testMode: Boolean = false
/****************************************************************************
* Global Variables
****************************************************************************/
val queueLogDir: String = ".qlog/" // Gracefully hide Queue's output
// Resolved from the `cleaningModel` string at the start of script().
var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS
/****************************************************************************
* Helper classes and methods
****************************************************************************/
/**
 * Immutable holder for the BAM @RG fields that addReadGroup needs in order to
 * restore read-group information on BWA-realigned output.
 */
class ReadGroup(
  val id: String, // read group ID (@RG ID)
  val lb: String, // library (LB)
  val pl: String, // platform (PL)
  val pu: String, // platform unit (PU)
  val sm: String, // sample (SM)
  val cn: String, // sequencing center (CN)
  val ds: String  // description (DS)
)
// Utility function to merge all bam files of similar samples. Generates one BAM file per sample.
// It uses the sample information on the header of the input BAM files.
//
// Because the realignment only happens after these scripts are executed, in case you are using
// bwa realignment, this function will operate over the original bam files and output over the
// (to be realigned) bam files.
//
// Fix: the SAMFileReader opened for each BAM is now closed (the original code
// leaked one open file handle per input BAM), and the two parallel lists are
// walked with zip instead of a manually-advanced iterator.
def createSampleFiles(bamFiles: Seq[File], realignedBamFiles: Seq[File]): Map[String, Seq[File]] = {
// sample name -> all (to-be-realigned) BAM files carrying reads for that sample
val sampleTable = scala.collection.mutable.Map.empty[String, Seq[File]]
for ((bam, rBam) <- bamFiles.zip(realignedBamFiles)) {
val samReader = new SAMFileReader(bam)
try {
val readGroups = samReader.getFileHeader.getReadGroups
// only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
// with PrintReads to separate the samples. Tell user to do it himself!
assert(!QScriptUtils.hasMultipleSamples(readGroups), "The pipeline requires that only one sample is present in a BAM file. Please separate the samples in " + bam)
// Fill out the sample table with the readgroups in this file
for (rg <- readGroups) {
val sample = rg.getSample
if (!sampleTable.contains(sample))
sampleTable(sample) = Seq(rBam)
else if (!sampleTable(sample).contains(rBam))
sampleTable(sample) :+= rBam
}
} finally {
samReader.close() // release the underlying file handle
}
}
sampleTable.toMap
}
// Rebuilds the Read Group string to give BWA
// Emits one addReadGroup job per read group found in the original header,
// chaining intermediate "<n>-rg.bam" files together: the first job consumes
// inBam and the last one produces outBam. `index` counts DOWN from the number
// of read groups to 1.
def addReadGroups(inBam: File, outBam: File, samReader: SAMFileReader) {
val readGroups = samReader.getFileHeader.getReadGroups
var index: Int = readGroups.length
for (rg <- readGroups) {
// First iteration reads inBam; later ones read the previous chain link.
val intermediateInBam: File = if (index == readGroups.length) { inBam } else { swapExt(outBam, ".bam", index+1 + "-rg.bam") }
// Last iteration (index == 1) writes the final outBam.
val intermediateOutBam: File = if (index > 1) {swapExt(outBam, ".bam", index + "-rg.bam") } else { outBam}
val readGroup = new ReadGroup(rg.getReadGroupId, rg.getLibrary, rg.getPlatform, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription)
add(addReadGroup(intermediateInBam, intermediateOutBam, readGroup))
index = index - 1
}
}
// Re-aligns every input BAM with the requested flavour of BWA (single-end,
// pair-end, or BWA-SW) and restores the read-group information afterwards.
// Returns one realigned, read-grouped BAM per input BAM, in input order.
def performAlignment(bams: Seq[File]): Seq[File] = {
for ((bam, i) <- bams.zipWithIndex) yield {
val index = i + 1 // 1-based suffix used in all intermediate file names
val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam")
val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam")
if (useBWAse) {
// single-ended: revert to original qualities, then aln + samse
val revertedBAM = revertBAM(bam, true)
add(bwa_aln_se(revertedBAM, saiFile1),
bwa_sam_se(revertedBAM, saiFile1, realignedSamFile))
} else if (useBWApe) {
// pair-ended: one .sai per mate, then sampe
val revertedBAM = revertBAM(bam, true)
add(bwa_aln_pe(revertedBAM, saiFile1, 1),
bwa_aln_pe(revertedBAM, saiFile2, 2),
bwa_sam_pe(revertedBAM, saiFile1, saiFile2, realignedSamFile))
} else if (useBWAsw) {
// BWA-SW aligns from fastq; alignment info is kept on the reverted BAM
val revertedBAM = revertBAM(bam, false)
val fastQ = swapExt(revertedBAM, ".bam", ".fq")
add(convertToFastQ(revertedBAM, fastQ),
bwa_sw(fastQ, realignedSamFile))
}
add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
rgRealignedBamFile
}
}
// Translates the `cleaningModel` command-line string into the IndelRealigner
// enum; anything unrecognized falls back to USE_READS.
def getIndelCleaningModel: ConsensusDeterminationModel =
cleaningModel match {
case "KNOWNS_ONLY" => ConsensusDeterminationModel.KNOWNS_ONLY
case "USE_SW" => ConsensusDeterminationModel.USE_SW
case _ => ConsensusDeterminationModel.USE_READS
}
// Adds one revert job per input BAM and returns the reverted files, in order.
def revertBams(bams: Seq[File], removeAlignmentInformation: Boolean): Seq[File] =
bams.map(bam => revertBAM(bam, removeAlignmentInformation))
// Adds a single Picard RevertSam job for `bam` and returns the output path
// ("<bam>.reverted.bam").
def revertBAM(bam: File, removeAlignmentInformation: Boolean): File = {
val reverted = swapExt(bam, ".bam", ".reverted.bam")
add(revert(bam, reverted, removeAlignmentInformation))
reverted
}
/****************************************************************************
* Main script
****************************************************************************/
/**
 * Pipeline entry point: optionally re-align with BWA, then per sample run
 * indel realignment, duplicate marking and base-quality recalibration, and
 * finally write a ".cohort.list" file enumerating all processed BAMs.
 */
def script() {
// final output list of processed bam files
var cohortList: Seq[File] = Seq()
// sets the model for the Indel Realigner
cleanModelEnum = getIndelCleaningModel
// keep a record of the number of contigs in the first bam file in the list
val bams = QScriptUtils.createSeqFromFile(input)
if (nContigs < 0)
nContigs = QScriptUtils.getNumberOfContigs(bams(0))
val realignedBAMs = if (useBWApe || useBWAse || useBWAsw) {performAlignment(bams)} else {revertBams(bams, false)}
// generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, Seq[File]] = createSampleFiles(bams, realignedBAMs)
// if this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")
// BUG FIX: the original compared the *String* option `cleaningModel` against
// the ConsensusDeterminationModel enum, which can never be equal, so the
// KNOWNS_ONLY branches were dead. Compare the resolved enum instead.
if (cleanModelEnum == ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(null, globalIntervals))
// put each sample through the pipeline
for ((sample, bamList) <- sampleBAMFiles) {
// BAM files generated by the pipeline
val bam = new File(qscript.projectName + "." + sample + ".bam")
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
// Accessory files
val targetIntervals = if (cleanModelEnum == ConsensusDeterminationModel.KNOWNS_ONLY) {globalIntervals} else {swapExt(bam, ".bam", ".intervals")}
val metricsFile = swapExt(bam, ".bam", ".metrics")
val preRecalFile = swapExt(bam, ".bam", ".pre_recal.table")
val postRecalFile = swapExt(bam, ".bam", ".post_recal.table")
val preOutPath = swapExt(bam, ".bam", ".pre")
val postOutPath = swapExt(bam, ".bam", ".post")
val preValidateLog = swapExt(bam, ".bam", ".pre.validation")
val postValidateLog = swapExt(bam, ".bam", ".post.validation")
// Validation is an optional step for the BAM file generated after
// alignment and the final bam file of the pipeline.
if (validation) {
for (sampleFile <- bamList)
add(validate(sampleFile, preValidateLog),
validate(recalBam, postValidateLog))
}
if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(bamList, targetIntervals))
add(clean(bamList, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
cov(recalBam, postRecalFile))
cohortList :+= recalBam
}
// output a BAM list with all the processed per sample files
val cohortFile = new File(qscript.outputDir + qscript.projectName + ".cohort.list")
add(writeList(cohortList, cohortFile))
}
/****************************************************************************
* Classes (GATK Walkers)
****************************************************************************/
// General arguments to non-GATK tools
trait ExternalCommonArgs extends CommandLineFunction {
// Defaults shared by every job in this pipeline: 4 GB memory limit, and mark
// outputs as intermediate so Queue may delete them once dependents finish.
this.memoryLimit = 4
this.isIntermediate = true
}
// General arguments to GATK walkers
trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
// Every GATK walker runs against the pipeline-wide reference sequence.
this.reference_sequence = qscript.reference
}
// Common arguments for Picard tools that read/write BAM records.
trait SAMargs extends PicardBamFunction with ExternalCommonArgs {
this.maxRecordsInRam = 100000
}
// RealignerTargetCreator: builds the interval list for indel realignment.
// In KNOWNS_ONLY mode it runs once globally with inBams == null (known-site
// VCFs only); otherwise once per sample over that sample's BAMs.
case class target (inBams: Seq[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
this.input_file = inBams
this.out = outIntervals
this.mismatchFraction = 0.0
this.known ++= qscript.dbSNP
// NOTE(review): `indels` is initialized to Seq() at declaration, so this null
// check looks always-true; kept as-is.
if (indels != null)
this.known ++= qscript.indels
this.scatterCount = nContigs
this.analysisName = queueLogDir + outIntervals + ".target"
this.jobName = queueLogDir + outIntervals + ".target"
}
// IndelRealigner: locally realigns reads around the target intervals using the
// known indel sites and the consensus model selected on the command line.
case class clean (inBams: Seq[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file = inBams
this.targetIntervals = tIntervals
this.out = outBam
this.known ++= qscript.dbSNP
if (qscript.indels != null)
this.known ++= qscript.indels
this.consensusDeterminationModel = cleanModelEnum
// Output is intermediate; skip BAM compression to save CPU.
this.compress = 0
// In test mode omit the @PG tag so outputs stay byte-stable across versions.
this.noPGTag = qscript.testMode;
this.scatterCount = nContigs
this.analysisName = queueLogDir + outBam + ".clean"
this.jobName = queueLogDir + outBam + ".clean"
}
// BaseRecalibrator: builds a base-quality recalibration table for inBam, run
// both before and after recalibration so the two tables can be compared.
case class cov (inBam: File, outRecalFile: File) extends BaseRecalibrator with CommandLineGATKArgs {
this.knownSites ++= qscript.dbSNP
this.covariate ++= Seq("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate")
this.input_file :+= inBam
this.disable_indel_quals = true
this.out = outRecalFile
// Techdev override: force a platform when the BAMs lack/misstate PL.
if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform
// -L as a string takes precedence over an intervals file.
if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString)
else if (qscript.intervals != null) this.intervals :+= qscript.intervals
this.scatterCount = nContigs
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
}
// PrintReads with -BQSR: applies the recalibration table and writes the final
// recalibrated BAM (kept — not intermediate), computing BAQ as necessary.
case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
this.input_file :+= inBam
this.BQSR = inRecalFile
this.baq = CalculationMode.CALCULATE_AS_NECESSARY
this.out = outBam
// -L as a string takes precedence over an intervals file (same rule as cov).
if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString)
else if (qscript.intervals != null) this.intervals :+= qscript.intervals
this.scatterCount = nContigs
// This is a pipeline end product; don't let Queue delete it.
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
}
/****************************************************************************
* Classes (non-GATK programs)
****************************************************************************/
// Picard MarkDuplicates: flags duplicate reads and writes a metrics file.
case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs {
this.input :+= inBam
this.output = outBam
this.metrics = metricsFile
// Duplicate marking is memory hungry; override the 4 GB trait default.
this.memoryLimit = 16
this.analysisName = queueLogDir + outBam + ".dedup"
this.jobName = queueLogDir + outBam + ".dedup"
}
// Picard MergeSamFiles: merges several BAMs into one.
// NOTE(review): no call site is visible in this file's script(); appears to be
// kept for ad-hoc use — confirm before removing.
case class joinBams (inBams: Seq[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs {
this.input = inBams
this.output = outBam
this.analysisName = queueLogDir + outBam + ".joinBams"
this.jobName = queueLogDir + outBam + ".joinBams"
}
// Picard SortSam: sorts a SAM/BAM into the requested order (used to coordinate-
// sort the SAM emitted by BWA).
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs {
this.input :+= inSam
this.output = outBam
this.sortOrder = sortOrderP
this.analysisName = queueLogDir + outBam + ".sortSam"
this.jobName = queueLogDir + outBam + ".sortSam"
}
// Picard ValidateSamFile: sanity-checks a BAM against the reference and writes
// the report to outLog (kept — not intermediate).
case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs {
this.input :+= inBam
this.output = outLog
this.REFERENCE_SEQUENCE = qscript.reference
this.isIntermediate = false
this.analysisName = queueLogDir + outLog + ".validate"
this.jobName = queueLogDir + outLog + ".validate"
}
// Picard AddOrReplaceReadGroups: stamps the @RG fields captured in ReadGroup
// back onto a BWA-realigned BAM (BWA output loses read-group information).
case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs {
this.input :+= inBam
this.output = outBam
this.RGID = readGroup.id
this.RGCN = readGroup.cn
this.RGDS = readGroup.ds
this.RGLB = readGroup.lb
this.RGPL = readGroup.pl
this.RGPU = readGroup.pu
this.RGSM = readGroup.sm
this.analysisName = queueLogDir + outBam + ".rg"
this.jobName = queueLogDir + outBam + ".rg"
}
// Picard RevertSam: restores original qualities and (optionally) strips the
// alignment so a BAM can be re-processed from a near-raw state. Query-name
// sorted when the alignment is removed (what BWA expects), coordinate sorted
// otherwise.
case class revert (inBam: File, outBam: File, removeAlignmentInfo: Boolean) extends RevertSam with ExternalCommonArgs {
this.output = outBam
this.input :+= inBam
this.removeAlignmentInformation = removeAlignmentInfo
this.sortOrder = if (removeAlignmentInfo) {SortOrder.queryname} else {SortOrder.coordinate}
// Fix: analysisName was missing the "." separator ("<bam>revert"), unlike
// jobName and every other job in this file.
this.analysisName = queueLogDir + outBam + ".revert"
this.jobName = queueLogDir + outBam + ".revert"
}
// Picard SamToFastq: converts a BAM to fastq for BWA-SW alignment.
case class convertToFastQ (inBam: File, outFQ: File) extends SamToFastq with ExternalCommonArgs {
this.input :+= inBam
this.fastq = outFQ
// Fix: analysisName was missing the "." separator ("<fq>convert_to_fastq"),
// unlike jobName and every other job in this file.
this.analysisName = queueLogDir + outFQ + ".convert_to_fastq"
this.jobName = queueLogDir + outFQ + ".convert_to_fastq"
}
// "bwa aln" for single-ended reads, reading directly from a BAM (-b) and
// writing the suffix-array index (.sai) file.
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file") var sai = outSai
// Identical command string to before, assembled token-by-token.
def commandLine = Seq(bwaPath, "aln", "-t", bwaThreads, "-q", "5", reference, "-b", bam, ">", sai).mkString(" ")
this.analysisName = queueLogDir + outSai + ".bwa_aln_se"
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
}
// "bwa aln" for one mate of a pair-ended BAM: -b1/-b2 selects which mate,
// producing one .sai per mate across two jobs.
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
// Identical command string to before, assembled token-by-token.
def commandLine = Seq(bwaPath, "aln", "-t", bwaThreads, "-q", "5", reference, "-b" + index, bam, ">", sai).mkString(" ")
this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1"
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
}
// "bwa samse": combines the single-end .sai with the BAM to produce the
// aligned SAM output.
case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Input(doc="bwa alignment index file") var sai = inSai
@Output(doc="output aligned bam file") var alignedBam = outBam
// Identical command string to before, assembled token-by-token.
def commandLine = Seq(bwaPath, "samse", reference, sai, bam, ">", alignedBam).mkString(" ")
this.memoryLimit = 6
this.analysisName = queueLogDir + outBam + ".bwa_sam_se"
this.jobName = queueLogDir + outBam + ".bwa_sam_se"
}
// "bwa sampe": combines both mates' .sai files with the BAM to produce the
// aligned SAM output. The same BAM is passed twice, once per mate slot.
case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1
@Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2
@Output(doc="output aligned bam file") var alignedBam = outBam
// Identical command string to before, assembled token-by-token.
def commandLine = Seq(bwaPath, "sampe", reference, sai1, sai2, bam, bam, ">", alignedBam).mkString(" ")
this.memoryLimit = 6
this.analysisName = queueLogDir + outBam + ".bwa_sam_pe"
this.jobName = queueLogDir + outBam + ".bwa_sam_pe"
}
// "bwa bwasw": long-read Smith-Waterman alignment from fastq input.
case class bwa_sw (inFastQ: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
@Input(doc="fastq file to be aligned") var fq = inFastQ
@Output(doc="output bam file") var bam = outBam
// Identical command string to before, assembled token-by-token.
def commandLine = Seq(bwaPath, "bwasw", "-t", bwaThreads, reference, fq, ">", bam).mkString(" ")
this.analysisName = queueLogDir + outBam + ".bwasw"
this.jobName = queueLogDir + outBam + ".bwasw"
}
// Writes the final cohort ".list" file enumerating all processed BAMs.
case class writeList(inBams: Seq[File], outBamList: File) extends ListWriterFunction {
this.inputFiles = inBams
this.listFile = outBamList
this.analysisName = queueLogDir + outBamList + ".bamList"
this.jobName = queueLogDir + outBamList + ".bamList"
}
}

View File

@ -1,373 +0,0 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
/**
 * Methods-development calling pipeline: runs the Unified Genotyper (SNPs and
 * optionally indels) over a set of preset target datasets, then VQSR
 * recalibration/cutting and VariantEval on the results.
 */
class MethodsDevelopmentCallingPipeline extends QScript {
qscript =>
@Argument(shortName="outputDir", doc="output directory", required=false)
var outputDir: String = "./"
@Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
var skipCalling: Boolean = false
// Keys into targetDataSets below; an empty list means "run everything".
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil
@Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false)
var runGoldStandard: Boolean = false
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
var noBAQ: Boolean = false
@Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false)
var noIndels: Boolean = false
// Negative sentinel means "use the walker's default".
@Argument(shortName="mbq", doc="The minimum Phred-Scaled quality score threshold to be considered a good base.", required=false)
var minimumBaseQuality: Int = -1
// Negative sentinel means "use the walker's default".
@Argument(shortName="deletions", doc="Maximum deletion fraction allowed at a site to call a genotype.", required=false)
var deletions: Double = -1
@Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false)
var samples: List[String] = Nil
// Bundles everything the pipeline needs to know about one dataset: reference,
// resource files, the BAM list to call from, interval list, VQSR tranche
// targets, and dataset shape flags. All derived output paths live under
// outputDir (or outputDir/goldStandard/ for the gold-standard run).
class Target(
val baseName: String,
val reference: File,
val dbsnpFile: String,
val hapmapFile: String,
val maskFile: String,
val bamList: File,
val goldStandard_VCF: File,
val intervals: String,
val indelTranchTarget: Double,
val snpTrancheTarget: Double,
val isLowpass: Boolean,
val isExome: Boolean,
val nSamples: Int) {
val name = qscript.outputDir + baseName
val clusterFile = new File(name + ".clusters")
val rawSnpVCF = new File(name + ".raw.vcf")
val rawIndelVCF = new File(name + ".raw.indel.vcf")
val filteredIndelVCF = new File(name + ".filtered.indel.vcf")
val recalibratedSnpVCF = new File(name + ".snp.recalibrated.vcf")
val recalibratedIndelVCF = new File(name + ".indel.recalibrated.vcf")
val tranchesSnpFile = new File(name + ".snp.tranches")
val tranchesIndelFile = new File(name + ".indel.tranches")
val vqsrSnpRscript = name + ".snp.vqsr.r"
val vqsrIndelRscript = name + ".indel.vqsr.r"
val recalSnpFile = new File(name + ".snp.tranches.recal")
val recalIndelFile = new File(name + ".indel.tranches.recal")
// NOTE(review): the three goldStandard* paths concatenate without a "."
// separator (e.g. "<base>goldStandard.tranches"), unlike every other field
// here — looks like a missing "." but changing it would rename outputs;
// confirm before fixing.
val goldStandardRecalibratedVCF = new File(name + "goldStandard.recalibrated.vcf")
val goldStandardTranchesFile = new File(name + "goldStandard.tranches")
val goldStandardRecalFile = new File(name + "goldStandard.tranches.recal")
val evalFile = new File(name + ".snp.eval")
val evalIndelFile = new File(name + ".indel.eval")
val goldStandardName = qscript.outputDir + "goldStandard/" + baseName
val goldStandardClusterFile = new File(goldStandardName + ".clusters")
}
// Reference sequences and standard resource files (Broad filesystem paths),
// plus readability aliases for the boolean Target flags below.
val b37_decoy = new File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta")
val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP_hg18_129 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_hg18.rod" // Special case for NA12878 collections that can't use 132 because they are part of it.
val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132.b36.excluding_sites_after_129.vcf"
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" // Special case for NA12878 collections that can't use 132 because they are part of it.
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
val training_hapmap_b37 = "/humgen/1kg/processing/pipeline_test_bams/hapmap3.3_training_chr20.vcf"
val omni_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b36.vcf"
val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_1525_samples.b37.vcf"
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
val training_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.highQuality.vcf"
val badSites_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/phase1.wgs.projectConsensus.v2b.recal.terrible.vcf"
val projectConsensus_1000G = "/humgen/1kg/processing/official_release/phase1/projectConsensus/ALL.wgs.projectConsensus_v2b.20101123.snps.sites.vcf"
val indelGoldStandardCallset = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf"
// Used as (lowPass / !lowPass etc.) in the Target rows for readability.
val lowPass: Boolean = true
val exome: Boolean = true
val indels: Boolean = true
val queueLogDir = ".qlog/"
// BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references
// Preset dataset catalog keyed by the -dataset argument. Each row supplies:
// reference, dbSNP/HapMap/indel-mask resources, BAM list, gold-standard VCF,
// interval list, indel/SNP tranche targets, lowpass/exome flags, sample count.
val targetDataSets: Map[String, Target] = Map(
"NA12878_gold" -> new Target("NA12878.goldStandard", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/data/goldStandard.list"),
new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** There is no gold standard for the gold standard **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, lowPass, !exome, 391),
"NA12878_wgs_b37" -> new Target("NA12878.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
"NA12878_wgs_decoy" -> new Target("NA12878.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
"NA12878_wgs_hg18" -> new Target("NA12878.HiSeq.WGS.hg18", hg18, dbSNP_hg18_129, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 90.0, 99.0, !lowPass, !exome, 1),
"NA12878_wex_b37" -> new Target("NA12878.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/seq/picard_aggregation/C339/NA12878/v3/NA12878.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
"NA12878_wex_hg18" -> new Target("NA12878.HiSeq.WEx.hg18", hg18, dbSNP_hg18_129, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
"NA12878_wex_decoy" -> new Target("NA12878.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.NA12878.clean.dedup.recal.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 1),
"CEUTrio_wex_b37" -> new Target("CEUTrio.HiSeq.WEx.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.bwa.cleaned.recal.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3),
"CEUTrio_wgs_b37" -> new Target("CEUTrio.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3),
"CEUTrio_wex_decoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 98.0, !lowPass, exome, 3),
"CEUTrio_wgs_decoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"),
new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 3),
"GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 90.0, 99.0, !lowPass, !exome, 1),
"FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 79),
"TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 90.0, 99.0, !lowPass, exome, 96),
"LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 90.0, 99.0, lowPass, !exome, 60), // chunked interval list to use with Queue's scatter/gather functionality
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 90.0, 99.0, lowPass, !exome, 363)
)
def script = {
  // Resolve which Target configurations to run: every configured dataset by
  // default, or only the ones named via the -dataset argument.
  var targets: List[Target] = List()
  if (datasets.isEmpty)
    targetDataSets.valuesIterator.foreach(t => targets ::= t)
  else
    datasets.foreach(ds => targets ::= targetDataSets(ds))

  val goldStandard = true
  for (target <- targets) {
    if (!skipCalling) {
      // Indel branch: call, VQSR-train, tranche-cut and evaluate (unless disabled).
      if (!noIndels)
        add(new indelCall(target), new indelRecal(target), new indelCut(target), new indelEvaluation(target))
      // SNP branch: call, then VQSR recalibration, tranche cut and evaluation.
      add(new snpCall(target))
      add(new snpRecal(target, !goldStandard))
      add(new snpCut(target, !goldStandard))
      add(new snpEvaluation(target))
    }
    // Optionally push the gold-standard call set through the same VQSR steps.
    if (runGoldStandard) {
      add(new snpRecal(target, goldStandard))
      add(new snpCut(target, goldStandard))
    }
  }
}
// Arguments applied to every GATK command assembled by this pipeline.
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
logging_level = "INFO";   // keep walker output at INFO verbosity
memoryLimit = 4;          // heap limit (GB) for each job
phone_home = GATKRunReport.PhoneHomeOption.NO_ET   // disable phone-home usage reporting
}
// Path of the on-disk index (.bai) that accompanies the given BAM file.
def bai(bam: File) = new File(bam.toString + ".bai")
// 1.) Unified Genotyper Base
// Base UnifiedGenotyper configuration shared by the SNP and indel callers:
// reference, chunked intervals, scatter width, and depth-class tuning.
class GenotyperBase (t: Target) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
  // Low-pass targets use relaxed confidence thresholds and lighter downsampling.
  private val lowpass = t.isLowpass
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  this.scatterCount = 140
  this.nt = 2
  this.dcov = if (lowpass) 50 else 250
  this.stand_call_conf = if (lowpass) 4.0 else 30.0
  this.stand_emit_conf = if (lowpass) 4.0 else 30.0
  this.input_file :+= t.bamList
  this.D = new File(t.dbsnpFile)
}
// 1a.) Call SNPs with UG
// SNP calling job; writes the raw (unrecalibrated) SNP VCF for a target.
class snpCall (t: Target) extends GenotyperBase(t) {
  // Negative command-line values mean "keep the GATK default".
  if (minimumBaseQuality >= 0)
    this.min_base_quality_score = minimumBaseQuality
  if (qscript.deletions >= 0)
    this.max_deletion_fraction = qscript.deletions
  this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP
  // BAQ is switched off when disabled by flag or when calling exome targets.
  this.baq =
    if (noBAQ || t.isExome) org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF
    else org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY
  this.out = t.rawSnpVCF
  this.analysisName = t.name + "_UGs"
  this.jobName = queueLogDir + t.name + ".snpcall"
}
// 1b.) Call Indels with UG
// Indel calling job; writes the raw indel VCF for a target.
class indelCall (t: Target) extends GenotyperBase(t) {
  this.memoryLimit = 6   // indel genotyping needs more heap than SNP calling
  this.glm = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.INDEL
  this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF   // BAQ is not applied for indels
  this.out = t.rawIndelVCF
  this.analysisName = t.name + "_UGi"
  this.jobName = queueLogDir + t.name + ".indelcall"
}
// 2.) Hard Filtering for indels
// Hard-filtering job for raw indel calls (name/expression pairs mark failures).
class indelFilter (t: Target) extends VariantFiltration with UNIVERSAL_GATK_ARGS {
  this.memoryLimit = 2
  this.reference_sequence = t.reference
  this.intervalsString ++= List(t.intervals)
  this.scatterCount = 10
  this.V = t.rawIndelVCF
  this.out = t.filteredIndelVCF
  // Filter names paired with the JEXL expressions that flag failing sites.
  // InbreedingCoeff is population-wide and only meaningful with >= 10 samples.
  private val filters =
    List("IndelQD" -> "QD < 2.0",
         "IndelReadPosRankSum" -> "ReadPosRankSum < -20.0",
         "IndelFS" -> "FS > 200.0") ++
    (if (t.nSamples >= 10) List("IndelInbreedingCoeff" -> "InbreedingCoeff < -0.8") else Nil)
  this.filterName ++= filters.map(_._1)
  this.filterExpression ++= filters.map(_._2)
  this.analysisName = t.name + "_VF"
  this.jobName = queueLogDir + t.name + ".indelfilter"
}
// Shared VariantRecalibrator setup: reference, intervals, and the full ladder
// of truth-sensitivity tranches to evaluate.
class VQSRBase(t: Target) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS {
this.nt = 2
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
// allPoly: presumably marks training sites as all-polymorphic — NOTE(review): confirm flag semantics.
this.allPoly = true
// Tranche thresholds from keep-everything (100.0) down to the strictest cut (90.0).
this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
}
// VQSR model training for SNPs. With goldStandard=true the model is trained on
// the target's gold-standard VCF instead of the raw call set.
class snpRecal(t: Target, goldStandard: Boolean) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS {
this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawSnpVCF } )
// Training resources with decreasing priors: HapMap (truth), Omni, 1000G
// training sites, dbSNP (known only), and the 1000G project consensus.
this.resource :+= new TaggedFile( t.hapmapFile, "known=false,training=true,truth=true,prior=15.0" )
this.resource :+= new TaggedFile( omni_b37, "known=false,training=true,truth=true,prior=12.0" )
this.resource :+= new TaggedFile( training_1000G, "known=false,training=true,prior=10.0" )
this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
if(t.nSamples >= 10)
this.use_annotation ++= List("InbreedingCoeff") // InbreedingCoeff is a population-wide statistic that requires at least 10 samples to calculate
if(!t.isExome)
this.use_annotation ++= List("DP")
else { // exome specific parameters
this.resource :+= new TaggedFile( badSites_1000G, "bad=true,prior=2.0" )
this.mG = 6
if(t.nSamples <= 3) { // very few exome samples means very few variants
this.mG = 4
this.percentBad = 0.04
}
}
// Gold-standard runs write to their own tranches/recal files so they do not
// clobber the raw-call-set outputs.
this.tranches_file = if ( goldStandard ) { t.goldStandardTranchesFile } else { t.tranchesSnpFile }
this.recal_file = if ( goldStandard ) { t.goldStandardRecalFile } else { t.recalSnpFile }
this.rscript_file = t.vqsrSnpRscript
this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
this.analysisName = t.name + "_VQSRs"
this.jobName = queueLogDir + t.name + ".snprecal"
}
// VQSR model training for indels, using the indel gold-standard call set
// as the truth resource and dbSNP as known sites.
class indelRecal(t: Target) extends VQSRBase(t) with UNIVERSAL_GATK_ARGS {
  this.input :+= t.rawIndelVCF
  this.resource :+= new TaggedFile(indelGoldStandardCallset, "known=false,training=true,truth=true,prior=12.0" )
  this.resource :+= new TaggedFile( t.dbsnpFile, "known=true,prior=2.0" )
  // InbreedingCoeff is population-wide and only usable with >= 10 samples.
  private val annotations =
    if (t.nSamples >= 10) List("QD", "HaplotypeScore", "ReadPosRankSum", "FS", "InbreedingCoeff")
    else List("QD", "HaplotypeScore", "ReadPosRankSum", "FS")
  this.use_annotation ++= annotations
  this.tranches_file = t.tranchesIndelFile
  this.recal_file = t.recalIndelFile
  this.rscript_file = t.vqsrIndelRscript
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
  this.analysisName = t.name + "_VQSRi"
  this.jobName = queueLogDir + t.name + ".indelrecal"
}
// 4.) Apply the recalibration table to the appropriate tranches
// Shared ApplyRecalibration setup (extra heap; per-target reference/intervals).
class applyVQSRBase (t: Target) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 6
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
}
// Applies the SNP recalibration at the target's tranche level, for either the
// raw call set or the gold-standard run.
class snpCut (t: Target, goldStandard: Boolean) extends applyVQSRBase(t) {
  // Pick the matching input/tranches/recal triple for the requested run.
  private val (inVCF, tranches, recal) =
    if (goldStandard) (t.goldStandard_VCF, t.goldStandardTranchesFile, t.goldStandardRecalFile)
    else (t.rawSnpVCF, t.tranchesSnpFile, t.recalSnpFile)
  this.input :+= inVCF
  this.tranches_file = tranches
  this.recal_file = recal
  this.ts_filter_level = t.snpTrancheTarget
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
  this.out = t.recalibratedSnpVCF
  this.analysisName = t.name + "_AVQSRs"
  this.jobName = queueLogDir + t.name + ".snpcut"
}
// Applies the indel recalibration at the target's indel tranche level.
class indelCut (t: Target) extends applyVQSRBase(t) {
  this.input :+= t.rawIndelVCF
  this.tranches_file = t.tranchesIndelFile
  this.recal_file = t.recalIndelFile
  this.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
  this.ts_filter_level = t.indelTranchTarget
  this.out = t.recalibratedIndelVCF
  this.analysisName = t.name + "_AVQSRi"
  this.jobName = queueLogDir + t.name + ".indelcut"
}
// 5.) Variant Evaluation Base(OPTIONAL)
// Shared VariantEval setup: HapMap comparison track, dbSNP, and sample subset.
class EvalBase(t: Target) extends VariantEval with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 3
this.comp :+= new TaggedFile(t.hapmapFile, "hapmap" )
this.D = new File(t.dbsnpFile)
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
// `samples` comes from the enclosing QScript configuration (declared outside this view).
this.sample = samples
}
// 5a.) SNP Evaluation (OPTIONAL) based on the cut vcf
// Evaluation of the recalibrated SNP call set.
class snpEvaluation(t: Target) extends EvalBase(t) {
  this.eval :+= t.recalibratedSnpVCF
  // The Omni comparison track is only valid on the b37/hg19 coordinate system.
  if (t.reference == b37 || t.reference == hg19)
    this.comp :+= new TaggedFile( omni_b37, "omni" )
  this.out = t.evalFile
  this.analysisName = t.name + "_VEs"
  this.jobName = queueLogDir + t.name + ".snpeval"
}
// 5b.) Indel Evaluation (OPTIONAL)
// Evaluation of the recalibrated indel call set against the gold standard.
class indelEvaluation(t: Target) extends EvalBase(t) {
  this.eval :+= t.recalibratedIndelVCF
  this.comp :+= new TaggedFile(indelGoldStandardCallset, "indelGS" )
  // Disable the default evaluation modules and run only the indel-relevant ones.
  this.noEV = true
  this.evalModule = List("CompOverlap", "CountVariants", "TiTvVariantEvaluator", "ValidationReport", "IndelStatistics")
  this.out = t.evalIndelFile
  this.analysisName = t.name + "_VEi"
  // BUG FIX: queueLogDir was prepended twice, producing a bogus log path;
  // every sibling job uses a single queueLogDir prefix.
  this.jobName = queueLogDir + t.name + ".indeleval"
}
}

View File

@ -1,188 +0,0 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.util.QScriptUtils
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups}
import org.broadinstitute.sting.queue.extensions.gatk._
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 4/20/11
* Time: 16:29
*/
/**
 * PacBio read processing pipeline.
 *
 * For each input file:
 *   fasta/fastq -> BWA-SW alignment -> coordinate sort -> add read groups -> BQSR
 *   BLASR BAM   -> reorder to reference -> reassign mapping qualities    -> BQSR
 *   plain BAM   ->                                                          BQSR
 *
 * BQSR runs as covariates (pass 1) -> recalibrate -> covariates (pass 2), so
 * before/after covariate tables are available for comparison.
 */
class PacbioProcessingPipeline extends QScript {

  @Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
  var input: File = _

  @Input(doc="Reference fasta file", shortName="R", required=true)
  var reference: File = _

  @Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
  var dbSNP: File = _

  @Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
  var threads: Int = 0

  @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
  var sample: String = "NA"

  @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
  var bwaPath: File = _

  @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
  var BLASR_BAM: Boolean = false

  @Hidden
  @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
  var dbq: Int = 20

  @Hidden
  @Argument(shortName="bwastring", required=false)
  var bwastring: String = ""

  @Hidden
  @Argument(shortName = "test", fullName = "test_mode", required = false)
  var testMode: Boolean = false

  // Prefix used to group per-job log/analysis names.
  val queueLogDir: String = ".qlog/"

  def script() {
    val fileList: Seq[File] = QScriptUtils.createSeqFromFile(input)
    for (file: File <- fileList) {
      // Decide the entry point of the pipeline from the file extension.
      val isFastq = file.endsWith(".fq") || file.endsWith(".fastq")
      val useBwa = isFastq || file.endsWith(".fasta")
      if (useBwa && bwaPath == null)
        throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA")
      // FASTQ input carries real base qualities; everything else gets the
      // default quality (dbq) assigned during the first covariate pass.
      val resetQuals = !isFastq

      // FASTA -> BAM intermediate files
      val alignedSam: File = file.getName + ".aligned.sam"
      val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
      val rgBam: File = swapExt(file, ".bam", ".rg.bam")
      val bamBase = if (useBwa) {rgBam} else {file}

      // BAM processing files
      // (removed unused locals `path1`/`path2`: they were computed but never read)
      val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam")
      val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.table")
      val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.table")
      val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")

      if (useBwa) {
        add(align(file, alignedSam),
            sortSam(alignedSam, sortedBam),
            addReadGroup(sortedBam, rgBam, sample))
      }
      else if (BLASR_BAM) {
        val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam")
        add(reorder(bamBase, reorderedBAM),
            changeMQ(reorderedBAM, mqBAM))
      }

      val bam = if (BLASR_BAM) {mqBAM} else {bamBase}
      // Two covariate passes bracket the recalibration so its effect can be compared.
      add(cov(bam, recalFile1, resetQuals),
          recal(bam, recalFile1, recalBam),
          cov(recalBam, recalFile2, false))
    }
  }

  // General arguments to non-GATK tools.
  trait ExternalCommonArgs extends CommandLineFunction {
    this.memoryLimit = 4
    this.isIntermediate = true   // intermediate outputs may be removed by Queue on success
  }

  // Every GATK walker in this pipeline runs against the pipeline reference.
  trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs {
    this.reference_sequence = reference
  }

  // Align a fasta/fastq with BWA-SW (long-read parameters), writing a SAM file.
  case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
    def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam
    this.memoryLimit = 8
    this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
    this.jobName = queueLogDir + outSam + ".bwa_sam_se"
  }

  // Coordinate-sort a SAM into a BAM with Picard SortSam.
  case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs {
    this.input = List(inSam)
    this.output = outBam
    this.memoryLimit = 8
    this.sortOrder = SortOrder.coordinate
    this.analysisName = queueLogDir + outBam + ".sortSam"
    this.jobName = queueLogDir + outBam + ".sortSam"
  }

  // Reorder a BAM to match the reference contig order (Picard ReorderSam).
  case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs {
    this.input = List(inSam)
    this.output = outSam
    this.sortReference = reference
  }

  // Reassign mapping qualities on BLASR output via the GATK read filter.
  case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
    this.input_file :+= inBam
    this.out = outBam
    this.read_filter :+= "ReassignMappingQuality"
  }

  // Stamp a fixed PacBio read group onto the BAM (required downstream by GATK).
  case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
    @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
    this.input = List(inBam)
    this.output = outBam
    this.RGID = "1"
    this.RGCN = "BI"
    this.RGPL = "PacBio_RS"
    this.RGSM = sample
    this.RGLB = "default_library"
    this.RGPU = "default_pu"
    this.analysisName = queueLogDir + outBam + ".rg"
    this.jobName = queueLogDir + outBam + ".rg"
  }

  // Count base-quality covariates (BaseRecalibrator pass); optionally resets
  // input base qualities to the default (dbq) for reads without real ones.
  case class cov (inBam: File, outRecalFile: File, resetQuals: Boolean) extends BaseRecalibrator with CommandLineGATKArgs {
    if (resetQuals)
      this.DBQ = dbq
    this.knownSites :+= dbSNP
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate")
    this.input_file :+= inBam
    this.disable_indel_quals = true
    this.out = outRecalFile
    this.analysisName = queueLogDir + outRecalFile + ".covariates"
    this.jobName = queueLogDir + outRecalFile + ".covariates"
    this.scatterCount = threads
    this.read_filter :+= "BadCigar"
  }

  // Apply the recalibration table, writing the final (kept) recalibrated BAM.
  case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
    this.DBQ = dbq
    this.input_file :+= inBam
    this.BQSR = inRecalFile
    this.out = outBam
    this.isIntermediate = false   // final deliverable: do not clean up
    this.analysisName = queueLogDir + outBam + ".recalibration"
    this.jobName = queueLogDir + outBam + ".recalibration"
    this.read_filter :+= "BadCigar"
    this.scatterCount = threads
  }
}