Oneoff project to downsample, bootstrap and call snps to test sensitivity/specificity of downsampled coverage in WEX projects.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5713 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
deed7c47a1
commit
3868a7e778
|
|
@ -0,0 +1,133 @@
|
|||
package oneoffs.carneiro
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import scala.io.Source._
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 3/17/11
|
||||
* Time: 11:29 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
|
||||
class downsampling extends QScript {
|
||||
|
||||
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
|
||||
var GATKjar: File = _
|
||||
|
||||
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
|
||||
var input: File = _
|
||||
|
||||
@Input(doc="target intervals", shortName="t", required=true)
|
||||
var targetIntervals: File = _
|
||||
|
||||
@Input(doc="bootstrap number", shortName="b", required=false)
|
||||
var bootstrap: Int = 1
|
||||
|
||||
@Input(doc="downsampling step", shortName="ds", required=true)
|
||||
var downsamplingStep: Double = _
|
||||
|
||||
@Input(doc="downsampling floor", shortName="df", required=false)
|
||||
var downsamplingFloor: Double = 0.0
|
||||
|
||||
@Input(doc="downsampling ceiling", shortName="dc", required=false)
|
||||
var downsamplingCeiling: Double = 1.0
|
||||
|
||||
@Input(doc="Reference fasta file", shortName="R", required=false)
|
||||
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
|
||||
|
||||
@Input(doc="dbSNP file", shortName="D", required=false)
|
||||
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf")
|
||||
|
||||
@Input(doc="project name", shortName="p", required=false)
|
||||
var base: String = "prj"
|
||||
|
||||
def countLines(file: File):Int = {
|
||||
var count: Int = 0
|
||||
for (l <- fromFile(file).getLines) {
|
||||
count = count + 1
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
val queueLogDir: String = ".qlog/"
|
||||
val outFile: String = "cov.out"
|
||||
|
||||
def script = {
|
||||
val nIntervals = math.min(200, countLines(targetIntervals))
|
||||
|
||||
var f: Double = downsamplingCeiling
|
||||
var i: Int = 1
|
||||
while (f>=downsamplingFloor) {
|
||||
var b: Int = bootstrap
|
||||
while(b > 0) {
|
||||
val file = swapExt(outFile, ".out", ".F" + i + "." + b + ".out")
|
||||
add(cov(f, file))
|
||||
b = b - 1
|
||||
}
|
||||
val snp_out = new File(base + ".F" + i + ".raw.vcf")
|
||||
val filter_out = new File(base + ".F" + i + ".filtered.vcf")
|
||||
val eval_out = new File(base + ".F" + i + ".eval")
|
||||
|
||||
add( snps(f, snp_out, nIntervals),
|
||||
filter(snp_out, filter_out),
|
||||
eval(filter_out, eval_out))
|
||||
|
||||
f = f - downsamplingStep
|
||||
i = i + 1
|
||||
}
|
||||
}
|
||||
|
||||
trait CommandLineGATKArgs extends CommandLineGATK {
|
||||
this.intervals :+= targetIntervals
|
||||
this.jarFile = GATKjar
|
||||
this.reference_sequence = reference
|
||||
this.memoryLimit = 4
|
||||
}
|
||||
|
||||
case class cov (fraction: Double, outFile: File) extends Percent20xCoverage with CommandLineGATKArgs {
|
||||
this.input_file :+= input
|
||||
this.out = outFile
|
||||
this.ndrs = true
|
||||
this.downsample_to_fraction = fraction
|
||||
this.jobName = queueLogDir + outFile + ".cov"
|
||||
}
|
||||
|
||||
case class snps (fraction: Double, outFile: File, nIntervals: Int) extends UnifiedGenotyper with CommandLineGATKArgs {
|
||||
this.memoryLimit = 6
|
||||
this.downsample_to_coverage = 600
|
||||
this.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP
|
||||
this.input_file :+= input
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||
this.downsample_to_fraction = fraction
|
||||
this.scatterCount = nIntervals
|
||||
this.out = outFile
|
||||
this.analysisName = outFile + "_snps"
|
||||
this.jobName = queueLogDir + outFile
|
||||
}
|
||||
|
||||
case class filter (inFile: File, outFile: File) extends VariantFiltration with CommandLineGATKArgs {
|
||||
this.filterName ++= List("SNPSBFilter","SNPQDFilter","SNPHRunFilter")
|
||||
this.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"")
|
||||
this.clusterWindowSize = 10
|
||||
this.clusterSize = 3
|
||||
this.variantVCF = inFile
|
||||
this.out = outFile
|
||||
this.analysisName = outFile + "_filter"
|
||||
this.jobName = queueLogDir + outFile
|
||||
}
|
||||
|
||||
case class eval (inFile: File, outFile: File) extends VariantEval with CommandLineGATKArgs {
|
||||
this.noST = true
|
||||
this.noEV = true
|
||||
this.evalModule ++= List("TiTvVariantEvaluator", "CountVariants", "ValidationReport")
|
||||
this.stratificationModule ++= List("EvalRod", "CompRod", "Novelty")
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
|
||||
this.rodBind :+= RodBind("eval", "VCF", inFile)
|
||||
this.out = outFile
|
||||
this.analysisName = outFile + "_VariantEval"
|
||||
this.jobName = queueLogDir + outFile
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue