diff --git a/scala/qscript/oneoffs/carneiro/downsampling.scala b/scala/qscript/oneoffs/carneiro/downsampling.scala new file mode 100644 index 000000000..fcb0a8dc9 --- /dev/null +++ b/scala/qscript/oneoffs/carneiro/downsampling.scala @@ -0,0 +1,133 @@ +package oneoffs.carneiro + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import scala.io.Source._ +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 3/17/11 + * Time: 11:29 AM + * To change this template use File | Settings | File Templates. + */ + + +class downsampling extends QScript { + + @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) + var GATKjar: File = _ + + @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + var input: File = _ + + @Input(doc="target intervals", shortName="t", required=true) + var targetIntervals: File = _ + + @Input(doc="bootstrap number", shortName="b", required=false) + var bootstrap: Int = 1 + + @Input(doc="downsampling step", shortName="ds", required=true) + var downsamplingStep: Double = _ + + @Input(doc="downsampling floor", shortName="df", required=false) + var downsamplingFloor: Double = 0.0 + + @Input(doc="downsampling ceiling", shortName="dc", required=false) + var downsamplingCeiling: Double = 1.0 + + @Input(doc="Reference fasta file", shortName="R", required=false) + var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + + @Input(doc="dbSNP file", shortName="D", required=false) + var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf") + + @Input(doc="project name", shortName="p", required=false) + var base: String = "prj" + + def countLines(file: File):Int = { + var count: Int = 0 + for (l <- fromFile(file).getLines) { + count = count + 1 + } + return count + } + + val queueLogDir: String = ".qlog/" + val outFile: String = "cov.out" + + def script = { + val nIntervals = math.min(200, countLines(targetIntervals)) + + var f: Double = downsamplingCeiling + var i: Int = 1 + while (f>=downsamplingFloor) { + var b: Int = bootstrap + while(b > 0) { + val file = swapExt(outFile, ".out", ".F" + i + "." + b + ".out") + add(cov(f, file)) + b = b - 1 + } + val snp_out = new File(base + ".F" + i + ".raw.vcf") + val filter_out = new File(base + ".F" + i + ".filtered.vcf") + val eval_out = new File(base + ".F" + i + ".eval") + + add( snps(f, snp_out, nIntervals), + filter(snp_out, filter_out), + eval(filter_out, eval_out)) + + f = f - downsamplingStep + i = i + 1 + } + } + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= targetIntervals + this.jarFile = GATKjar + this.reference_sequence = reference + this.memoryLimit = 4 + } + + case class cov (fraction: Double, outFile: File) extends Percent20xCoverage with CommandLineGATKArgs { + this.input_file :+= input + this.out = outFile + this.ndrs = true + this.downsample_to_fraction = fraction + this.jobName = queueLogDir + outFile + ".cov" + } + + case class snps (fraction: Double, outFile: File, nIntervals: Int) extends UnifiedGenotyper with CommandLineGATKArgs { + this.memoryLimit = 6 + this.downsample_to_coverage = 600 + this.genotype_likelihoods_model = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.SNP + this.input_file :+= input + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.downsample_to_fraction = fraction + this.scatterCount = nIntervals + this.out = outFile + this.analysisName = outFile + "_snps" + this.jobName = queueLogDir + outFile + } + + case class filter (inFile: File, outFile: File) extends VariantFiltration with CommandLineGATKArgs { + this.filterName ++= List("SNPSBFilter","SNPQDFilter","SNPHRunFilter") + this.filterExpression ++= List("\"SB>=0.10\"","\"QD<5.0\"","\"HRun>=4\"") + this.clusterWindowSize = 10 + this.clusterSize = 3 + this.variantVCF = inFile + this.out = outFile + this.analysisName = outFile + "_filter" + this.jobName = queueLogDir + outFile + } + + case class eval (inFile: File, outFile: File) extends VariantEval with CommandLineGATKArgs { + this.noST = true + this.noEV = true + this.evalModule ++= List("TiTvVariantEvaluator", "CountVariants", "ValidationReport") + this.stratificationModule ++= List("EvalRod", "CompRod", "Novelty") + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.rodBind :+= RodBind("eval", "VCF", inFile) + this.out = outFile + this.analysisName = outFile + "_VariantEval" + this.jobName = queueLogDir + outFile + } +}