java changes:

VariantEvalWalker's logger is made public, so that variant eval modules can access it through the parent object.
 DesignFileGenerator comment lists how best to bind things to it, and the feature accessor is better refined to grab the genome loc. (old change)

scala changes:

convenience addAll( List[CommandLineFunction] ) added to QScript class (and thus removed from the fCPV2)
useful command line functions added to a new library package for command line functions (these are fast simple VCF command lines)
bug fixed in ProjectManagement for the class where there's only one batch to be batch-merged (not really part of the use-case, but an edge-condition that came up during pipeline testing)
first draft of a private mutations pipeline which will be elaborated in future



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4823 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-12-12 05:10:45 +00:00
parent abd6ce1c77
commit cf75caf653
8 changed files with 122 additions and 7 deletions

View File

@ -914,5 +914,5 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return eval.hasGenotypes() ? eval.getNSamples() : nSamples;
}
protected Logger getLogger() { return logger; }
public Logger getLogger() { return logger; }
}

View File

@ -19,7 +19,7 @@ import java.io.PrintStream;
* Takes an interval list and annotates intervals with genes and exons falling within that interval
* Was written in order to annotate the Whole Exome Agilent designs at the Broad institute
* Bind the refseq rod as -B refseq,refseq,/path/to/refGene.txt
* Bind the interval list as -B interval_list,intervals,/path/to/intervals.interval_list
* Bind the interval list as -B interval_list,bed,/path/to/intervals.interval_list
* Bind the additional files file as -B gene*,bed,/path/to/other/file.bed
* @Author chartl
* @Date Apr 26, 2010
@ -39,7 +39,7 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
return null;
}
List<Object> intervalsList= tracker.getReferenceMetaData("interval_list");
List<GATKFeature> intervalsList= tracker.getGATKFeatureMetaData("interval_list",false);
List<Object> refseqList = tracker.getReferenceMetaData("refseq");
List<GATKFeature> bedList = tracker.getGATKFeatureMetaData("gene",false);

View File

@ -61,6 +61,4 @@ class fullCallingPipelineV2 extends QScript {
addAll(lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,fcp.pipelineArgs.target_titv,fcp.pipelineArgs.refseqTable))
}
def addAll(clfs: List[CommandLineFunction]) = { clfs.foreach(c => add(c)) }
}

View File

@ -0,0 +1,57 @@
import collection.JavaConversions._
import java.io.FileNotFoundException
import org.broadinstitute.sting.datasources.pipeline._
import org.broadinstitute.sting.queue.extensions.gatk.{VariantFiltration, UnifiedGenotyper}
import org.broadinstitute.sting.queue.library.clf.vcf._
import org.broadinstitute.sting.queue.pipeline._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.utils.yaml.YamlUtils
class private_mutations extends QScript {
@Argument(shortName="yaml",fullName="eomiYaml",doc="Project YAML file",required=true) var eomiYaml: File = _
@Argument(shortName="sting",fullName="stingDir",doc="path to the Sting directory",required=true) var sting: String = _
@Argument(shortName="out",fullName="finalVCF",doc="the merged vcf to write to", required=true) var finalMergedVCF : File = _
var gatkjar : File = _
def script = {
gatkjar = new File(sting+"/dist/GenomeAnalysisTK.jar")
var input_pipeline : Pipeline = YamlUtils.load(classOf[Pipeline],eomiYaml)
var eomi_pipeline : Pipeline = new Pipeline
// use only QC-positive samples
eomi_pipeline.setProject( input_pipeline.getProject )
eomi_pipeline.setSamples( input_pipeline.getSamples.filter( p => p.getTags.get("QCStatus").equals("PASS")) )
var vcLib : VariantCalling = new VariantCalling(eomi_pipeline,gatkjar)
var pmLib : ProjectManagement = new ProjectManagement(sting)
/*eomi_pipeline.getSamples.foreach( p =>
if ( ! p.getBamFiles.get("recalibrated").exists) throw new FileNotFoundException(
p.getBamFiles.get("recalibrated").getAbsolutePath+" does not exist" ))*/
var batches : List[List[PipelineSample]] = eomi_pipeline.getSamples.toList.grouped(100).toList
var genotypers : List[UnifiedGenotyper] = batches.map( pl => pl.map( p => p.getBamFiles.get("recalibrated") ) ).zipWithIndex.map(
b => vcLib.StandardUnifiedGenotyper(b._1,new File(eomi_pipeline.getProject.getName+"_batch%d.raw.vcf".format(1+b._2))))
addAll(genotypers)
var handFilters : List[VariantFiltration] = genotypers.map( g => vcLib.StandardHandfilter(g.out,swapExt(g.out,".raw.vcf",".handfiltered.vcf")))
addAll(handFilters)
addAll(pmLib.MergeBatches(handFilters.map( _.out), batches.flatten.map( p => p.getBamFiles.get("recalibrated")),
finalMergedVCF,eomi_pipeline.getProject.getReferenceFile,20))
var afr_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("AFRAMR"))
var eur_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("EURAMR") ||
p.getTags.get("Population").equals("UNK"))
var variant_loci : VCFExtractIntervals = new VCFExtractIntervals(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".intervals.list"),false)
add(variant_loci)
var extract_afr : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".afr.vcf"),afr_sams.map(_.getId))
var extract_eur : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".eur+unk.vcf"),eur_sams.map(_.getId))
add(extract_afr)
add(extract_eur)
}
}

View File

@ -69,6 +69,10 @@ trait QScript extends Logging {
functions.foreach(function => function.addOrder = QScript.nextAddOrder)
this.functions ++= functions
}
def addAll(functions: List[QFunction] ) = {
functions.foreach( f => add(f) )
}
}
object QScript {

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.queue.library.clf.vcf
import java.io.File
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
import org.broadinstitute.sting.queue.function.CommandLineFunction
class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extends CommandLineFunction {
def this(vcf: File, list: File ) = this(vcf,list,false)
def this(vcf: File) = this(vcf, new File(vcf.getAbsolutePath.replace("vcf","intervals.list")),false)
def this(vcf: File, removeFilters: Boolean) = this(vcf, new File(vcf.getAbsolutePath.replace("vcf","intervals.list")), removeFilters)
@Input(doc="The VCF from which to extract an interval list") var inputVCF : File = inVCF
@Output(doc="The file to write the interval list to") var outputList : File = outList
@Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly
def commandLine = {
if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
}
}

View File

@ -0,0 +1,33 @@
package org.broadinstitute.sting.queue.library.clf.vcf
import java.io.File
import collection.JavaConversions._
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.text.XReadLines
class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends CommandLineFunction {
@Input(doc="input VCF from which to extract samples") var inputVCF : File = inVCF
@Output(doc="output VCF to write extracted samples to") var outputVCF : File = outVCF
@Argument(doc="List of samples to extract from the VCF") var sampleList : List[String] = samples
var sampleGrep : String = _
def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList)
override def freezeFieldValues = {
this.logger.warn("Note: Using VCFExtractSamples invalidates AC/AF/AN annotations. This is an explicit warning.")
sampleGrep = "\"" + sampleList.foldLeft[String](sampleList.get(0))( (a,b) => a + "|" + b) + "\""
super.freezeFieldValues
}
def commandLine = {
var first : String = "head -n 500 %s | grep \\\\#\\\\# > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
var second : String = "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | awk '{print NF\"\\t\"$1}' ".format(inputVCF)
second += "| egrep %s | awk '{print $1}' | tr '\\n' ',' | xargs -i cut -f1-9,\\{\\} %s | grep -v \\\\#\\\\# >> %s".format(sampleGrep,inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
first+" ; "+second
}
}

View File

@ -16,6 +16,8 @@ class ProjectManagement(stingPath: String) {
var stingDirPath : String = stingPath
def this(f : File) = this(f.getAbsolutePath)
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
@Output(doc="The file to write the site list to") var out_vcf = out_list
@ -24,7 +26,7 @@ class ProjectManagement(stingPath: String) {
def commandLine = {
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
vcf_files(1).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
)
}
}
@ -50,7 +52,7 @@ class ProjectManagement(stingPath: String) {
cmds :+= ints
var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("batch%d.likelihoods.vcf".format(u._2))))
var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2))))
cmds ++= calcs