BatchMerge now uses the correct UG settings, recently added by Eric

ExpandIntervals now checks that identical intervals are not created by (un)fortunately-spaced targets
VCFExtractIntervals no longer creates duplicate intervals in the case where a VCF has multiple entries at the same site



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5294 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-02-23 18:46:15 +00:00
parent 9357bee921
commit 07d381ec51
3 changed files with 52 additions and 7 deletions

View File

@ -1,6 +1,7 @@
import java.io.{FileReader, BufferedReader}
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.walkers.genotyper.{GenotypeLikelihoodsCalculationModel, UnifiedGenotyperEngine}
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFSimpleMerge, VCFExtractSites,VCFExtractIntervals}
import org.broadinstitute.sting.queue.pipeline.{ProjectManagement, BamProcessing, VariantCalling}
@ -30,7 +31,7 @@ class batchMergePipeline extends QScript {
var bams : List[File] = extractFileEntries(bamList)
trait ExtractArgs extends VCFExtractSites {
this.keepFilters = true
this.keepFilters = false
this.keepInfo = false
this.keepQual = false
}
@ -55,10 +56,11 @@ class batchMergePipeline extends QScript {
}
this.intervals :+= extractIntervals.listOut
this.alleleVCF = combineVCFs.outVCF
this.output_all_callable_bases = true
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
this.memoryLimit = Some(4)
this.scatterCount = 60
this.output_mode = Some(UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES)
this.genotyping_mode = Some(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)
}
def newUGCL( bams: (List[File],Int) ) : UGCalcLikelihoods = {
@ -72,18 +74,33 @@ class batchMergePipeline extends QScript {
addAll(calcs)
trait CallVariantsArgs extends UGCallVariants {
this.output_all_callable_bases = true
this.reference_sequence = batchMerge.ref
this.intervals :+= extractIntervals.listOut
this.jarFile = new File(stingDir+"/dist/GenomeAnalysisTK.jar")
this.scatterCount = 30
this.memoryLimit = Some(8)
this.output_mode = Some(UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES)
this.genotyping_mode = Some(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)
}
var cVars : UGCallVariants = new UGCallVariants with CallVariantsArgs
cVars.rodBind ++= calcs.map( a => new RodBind("variant"+a.out.getName.replace(".vcf",""),"vcf",a.out) )
cVars.out = batchOut
add(cVars)
trait CombineVariantsArgs extends CombineVariants {
this.reference_sequence = batchMerge.ref
this.intervals :+= extractIntervals.listOut
this.jarFile = new File(batchMerge.stingDir+"/dist/GenomeAnalysisTK.jar")
this.scatterCount = 10
this.memoryLimit=Some(4)
}
var combine : CombineVariants = new CombineVariants with CombineVariantsArgs
combine.out = swapExt(batchOut,".vcf",".variant.combined.vcf")
combine.rodBind ++= vcfs.map( u => new RodBind(u.getName,"vcf",u) )
add(combine)
}
def extractFileEntries(in: File): List[File] = {

View File

@ -68,7 +68,7 @@ class ExpandIntervals(in : File, start: Int, size: Int, out: File, ref: File, ip
def ok( loc : GenomeLoc, prev: GenomeLoc, next: GenomeLoc ) : Boolean = {
//System.out.println("%s - %s - %s".format(repr(next),repr(loc),repr(previous)))
( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start)
( next == null || loc.distance(next) >= start) && (prev == null || loc.distance(prev) >= start) && (!loc.equals(prev) && !loc.equals(next))
}
def repr(loc : GenomeLoc) : String = {

View File

@ -18,14 +18,42 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e
def run = {
out = new PrintWriter(new PrintStream(listOut))
asScalaIterator(new XReadLines(vcfIn)).foreach(vcf2int)
var elems = asScalaIterator(new XReadLines(vcfIn)).map(vcf2int).filter(p => !p.equals(""))
var prev : String = null
if ( elems.hasNext ) {
prev = elems.next
}
var cur : String = null
if ( elems.hasNext ) {
cur = elems.next
} else {
out.printf("%s%n",prev)
}
while ( elems.hasNext ) {
out.printf("%s%n",prev)
while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) {
cur = elems.next
}
if ( ! cur.equals(prev) ) {
if ( elems.hasNext ) {
prev = cur
cur = elems.next
} else {
out.printf("%s%n",cur)
}
}
}
out.close
}
def vcf2int( vcfLine: String ) : Unit = {
def vcf2int( vcfLine: String ) : String = {
var spline = vcfLine.split("\t")
if ( ! vcfLine.startsWith("#") && (spline(6).equals("PASS") || spline(6).equals(".") || keepFilters) ) {
out.print("%s:%s%n".format(spline(0),spline(1)))
return("%s:%s".format(spline(0),spline(1)))
} else {
return ""
}
}