With multi-sample genotyping must come scatter+gather. Also Khalid informed me of the .grouped(size) method, so removing my useless (but pretty) code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4797 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f4c43f013f
commit
02de9a9764
|
|
@ -17,13 +17,13 @@ class ProjectManagement(stingPath: String) {
|
||||||
|
|
||||||
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
|
class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
|
||||||
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
|
@Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
|
||||||
@Output(doc="The file to write the site list to") var out_intervals = out_list
|
@Output(doc="The file to write the site list to") var out_vcf = out_list
|
||||||
@Argument(doc="Path to the SortByRef script") var sortByRef: String = _
|
@Argument(doc="Path to the SortByRef script") var sortByRef: String = _
|
||||||
@Argument(doc="Path to the reference file on disk") var ref: File = _
|
@Argument(doc="Path to the reference file on disk") var ref: File = _
|
||||||
|
|
||||||
def commandLine = {
|
def commandLine = {
|
||||||
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
|
"egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
|
||||||
vcf_files(1).getAbsolutePath, out_list.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_list.getAbsolutePath
|
vcf_files(1).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -36,20 +36,16 @@ class ProjectManagement(stingPath: String) {
|
||||||
|
|
||||||
cmds :+= pfSites
|
cmds :+= pfSites
|
||||||
|
|
||||||
var calcs: List[UGCalcLikelihoods] = batchLikelihoods(allBams,ref,pfSites.out_intervals,size)
|
var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,pfSites.out_vcf, new File("batch%d.likelihoods.vcf".format(u._2))))
|
||||||
|
|
||||||
cmds ++= calcs
|
cmds ++= calcs
|
||||||
|
|
||||||
cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, pfSites.out_intervals, mergedVCF)
|
cmds :+= VariantCallMerge(calcs.map( a => a.out), ref, pfSites.out_vcf, mergedVCF)
|
||||||
|
|
||||||
return cmds
|
return cmds
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def batchLikelihoods(bams: List[File], ref: File, alleleVCF: File, size: Int) : List[UGCalcLikelihoods] = {
|
|
||||||
return CollectionUtils.segmentBySize(bams,size).zipWithIndex.map( u => LikelihoodCalc(u._1,ref,alleleVCF, new File("batch%d.likelihoods.vcf".format(u._2))))
|
|
||||||
}
|
|
||||||
|
|
||||||
def LikelihoodCalc( bams: List[File], ref: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = {
|
def LikelihoodCalc( bams: List[File], ref: File, alleleVCF: File, outVCF: File ) : UGCalcLikelihoods = {
|
||||||
var calc = new UGCalcLikelihoods
|
var calc = new UGCalcLikelihoods
|
||||||
calc.input_file ++= bams
|
calc.input_file ++= bams
|
||||||
|
|
@ -57,6 +53,7 @@ class ProjectManagement(stingPath: String) {
|
||||||
calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
|
calc.jarFile = new File(pm.stingDirPath+"dist/GenomeAnalysisTK.jar")
|
||||||
calc.downsample_to_coverage = Some(300)
|
calc.downsample_to_coverage = Some(300)
|
||||||
calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6)
|
calc.memoryLimit = if ( bams.size < 5 ) Some(2) else if(bams.size<50) Some(4) else Some(6)
|
||||||
|
calc.scatterCount = if (bams.size < 5 ) 1 else if (bams.size < 50) 10 else 50
|
||||||
calc.min_base_quality_score = Some(22)
|
calc.min_base_quality_score = Some(22)
|
||||||
calc.min_mapping_quality_score = Some(20)
|
calc.min_mapping_quality_score = Some(20)
|
||||||
calc.genotype = true
|
calc.genotype = true
|
||||||
|
|
|
||||||
|
|
@ -88,15 +88,4 @@ object CollectionUtils {
|
||||||
}
|
}
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Takes a List, returns a list of Lists; which are direct sub-lists of the input of a specific constant size
|
|
||||||
* (except perhaps the final element, which can be smaller)
|
|
||||||
* @param value -- The list to be batched
|
|
||||||
* @param size -- the sublist size
|
|
||||||
* @return the list batched into smaller lists of size N
|
|
||||||
*/
|
|
||||||
def segmentBySize[T](value: List[T], size: Int) : List[List[T]] = {
|
|
||||||
return if(value.size == 0) Nil else List(value.splitAt(size)._1) ++ segmentBySize(value.drop(size),size)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue