114 lines
5.9 KiB
Plaintext
Executable File
114 lines
5.9 KiB
Plaintext
Executable File
import java.io.{FileReader, File, BufferedReader}
|
|
import net.sf.picard.reference.FastaSequenceFile
|
|
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
|
import org.broadinstitute.sting.gatk.DownsampleType
|
|
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
|
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
|
import org.broadinstitute.sting.queue.extensions.samtools._
|
|
import org.broadinstitute.sting.queue.{QException, QScript}
|
|
import collection.JavaConversions._
|
|
import org.broadinstitute.sting.utils.yaml.YamlUtils
|
|
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
|
|
|
|
class omni_qc extends QScript {
|
|
qscript =>
|
|
|
|
// NON-OMNI VCF FILES
|
|
var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf")
|
|
var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
|
var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
|
var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf")
|
|
var august_calls_vcf = null
|
|
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
|
|
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
|
|
|
|
// OMNI VCF FILES
|
|
var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf")
|
|
var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf")
|
|
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf")
|
|
|
|
// INTERVALS
|
|
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
|
|
var pilot1_interval_list: String = _
|
|
var hiseq_interval_list: String = _
|
|
var production_interval_list: String = "20"
|
|
|
|
// OTHER
|
|
var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt")
|
|
|
|
trait OmniArgs extends CommandLineGATK {
|
|
this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta")
|
|
this.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
|
|
}
|
|
|
|
protected def sampleFileToString(samFile: File) : List[String] = {
|
|
var reader = new BufferedReader(new FileReader(samFile))
|
|
var line: String = ""
|
|
var sampleList: List[String] = Nil
|
|
line = reader.readLine
|
|
while ( line != null ) {
|
|
sampleList :+= line
|
|
line = reader.readLine
|
|
}
|
|
|
|
return sampleList
|
|
}
|
|
|
|
class vcf2bed(b: String) extends CommandLineFunction {
|
|
@Input(doc="A VCF file to be put into an interval list") var in_vcf: File = _
|
|
@Output(doc="An interval list file to be used with -L") var out_list: File = _
|
|
|
|
def commandLine = "python /humgen/gsa-scr1/chartl/projects/omni/scripts/vcf2bed.py %s %s".format(in_vcf.getAbsolutePath,out_list.getAbsolutePath)
|
|
}
|
|
|
|
protected def runme(samples: File, intervals: String, compVCF: TaggedFile, outDir: String, base: String) {
|
|
var subset = new SelectVariants with OmniArgs
|
|
subset.variantVCF = Omni_chip_vcf
|
|
subset.analysisName = "Subset_"+base
|
|
subset.sample = sampleFileToString(samples)
|
|
if ( intervals != null ) {
|
|
subset.intervalsString :+= intervals
|
|
}
|
|
subset.out = new TaggedFile(outDir+base+".subset.vcf","vcf")
|
|
|
|
var makeOmniSiteList = new vcf2bed("foo")
|
|
makeOmniSiteList.analysisName = "Omni_list_"+base
|
|
makeOmniSiteList.in_vcf = subset.out
|
|
makeOmniSiteList.out_list = new TaggedFile(outDir+base+".subset.omni.intervals.list","intervals.list")
|
|
|
|
var eval = new VariantEval with OmniArgs
|
|
eval.rodBind :+= new RodBind("evalOmni","vcf",subset.out)
|
|
eval.rodBind :+= new RodBind("comp"+base,"vcf",compVCF)
|
|
eval.intervals = makeOmniSiteList.out_list
|
|
eval.evalModule :+= "GenotypeConcordance"
|
|
eval.evalModule :+= "SimpleMetricsBySample"
|
|
eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval")
|
|
eval.reportType = Some(VE2TemplateType.R)
|
|
eval.analysisName = base+"_Eval"
|
|
eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5")
|
|
eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+")
|
|
eval.disI = true
|
|
eval.outputVCF = outDir+base+".eval.interesting.sites.vcf"
|
|
|
|
if ( base.equalsIgnoreCase("production") ) {
|
|
subset.variantVCF = Omni_b37_vcf
|
|
subset.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta")
|
|
eval.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_g1k_v37.fasta")
|
|
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
|
|
}
|
|
|
|
|
|
|
|
add(subset,makeOmniSiteList,eval)
|
|
}
|
|
|
|
def script = {
|
|
runme(pilot1_overlap_samples,pilot1_interval_list,pilot1_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot1/","pilot1")
|
|
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3")
|
|
runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq")
|
|
runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production")
|
|
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release")
|
|
}
|
|
}
|