From cb0b2f9811d6a8ca65378c9af6251556783bf141 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 4 Nov 2010 19:57:27 +0000 Subject: [PATCH] My analysis script for private mutations. I'm committing it because it contains a number of specialized command line functions that could prove useful in the future. (For example: ConcatVCF and ExtractSample) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4626 348d0f76-0448-11de-a6fe-93d51630548a --- scala/qscript/chartl/private_mutations.q | 401 +++++++++++++++++++++++ 1 file changed, 401 insertions(+) create mode 100755 scala/qscript/chartl/private_mutations.q diff --git a/scala/qscript/chartl/private_mutations.q b/scala/qscript/chartl/private_mutations.q new file mode 100755 index 000000000..890bec937 --- /dev/null +++ b/scala/qscript/chartl/private_mutations.q @@ -0,0 +1,401 @@ +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils +import org.broadinstitute.sting.queue.extensions.gatk.{UnifiedGenotyper, RodBind, CombineVariants, SelectVariants} +import org.broadinstitute.sting.queue.QScript +import tools.nsc.io.File + +class private_mutations extends QScript { + pm_script => // alias for the script arguments + + val eomi_merged_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_merged.vcf") + val g1k_exome_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/g1k_exomes.vcf") + val ceu_pilot_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/CEU.low_coverage.genotypes.vcf") + + val SINGLE_SAMPLE_EXPRESSION = "\"PASS.*AC=1;|PASS.*AC=2;.*1/1|PASS.*AC=2.*1\\|1\"" + val NOVEL_VARIANT_EXPRESSION = "-v \"rs[0-9]\"" + val KNOWN_VARIANT_EXPRESSION = "\"rs[0-9].*PASS\"" + + val BASE_DIR = "/humgen/gsa-hphome1/chartl/projects/private_mutations/queue/" + val LIST_DIR = BASE_DIR + "lists/" + val VCF_DIR = BASE_DIR + "vcfs/" + val EVAL_DIR = BASE_DIR + "evals/" + val SCRATCH_DIR = BASE_DIR + "intermediate/" + + /* + * Steps of analysis. + * + * todo -- 1) Calculate number of EOMI calls top-down, cumulative include/exclude that are: + * - present in one sample + * - not in dbsnp 129 + * - not in 1000G pilot CEU + * - not in 1000G production EUR + * - not in 1000G production exome EUR + * + * todo -- 2) Distributional aspects of remaining mutations + * - Depth of Coverage + * - Functional Class + * + * todo -- 3) 1000G Exome v Lowpass -- Venn of single-sample variants + * - Venn numbers + * - Distribution of depth for exome variants missed in low-pass + * + * todo -- 4) Estimate relationship between depth and sensitivity to private variation + * + * todo -- 5) Calculate % of exome empowered to detect private variation + * + * todo -- 6) Identify loci across 1000G Ex and EOMI that are low-powered + * - venn of low-power targets + * - mapping the intersection + * - genes affected by low power + */ + + + /* + * Scripting commands go here + */ + def script = { + // setup analysis resources (best done in loop) + val g1k_lowpass_chr : List[File] = (1 to 22).toList.map( i => new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chr%d/ALL.august.chr%d.recal.vrcut.EUR.vcf".format(i,i))) + val eomi_annot_set : List[File] = (1 to 7).toList.map( i => new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_set%d.vcf".format(i))) + val g1k_lowpass_hg19_merged : File = new File(VCF_DIR+"g1k_lowpass_merged.hg19.sites.vcf") + val eomi_merged_hg19 : File = eomi_merged_calls + val ceu_hg19 : File = new File(VCF_DIR+"G1K_Pilot.CEU.hg19.vcf") + + combineAndMergeSites(g1k_lowpass_chr,g1k_lowpass_hg19_merged) + liftoverVCF(ceu_pilot_calls,ceu_hg19) + + + + // get EOMI ss-only list + var eomi_single_sample : Vcf2List = new Vcf2List + eomi_single_sample.in_vcf = eomi_merged_hg19 + eomi_single_sample.filter = SINGLE_SAMPLE_EXPRESSION + eomi_single_sample.out_list = new File(LIST_DIR+"EOMI_single_sample_variants.sites.list") + + add(eomi_single_sample) + + var eomi_dbsnp : Vcf2List = new Vcf2List + eomi_dbsnp.in_vcf = eomi_merged_hg19 + eomi_dbsnp.filter = KNOWN_VARIANT_EXPRESSION + eomi_dbsnp.out_list = new File(LIST_DIR+"EOMI_novel_variants.sites.list") + + add(eomi_dbsnp) + + var ceu_sites : Vcf2List = new Vcf2List + ceu_sites.in_vcf = ceu_hg19 + ceu_sites.out_list = new File(SCRATCH_DIR+"G1KP_CEU_variants.sites.list") + + add(ceu_sites) + + var g1k_lowpass_sites : Vcf2List = new Vcf2List + g1k_lowpass_sites.in_vcf = g1k_lowpass_hg19_merged + g1k_lowpass_sites.out_list = new File(SCRATCH_DIR+"G1K_lowpass_EUR.sites.list") + + add(g1k_lowpass_sites) + + var g1k_exome_sites : Vcf2List = new Vcf2List + g1k_exome_sites.in_vcf = g1k_exome_calls + g1k_exome_sites.out_list = new File(SCRATCH_DIR+"G1K_exomes.sites.list") + + add(g1k_exome_sites) + + var remove_dbsnp : RemoveIntersect = new RemoveIntersect + remove_dbsnp.fileToUnique = eomi_single_sample.out_list + remove_dbsnp.comparisonFile = eomi_dbsnp.out_list + remove_dbsnp.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.list") + + add(remove_dbsnp) + + var remove_ceu : RemoveIntersect = new RemoveIntersect + remove_ceu.fileToUnique = remove_dbsnp.outputFile + remove_ceu.comparisonFile = ceu_sites.out_list + remove_ceu.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.list") + + add(remove_ceu) + + var remove_lowpass : RemoveIntersect = new RemoveIntersect + remove_lowpass.fileToUnique = remove_ceu.outputFile + remove_lowpass.comparisonFile = g1k_lowpass_sites.out_list + remove_lowpass.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.list") + + add(remove_lowpass) + + var remove_exome : RemoveIntersect = new RemoveIntersect + remove_exome.fileToUnique = remove_lowpass.outputFile + remove_exome.comparisonFile = g1k_exome_sites.out_list + remove_exome.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.noexome.list") + + add(remove_exome) + + var subset : SelectVariants = new SelectVariants + subset.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + subset.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + subset.variantVCF = eomi_merged_hg19 + subset.out = new File(VCF_DIR+"EOMI_merged.hg19.private.vcf") + subset.intervals :+= remove_exome.outputFile + + add(subset) + + var varDepths : GetVariantDepths = new GetVariantDepths + varDepths.inVCF = subset.out + varDepths.outFile = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/results/EOMI.private.variant.depths.txt") + + add(varDepths) + + var lowpass_single : Vcf2List = new Vcf2List + lowpass_single.in_vcf = g1k_lowpass_hg19_merged + lowpass_single.filter = SINGLE_SAMPLE_EXPRESSION + lowpass_single.out_list = new File(LIST_DIR+"g1k_lowpass_single_sample.variants.list") + + add(lowpass_single) + + var exome_single : Vcf2List = new Vcf2List + exome_single.in_vcf = g1k_exome_calls + exome_single.filter = SINGLE_SAMPLE_EXPRESSION + exome_single.out_list = new File(LIST_DIR+"g1k_exome_single_sample.variants.list") + + var exome_extract : ExtractSites = new ExtractSites(g1k_exome_calls,new File(VCF_DIR+"g1k_exome.sites.vcf")) + add(exome_extract) + + var getVennExS : CombineVariants = new CombineVariants + getVennExS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged); + getVennExS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF); + getVennExS.priority = "exome,lowpass" + getVennExS.intervals :+= exome_single.out_list + getVennExS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + getVennExS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + getVennExS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) + getVennExS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION) + getVennExS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.exome.sites.vcf") + + add(getVennExS) + + var getVennLPS : CombineVariants = new CombineVariants + getVennLPS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged); + getVennLPS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF); + getVennLPS.priority = "exome,lowpass" + getVennLPS.intervals :+= lowpass_single.out_list + getVennLPS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + getVennLPS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + getVennLPS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY) + getVennLPS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION) + getVennLPS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.lowpass.sites.vcf") + + add(getVennLPS) + + var getG1KOverlap : GetOverlapSamples = new GetOverlapSamples + getG1KOverlap.vcf1 = g1k_exome_calls + getG1KOverlap.vcf2 = g1k_lowpass_chr(1) // EUR only + getG1KOverlap.samples = new File(LIST_DIR+"g1k_EUR_exome_lowpass_overlap.txt") + + add(getG1KOverlap) + + callOverlaps(getG1KOverlap.samples,exome_single.out_list, g1k_exome_calls) + + } + + /* + * defined modules go here + */ + + class Vcf2List extends CommandLineFunction { + @Input(doc="The vcf file to convert to list",required=true) + var in_vcf: File = _ + @Argument(doc="An egrep-based filter to apply during conversion",required=false) + var filter: String = "PASS" + @Output(doc="The list file to write to", required=true) + var out_list: File = _ + + def commandLine = { + "egrep %s %s | awk '{print $1\":\"$2}' > %s".format(filter,in_vcf.getAbsolutePath,out_list.getAbsolutePath) + } + } + + class GetOverlapSamples extends CommandLineFunction { + @Input(doc="vcf1") + var vcf1: File = _ + @Input(doc="vcf2") + var vcf2: File = _ + @Output(doc="sample file") + var samples: File = _ + + def commandLine = { + "head -n 500 %s %s | grep \\\\#CHR | cut -f10- | tr '\\t' '\\n' | sort | uniq -c | awk '{if ($1 == 2) print $2}' > %s".format( + vcf1.getAbsolutePath, + vcf2.getAbsolutePath, + samples.getAbsolutePath + ) + } + } + + class RemoveIntersect extends CommandLineFunction { + @Input(doc="The vcf file whose unique entries should be retained",required=true) + var fileToUnique: File = _ + @Input(doc="The vcf file whose entries overlapping the unique file you would like to remove",required=true) + var comparisonFile: File = _ + @Output(doc="The file containing only the fileToUnique-unique entries", required=true) + var outputFile: File = _ + + def commandLine = { + var tmpFile: File = java.io.File.createTempFile("removeintersect","tmp") + + "cat %s %s | sort | uniq -c | awk '{if ($1==2) print $2}' > %s ; cat %s %s | sort | uniq -c | awk '{if ($1 == 1) print $2}' > %s".format( + fileToUnique.getAbsolutePath,comparisonFile.getAbsolutePath, tmpFile.getAbsolutePath, + tmpFile.getAbsolutePath, fileToUnique.getAbsolutePath, outputFile.getAbsolutePath + ) + } + } + + class ExtractSites(inVCF: File, oVCF: File) extends CommandLineFunction { + @Input(doc="The vcf file to generate a sites only file from",required=true) + var inputVCF: File = inVCF + @Output(doc="The sites-only vcf to write to") + var outputVCF: File = oVCF + + def commandLine = { + "cut -f1-9 %s > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath) + } + } + + class ConcatVCF(in: List[File], out: File) extends CommandLineFunction { + @Input(doc="The files to concatenate",required=true) + var inputVCFs : List[File] = in + @Output(doc="The file to write to", required=true) + var outputVCF: File = out + + def commandLine = { + var header : File = java.io.File.createTempFile("concatVCF","header.tmp") + var body_unsorted : File = java.io.File.createTempFile("concatVCF","body.unsorted.tmp") + var body : File = java.io.File.createTempFile("concatVCF","body.tmp") + + "grep \\\\# %s > %s ; grep -v \\\\# %s | sed 's/.vcf:/\\t/g' | cut -f2- > %s ; perl %s -tmp . %s %s > %s ; cat %s %s > %s".format( + inputVCFs(0).getAbsolutePath, + header.getAbsolutePath, + inputVCFs.foldLeft[String]("")((x1: String, x2: File ) => x1 + " " + x2.getAbsolutePath ), + body_unsorted.getAbsolutePath, + "/humgen/gsa-scr1/chartl/sting/perl/sortByRef.pl", + body_unsorted.getAbsolutePath, + "/humgen/1kg/reference/human_g1k_v37.fasta.fai", + body.getAbsolutePath, + header.getAbsolutePath, + body.getAbsolutePath, + outputVCF.getAbsolutePath + ) + } + } + + class ExtractSample(vcfIn: File, sample: String, vcfOut: File) extends CommandLineFunction { + @Input(doc="File from which to extract sample") + var inVCF: File = vcfIn + @Argument(doc="The sample to extract") + var inSample: String = sample + @Input(doc="The VCF file to write to") + var outVCF: File = vcfOut + + def commandLine = { + "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | grep -n %s | tr ':' '\\t' | cut -f1 | xargs -i cut -f1-9,\\{\\} %s | egrep \"\\\\#|PASS.*0/1|PASS.*1/0|PASS.*1/1\" > %s".format( + inVCF.getAbsolutePath, inSample, inVCF.getAbsolutePath, outVCF.getAbsolutePath + ) + } + } + + class GetVariantDepths extends CommandLineFunction { + @Input(doc="The VCF to get the per-sample depth at variant calls") + var inVCF : File = _ + @Output(doc="The depth file to write to") + var outFile : File = _ + + def commandLine = { + "grep PASS %s | cut -f10- | tr '\\t' '\\n' | egrep \"1/0|0/1|1/1|1\\|0|0\\|1|1\\|1\" | tr ':' '\\t' | awk '{print $3}' > %s".format( + inVCF.getAbsolutePath, outFile.getAbsolutePath + ) + } + } + + def combineAndMergeSites(chrInputs: List[File], output: File) = { + var sites_only: List[ExtractSites] = chrInputs.map( f => new ExtractSites(f,new File(VCF_DIR+swapExt(f,".vcf",".sites.vcf").getName))) + for ( s <- sites_only ) { + add(s) + } + var trivialMerge : ConcatVCF = new ConcatVCF(sites_only.map( s => s.outputVCF ), output) + add(trivialMerge) + } + + def liftoverVCF(b36vcf: File, b37vcf: File) = { + + class Liftover(in: File, out: File) extends CommandLineFunction { + @Input(doc="foo") + val input: File = in + @Output(doc="foo") + val output : File = out + + val sting : String = "/humgen/gsa-scr1/chartl/sting/" + val chain : String = "/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/b36ToHg19.broad.over.chain" + + def commandLine = { + "%s/perl/liftOverVCF.pl -vcf %s -chain %s -out %s -gatk %s -newRef %s -oldRef %s -tmp .".format( + sting, input, chain, output, sting, "/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19", + "/humgen/1kg/reference/human_b36_both" + ) + } + + class RMDupes(in: File, out:File) extends CommandLineFunction { + @Input(doc="foo") + var input : File = in + @Output(doc="foo") + var output : File = out + + def commandLine = "cat %s | python /humgen/gsa-scr1/chartl/projects/omni/scripts/filterDupes.py > %s".format( + input.getAbsolutePath,output.getAbsolutePath + ) + } + + var lift : Liftover = new Liftover(b36vcf,swapExt(b37vcf,".vcf",".undeduped.vcf")) + + add(lift) + + var rmDupes : RMDupes = new RMDupes(lift.output,b37vcf) + + add(rmDupes) + } + } + + def callOverlaps(samples: File, intervals: File, wexCalls: File) = { + class GetBamList(sample: String, oList: File) extends CommandLineFunction { + @Argument(doc="foo") + var inSample = sample + @Output(doc="foo") + var outList = oList + + def commandLine = { + "grep %s /humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/*.list | tr ':' '\\t' | awk '{print $2}' > %s".format( + inSample, + outList.getAbsolutePath + ) + } + } + for ( s <- scala.io.Source.fromFile(samples).getLines ) { + var extract : ExtractSample = new ExtractSample(wexCalls,s,new File(VCF_DIR+"g1k_exome.subset.%s.vcf".format(s))) + add(extract) + + var sites : Vcf2List = new Vcf2List + sites.in_vcf = extract.outVCF + sites.out_list = new File(SCRATCH_DIR+"g1k_exome.subset.%s.variants.list".format(s)) + add(sites) + + var bamList : GetBamList = new GetBamList(s,new File(SCRATCH_DIR+"%s.bams.list".format(s))) + + add(bamList) + + var genotype : UnifiedGenotyper = new UnifiedGenotyper + genotype.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + genotype.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + genotype.intervals :+= sites.out_list + genotype.out = new File(VCF_DIR+"g1k_lowpass.%s.exome_sites.vcf".format(s)) + genotype.input_file :+= bamList.outList + + add(genotype) + + } + } + +} \ No newline at end of file