gatk-3.8/scala/qscript/oneoffs/chartl/old/private_mutations_old.q

408 lines
16 KiB
Plaintext
Raw Normal View History

import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
import org.broadinstitute.sting.queue.extensions.gatk.{UnifiedGenotyper, RodBind, CombineVariants, SelectVariants}
import org.broadinstitute.sting.queue.QScript
import tools.nsc.io.File
class private_mutations_old extends QScript {
pm_script => // alias for the script arguments
val eomi_merged_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_merged.vcf")
val g1k_exome_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/g1k_exomes.vcf")
val ceu_pilot_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/CEU.low_coverage.genotypes.vcf")
val SINGLE_SAMPLE_EXPRESSION = "\"PASS.*AC=1;|PASS.*AC=2;.*1/1|PASS.*AC=2.*1\\|1\""
val NOVEL_VARIANT_EXPRESSION = "-v \"rs[0-9]\""
val KNOWN_VARIANT_EXPRESSION = "\"rs[0-9].*PASS\""
val BASE_DIR = "/humgen/gsa-hphome1/chartl/projects/private_mutations/queue/"
val LIST_DIR = BASE_DIR + "lists/"
val VCF_DIR = BASE_DIR + "vcfs/"
val EVAL_DIR = BASE_DIR + "evals/"
val SCRATCH_DIR = BASE_DIR + "intermediate/"
/*
* Steps of analysis.
*
* todo -- 1) Calculate number of EOMI calls top-down, cumulative include/exclude that are:
* - present in one sample
* - not in dbsnp 129
* - not in 1000G pilot CEU
* - not in 1000G production EUR
* - not in 1000G production exome EUR
*
* todo -- 2) Distributional aspects of remaining mutations
* - Depth of Coverage
* - Functional Class
*
* todo -- 3) 1000G Exome v Lowpass -- Venn of single-sample variants
* - Venn numbers
* - Distribution of depth for exome variants missed in low-pass
*
* todo -- 4) Estimate relationship between depth and sensitivity to private variation
*
* todo -- 5) Calculate % of exome empowered to detect private variation
*
* todo -- 6) Identify loci across 1000G Ex and EOMI that are low-powered
* - venn of low-power targets
* - mapping the intersection
* - genes affected by low power
*/
/*
* Scripting commands go here
*/
def script = {
// setup analysis resources (best done in loop)
val g1k_lowpass_chr : List[File] = (1 to 22).toList.map( i => new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chr%d/ALL.august.chr%d.recal.vrcut.EUR.vcf".format(i,i)))
val eomi_annot_set : List[File] = (1 to 7).toList.map( i => new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_set%d.vcf".format(i)))
val g1k_lowpass_hg19_merged : File = new File(VCF_DIR+"g1k_lowpass_merged.hg19.sites.vcf")
val eomi_merged_hg19 : File = eomi_merged_calls
val ceu_hg19 : File = new File(VCF_DIR+"G1K_Pilot.CEU.hg19.vcf")
combineAndMergeSites(g1k_lowpass_chr,g1k_lowpass_hg19_merged)
liftoverVCF(ceu_pilot_calls,ceu_hg19)
// get EOMI ss-only list
var eomi_single_sample : Vcf2List = new Vcf2List
eomi_single_sample.in_vcf = eomi_merged_hg19
eomi_single_sample.filter = SINGLE_SAMPLE_EXPRESSION
eomi_single_sample.out_list = new File(LIST_DIR+"EOMI_single_sample_variants.sites.list")
add(eomi_single_sample)
var eomi_dbsnp : Vcf2List = new Vcf2List
eomi_dbsnp.in_vcf = eomi_merged_hg19
eomi_dbsnp.filter = KNOWN_VARIANT_EXPRESSION
eomi_dbsnp.out_list = new File(LIST_DIR+"EOMI_novel_variants.sites.list")
add(eomi_dbsnp)
var ceu_sites : Vcf2List = new Vcf2List
ceu_sites.in_vcf = ceu_hg19
ceu_sites.out_list = new File(SCRATCH_DIR+"G1KP_CEU_variants.sites.list")
add(ceu_sites)
var g1k_lowpass_sites : Vcf2List = new Vcf2List
g1k_lowpass_sites.in_vcf = g1k_lowpass_hg19_merged
g1k_lowpass_sites.out_list = new File(SCRATCH_DIR+"G1K_lowpass_EUR.sites.list")
add(g1k_lowpass_sites)
var g1k_exome_sites : Vcf2List = new Vcf2List
g1k_exome_sites.in_vcf = g1k_exome_calls
g1k_exome_sites.out_list = new File(SCRATCH_DIR+"G1K_exomes.sites.list")
add(g1k_exome_sites)
var remove_dbsnp : RemoveIntersect = new RemoveIntersect
remove_dbsnp.fileToUnique = eomi_single_sample.out_list
remove_dbsnp.comparisonFile = eomi_dbsnp.out_list
remove_dbsnp.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.list")
add(remove_dbsnp)
var remove_ceu : RemoveIntersect = new RemoveIntersect
remove_ceu.fileToUnique = remove_dbsnp.outputFile
remove_ceu.comparisonFile = ceu_sites.out_list
remove_ceu.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.list")
add(remove_ceu)
var remove_lowpass : RemoveIntersect = new RemoveIntersect
remove_lowpass.fileToUnique = remove_ceu.outputFile
remove_lowpass.comparisonFile = g1k_lowpass_sites.out_list
remove_lowpass.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.list")
add(remove_lowpass)
var remove_exome : RemoveIntersect = new RemoveIntersect
remove_exome.fileToUnique = remove_lowpass.outputFile
remove_exome.comparisonFile = g1k_exome_sites.out_list
remove_exome.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.noexome.list")
add(remove_exome)
var subset : SelectVariants = new SelectVariants
subset.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
subset.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
subset.variantVCF = eomi_merged_hg19
subset.out = new File(VCF_DIR+"EOMI_merged.hg19.private.vcf")
subset.intervals :+= remove_exome.outputFile
add(subset)
var varDepths : GetVariantDepths = new GetVariantDepths
varDepths.inVCF = subset.out
varDepths.outFile = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/results/EOMI.private.variant.depths.txt")
add(varDepths)
var lowpass_single : Vcf2List = new Vcf2List
lowpass_single.in_vcf = g1k_lowpass_hg19_merged
lowpass_single.filter = SINGLE_SAMPLE_EXPRESSION
lowpass_single.out_list = new File(LIST_DIR+"g1k_lowpass_single_sample.variants.list")
add(lowpass_single)
var exome_single : Vcf2List = new Vcf2List
exome_single.in_vcf = g1k_exome_calls
exome_single.filter = SINGLE_SAMPLE_EXPRESSION
exome_single.out_list = new File(LIST_DIR+"g1k_exome_single_sample.variants.list")
add(exome_single)
var exome_extract : ExtractSites = new ExtractSites(g1k_exome_calls,new File(VCF_DIR+"g1k_exome.sites.vcf"))
add(exome_extract)
var getVennExS : CombineVariants = new CombineVariants
getVennExS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);
getVennExS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);
getVennExS.priority = "exome,lowpass"
getVennExS.intervals :+= exome_single.out_list
getVennExS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
getVennExS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
Walkers can now specify a class extending from Gatherer to merge custom output formats. Add @Gather(MyGatherer.class) to the walker @Output. JavaCommandLineFunctions can now specify the classpath+mainclass as an alternative to specifying a path to an executable jar. JCLF by default pass on the current classpath and only require the mainclass be specified by the developer extending the JCLF, relieving the QScript author from having to explicitly specify the jar. Like the Picard MergeSamFiles, GATK engine by default is now run from the current classpath. The GATK can still be overridden via .jarFile or .javaClasspath. Walkers from the GATK package are now also embedded into the Queue package. Updated AnalyzeCovariates to make it easier to guess the main class, AnalyzeCovariates instead of AnalyzeCovariatesCLP. Removed the GATK jar argument from the example QScripts. Removed one of the most FAQ when getting started with Scala/Queue, the use of Option[_] in QScripts: 1) Fixed mistaken assumption with java enums. In java enums can be null so they don't need nullable wrappers. 2) Added syntactic sugar for Nullable primitives to the QScript trait. Any variable defined as Option[Int] can just be assigned an Int value or None, ex: myFunc.memoryLimit = 3 Removed other unused code. Re-fixed dry run function ordering. Re-ordered the QCommandline companion object so that IntelliJ doesn't complain about missing main methods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5504 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-24 22:03:51 +08:00
getVennExS.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.UNIQUIFY
getVennExS.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION
getVennExS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.exome.sites.vcf")
//add(getVennExS)
var getVennLPS : CombineVariants = new CombineVariants
getVennLPS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);
getVennLPS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);
getVennLPS.priority = "exome,lowpass"
getVennLPS.intervals :+= lowpass_single.out_list
getVennLPS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
getVennLPS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
Walkers can now specify a class extending from Gatherer to merge custom output formats. Add @Gather(MyGatherer.class) to the walker @Output. JavaCommandLineFunctions can now specify the classpath+mainclass as an alternative to specifying a path to an executable jar. JCLF by default pass on the current classpath and only require the mainclass be specified by the developer extending the JCLF, relieving the QScript author from having to explicitly specify the jar. Like the Picard MergeSamFiles, GATK engine by default is now run from the current classpath. The GATK can still be overridden via .jarFile or .javaClasspath. Walkers from the GATK package are now also embedded into the Queue package. Updated AnalyzeCovariates to make it easier to guess the main class, AnalyzeCovariates instead of AnalyzeCovariatesCLP. Removed the GATK jar argument from the example QScripts. Removed one of the most FAQ when getting started with Scala/Queue, the use of Option[_] in QScripts: 1) Fixed mistaken assumption with java enums. In java enums can be null so they don't need nullable wrappers. 2) Added syntactic sugar for Nullable primitives to the QScript trait. Any variable defined as Option[Int] can just be assigned an Int value or None, ex: myFunc.memoryLimit = 3 Removed other unused code. Re-fixed dry run function ordering. Re-ordered the QCommandline companion object so that IntelliJ doesn't complain about missing main methods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5504 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-24 22:03:51 +08:00
getVennLPS.genotypeMergeOptions = VariantContextUtils.GenotypeMergeType.UNIQUIFY
getVennLPS.variantMergeOptions = VariantContextUtils.VariantMergeType.UNION
getVennLPS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.lowpass.sites.vcf")
add(getVennLPS)
var getG1KOverlap : GetOverlapSamples = new GetOverlapSamples
getG1KOverlap.vcf1 = g1k_exome_calls
getG1KOverlap.vcf2 = g1k_lowpass_chr(1) // EUR only
getG1KOverlap.samples = new File(LIST_DIR+"g1k_EUR_exome_lowpass_overlap.txt")
add(getG1KOverlap)
//callOverlaps(getG1KOverlap.samples,exome_single.out_list, g1k_exome_calls)
}
/*
* defined modules go here
*/
class Vcf2List extends CommandLineFunction {
@Input(doc="The vcf file to convert to list",required=true)
var in_vcf: File = _
@Argument(doc="An egrep-based filter to apply during conversion",required=false)
var filter: String = "PASS"
@Output(doc="The list file to write to", required=true)
var out_list: File = _
def commandLine = {
"egrep %s %s | awk '{print $1\":\"$2}' > %s".format(filter,in_vcf.getAbsolutePath,out_list.getAbsolutePath)
}
}
class GetOverlapSamples extends CommandLineFunction {
@Input(doc="vcf1")
var vcf1: File = _
@Input(doc="vcf2")
var vcf2: File = _
@Output(doc="sample file")
var samples: File = _
def commandLine = {
"head -n 500 %s %s | grep \\\\#CHR | cut -f10- | tr '\\t' '\\n' | sort | uniq -c | awk '{if ($1 == 2) print $2}' > %s".format(
vcf1.getAbsolutePath,
vcf2.getAbsolutePath,
samples.getAbsolutePath
)
}
}
class RemoveIntersect extends CommandLineFunction {
@Input(doc="The vcf file whose unique entries should be retained",required=true)
var fileToUnique: File = _
@Input(doc="The vcf file whose entries overlapping the unique file you would like to remove",required=true)
var comparisonFile: File = _
@Output(doc="The file containing only the fileToUnique-unique entries", required=true)
var outputFile: File = _
def commandLine = {
var tmpFile: File = java.io.File.createTempFile("removeintersect","tmp")
"cat %s %s | sort | uniq -c | awk '{if ($1==2) print $2}' > %s ; cat %s %s | sort | uniq -c | awk '{if ($1 == 1) print $2}' > %s".format(
fileToUnique.getAbsolutePath,comparisonFile.getAbsolutePath, tmpFile.getAbsolutePath,
tmpFile.getAbsolutePath, fileToUnique.getAbsolutePath, outputFile.getAbsolutePath
)
}
}
class ExtractSites(inVCF: File, oVCF: File) extends CommandLineFunction {
@Input(doc="The vcf file to generate a sites only file from",required=true)
var inputVCF: File = inVCF
@Output(doc="The sites-only vcf to write to")
var outputVCF: File = oVCF
def commandLine = {
"cut -f1-9 %s > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
}
}
class ConcatVCF(in: List[File], out: File) extends CommandLineFunction {
@Input(doc="The files to concatenate",required=true)
var inputVCFs : List[File] = in
@Output(doc="The file to write to", required=true)
var outputVCF: File = out
def commandLine = {
var header : File = java.io.File.createTempFile("concatVCF","header.tmp")
var body_unsorted : File = java.io.File.createTempFile("concatVCF","body.unsorted.tmp")
var body : File = java.io.File.createTempFile("concatVCF","body.tmp")
"grep \\\\# %s > %s ; grep -v \\\\# %s | sed 's/.vcf:/\\t/g' | cut -f2- > %s ; perl %s -tmp . %s %s > %s ; cat %s %s > %s".format(
inputVCFs(0).getAbsolutePath,
header.getAbsolutePath,
inputVCFs.foldLeft[String]("")((x1: String, x2: File ) => x1 + " " + x2.getAbsolutePath ),
body_unsorted.getAbsolutePath,
"/humgen/gsa-scr1/chartl/sting/perl/sortByRef.pl",
body_unsorted.getAbsolutePath,
"/humgen/1kg/reference/human_g1k_v37.fasta.fai",
body.getAbsolutePath,
header.getAbsolutePath,
body.getAbsolutePath,
outputVCF.getAbsolutePath
)
}
}
class ExtractSample(vcfIn: File, sample: String, vcfOut: File) extends CommandLineFunction {
@Input(doc="File from which to extract sample")
var inVCF: File = vcfIn
@Argument(doc="The sample to extract")
var inSample: String = sample
@Output(doc="The VCF file to write to")
var outVCF: File = vcfOut
def commandLine = {
"head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | grep -n %s | tr ':' '\\t' | cut -f1 | xargs -i cut -f1-9,\\{\\} %s | egrep \"\\\\#|PASS.*0/1|PASS.*1/0|PASS.*1/1\" > %s".format(
inVCF.getAbsolutePath, inSample, inVCF.getAbsolutePath, outVCF.getAbsolutePath
)
}
}
class GetVariantDepths extends CommandLineFunction {
@Input(doc="The VCF to get the per-sample depth at variant calls")
var inVCF : File = _
@Output(doc="The depth file to write to")
var outFile : File = _
def commandLine = {
"grep PASS %s | cut -f10- | tr '\\t' '\\n' | egrep \"1/0|0/1|1/1|1\\|0|0\\|1|1\\|1\" | tr ':' '\\t' | awk '{print $3}' > %s".format(
inVCF.getAbsolutePath, outFile.getAbsolutePath
)
}
}
def combineAndMergeSites(chrInputs: List[File], output: File) = {
var sites_only: List[ExtractSites] = chrInputs.map( f => new ExtractSites(f,new File(VCF_DIR+swapExt(f,".vcf",".sites.vcf").getName)))
for ( s <- sites_only ) {
add(s)
}
var trivialMerge : ConcatVCF = new ConcatVCF(sites_only.map( s => s.outputVCF ), output)
add(trivialMerge)
}
def liftoverVCF(b36vcf: File, b37vcf: File) = {
class Liftover(in: File, out: File) extends CommandLineFunction {
@Input(doc="foo")
val input: File = in
@Output(doc="foo")
val output : File = out
val sting : String = "/humgen/gsa-scr1/chartl/sting/"
val chain : String = "/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/b36ToHg19.broad.over.chain"
def commandLine = {
"%s/perl/liftOverVCF.pl -vcf %s -chain %s -out %s -gatk %s -newRef %s -oldRef %s -tmp .".format(
sting, input, chain, output, sting, "/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19",
"/humgen/1kg/reference/human_b36_both"
)
}
class RMDupes(in: File, out:File) extends CommandLineFunction {
@Input(doc="foo")
var input : File = in
@Output(doc="foo")
var output : File = out
def commandLine = "cat %s | python /humgen/gsa-scr1/chartl/projects/omni/scripts/filterDupes.py > %s".format(
input.getAbsolutePath,output.getAbsolutePath
)
}
var lift : Liftover = new Liftover(b36vcf,swapExt(b37vcf,".vcf",".undeduped.vcf"))
add(lift)
var rmDupes : RMDupes = new RMDupes(lift.output,b37vcf)
add(rmDupes)
}
}
def callOverlaps(samples: File, intervals: File, wexCalls: File) = {
class GetBamList(sample: String, oList: File) extends CommandLineFunction {
@Argument(doc="foo")
var inSample = sample
@Output(doc="foo")
var outList = oList
@Input(doc = "foo")
var waitForMe: File = _
def commandLine = {
"grep %s /humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/*.list | tr ':' '\\t' | awk '{print $2}' | sort | uniq > %s".format(
inSample,
outList.getAbsolutePath
)
}
}
for ( s <- scala.io.Source.fromFile(samples).getLines ) {
var extract : ExtractSample = new ExtractSample(wexCalls,s,new File(VCF_DIR+"g1k_exome.subset.%s.vcf".format(s)))
add(extract)
var sites : Vcf2List = new Vcf2List
sites.in_vcf = extract.outVCF
sites.out_list = new File(SCRATCH_DIR+"g1k_exome.subset.%s.variants.list".format(s))
add(sites)
var bamList : GetBamList = new GetBamList(s,new File(SCRATCH_DIR+"%s.bams.list".format(s)))
bamList.waitForMe = sites.out_list
add(bamList)
var genotype : UnifiedGenotyper = new UnifiedGenotyper
genotype.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
genotype.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
genotype.intervals :+= sites.out_list
genotype.out = new File(VCF_DIR+"g1k_lowpass.%s.exome_sites.vcf".format(s))
genotype.input_file :+= bamList.outList
Walkers can now specify a class extending from Gatherer to merge custom output formats. Add @Gather(MyGatherer.class) to the walker @Output. JavaCommandLineFunctions can now specify the classpath+mainclass as an alternative to specifying a path to an executable jar. JCLF by default pass on the current classpath and only require the mainclass be specified by the developer extending the JCLF, relieving the QScript author from having to explicitly specify the jar. Like the Picard MergeSamFiles, GATK engine by default is now run from the current classpath. The GATK can still be overridden via .jarFile or .javaClasspath. Walkers from the GATK package are now also embedded into the Queue package. Updated AnalyzeCovariates to make it easier to guess the main class, AnalyzeCovariates instead of AnalyzeCovariatesCLP. Removed the GATK jar argument from the example QScripts. Removed one of the most FAQ when getting started with Scala/Queue, the use of Option[_] in QScripts: 1) Fixed mistaken assumption with java enums. In java enums can be null so they don't need nullable wrappers. 2) Added syntactic sugar for Nullable primitives to the QScript trait. Any variable defined as Option[Int] can just be assigned an Int value or None, ex: myFunc.memoryLimit = 3 Removed other unused code. Re-fixed dry run function ordering. Re-ordered the QCommandline companion object so that IntelliJ doesn't complain about missing main methods. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5504 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-24 22:03:51 +08:00
genotype.memoryLimit = 3
genotype.output_all_callable_bases = true
add(genotype)
}
}
}