408 lines
16 KiB
Plaintext
Executable File
408 lines
16 KiB
Plaintext
Executable File
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils
|
|
import org.broadinstitute.sting.queue.extensions.gatk.{UnifiedGenotyper, RodBind, CombineVariants, SelectVariants}
|
|
import org.broadinstitute.sting.queue.QScript
|
|
import tools.nsc.io.File
|
|
|
|
/**
 * Queue pipeline for characterizing "private" mutations in the EOMI/ESP exome
 * call set: single-sample variants are successively filtered against dbSNP,
 * the 1000 Genomes CEU pilot, and the 1000G low-pass / exome EUR call sets.
 * See the TODO list below for the planned analysis steps.
 */
class private_mutations extends QScript {

  pm_script => // alias for the script arguments

  // Input call-set resources (absolute paths on the Broad filesystem).
  // eomi_merged_calls is already on hg19; ceu_pilot_calls is b36 and is
  // lifted over in script() before use.
  val eomi_merged_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_merged.vcf")

  val g1k_exome_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/g1k_exomes.vcf")

  val ceu_pilot_calls : File = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/CEU.low_coverage.genotypes.vcf")

  // egrep patterns handed to Vcf2List.filter (quoted for the shell).
  // SINGLE_SAMPLE: PASS sites carried by one sample — AC=1, or AC=2 as a
  // single hom-alt genotype (1/1 or phased 1|1).
  val SINGLE_SAMPLE_EXPRESSION = "\"PASS.*AC=1;|PASS.*AC=2;.*1/1|PASS.*AC=2.*1\\|1\""
  // NOVEL: note this embeds the -v (invert) grep flag in the "filter" string.
  // NOTE(review): not referenced anywhere in this file — possibly dead.
  val NOVEL_VARIANT_EXPRESSION = "-v \"rs[0-9]\""
  // KNOWN: PASS sites that carry an rsID.
  val KNOWN_VARIANT_EXPRESSION = "\"rs[0-9].*PASS\""

  // Output directory layout; all jobs write beneath BASE_DIR.
  val BASE_DIR = "/humgen/gsa-hphome1/chartl/projects/private_mutations/queue/"

  val LIST_DIR = BASE_DIR + "lists/"

  val VCF_DIR = BASE_DIR + "vcfs/"

  val EVAL_DIR = BASE_DIR + "evals/"

  val SCRATCH_DIR = BASE_DIR + "intermediate/"

  /*
   * Steps of analysis.
   *
   * todo -- 1) Calculate number of EOMI calls top-down, cumulative include/exclude that are:
   * - present in one sample
   * - not in dbsnp 129
   * - not in 1000G pilot CEU
   * - not in 1000G production EUR
   * - not in 1000G production exome EUR
   *
   * todo -- 2) Distributional aspects of remaining mutations
   * - Depth of Coverage
   * - Functional Class
   *
   * todo -- 3) 1000G Exome v Lowpass -- Venn of single-sample variants
   * - Venn numbers
   * - Distribution of depth for exome variants missed in low-pass
   *
   * todo -- 4) Estimate relationship between depth and sensitivity to private variation
   *
   * todo -- 5) Calculate % of exome empowered to detect private variation
   *
   * todo -- 6) Identify loci across 1000G Ex and EOMI that are low-powered
   * - venn of low-power targets
   * - mapping the intersection
   * - genes affected by low power
   */
|
/**
 * Builds the whole pipeline: merges/lifts the comparison call sets, converts
 * each to a chr:pos site list, successively subtracts known/CEU/EUR/exome
 * sites from the EOMI single-sample list, extracts the surviving "private"
 * records as a VCF, and collects per-variant depths.  Jobs are registered
 * with add(); ordering is by the @Input/@Output file dependencies.
 */
def script = {

  // setup analysis resources (best done in loop)
  // Per-autosome 1000G low-pass EUR calls (chr1..chr22).
  val g1k_lowpass_chr : List[File] = (1 to 22).toList.map( i => new File("/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/chr%d/ALL.august.chr%d.recal.vrcut.EUR.vcf".format(i,i)))

  // NOTE(review): built but never used anywhere below — possibly dead.
  val eomi_annot_set : List[File] = (1 to 7).toList.map( i => new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/resources/esp_set%d.vcf".format(i)))

  val g1k_lowpass_hg19_merged : File = new File(VCF_DIR+"g1k_lowpass_merged.hg19.sites.vcf")

  // EOMI calls are already hg19; aliased for naming symmetry.
  val eomi_merged_hg19 : File = eomi_merged_calls

  val ceu_hg19 : File = new File(VCF_DIR+"G1K_Pilot.CEU.hg19.vcf")

  // Sites-only merge of the 22 low-pass VCFs, and b36->hg19 liftover of the
  // CEU pilot calls (each helper add()s its own jobs).
  combineAndMergeSites(g1k_lowpass_chr,g1k_lowpass_hg19_merged)

  liftoverVCF(ceu_pilot_calls,ceu_hg19)

  // get EOMI ss-only list: chr:pos of PASS singleton sites in EOMI
  var eomi_single_sample : Vcf2List = new Vcf2List

  eomi_single_sample.in_vcf = eomi_merged_hg19

  eomi_single_sample.filter = SINGLE_SAMPLE_EXPRESSION

  eomi_single_sample.out_list = new File(LIST_DIR+"EOMI_single_sample_variants.sites.list")

  add(eomi_single_sample)

  // chr:pos of PASS sites with an rsID (i.e. KNOWN sites, used below to
  // subtract dbSNP).  NOTE(review): file name says "novel" but the filter is
  // KNOWN_VARIANT_EXPRESSION — the contents are known sites; confirm intent.
  var eomi_dbsnp : Vcf2List = new Vcf2List

  eomi_dbsnp.in_vcf = eomi_merged_hg19

  eomi_dbsnp.filter = KNOWN_VARIANT_EXPRESSION

  eomi_dbsnp.out_list = new File(LIST_DIR+"EOMI_novel_variants.sites.list")

  add(eomi_dbsnp)

  // chr:pos lists for each comparison call set (default filter "PASS")
  var ceu_sites : Vcf2List = new Vcf2List

  ceu_sites.in_vcf = ceu_hg19

  ceu_sites.out_list = new File(SCRATCH_DIR+"G1KP_CEU_variants.sites.list")

  add(ceu_sites)

  var g1k_lowpass_sites : Vcf2List = new Vcf2List

  g1k_lowpass_sites.in_vcf = g1k_lowpass_hg19_merged

  g1k_lowpass_sites.out_list = new File(SCRATCH_DIR+"G1K_lowpass_EUR.sites.list")

  add(g1k_lowpass_sites)

  var g1k_exome_sites : Vcf2List = new Vcf2List

  g1k_exome_sites.in_vcf = g1k_exome_calls

  g1k_exome_sites.out_list = new File(SCRATCH_DIR+"G1K_exomes.sites.list")

  add(g1k_exome_sites)

  // Cumulative subtraction chain: single-sample minus dbSNP, minus CEU pilot,
  // minus low-pass EUR, minus exome — each step feeds the next via its output.
  var remove_dbsnp : RemoveIntersect = new RemoveIntersect

  remove_dbsnp.fileToUnique = eomi_single_sample.out_list

  remove_dbsnp.comparisonFile = eomi_dbsnp.out_list

  remove_dbsnp.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.list")

  add(remove_dbsnp)

  var remove_ceu : RemoveIntersect = new RemoveIntersect

  remove_ceu.fileToUnique = remove_dbsnp.outputFile

  remove_ceu.comparisonFile = ceu_sites.out_list

  remove_ceu.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.list")

  add(remove_ceu)

  var remove_lowpass : RemoveIntersect = new RemoveIntersect

  remove_lowpass.fileToUnique = remove_ceu.outputFile

  remove_lowpass.comparisonFile = g1k_lowpass_sites.out_list

  remove_lowpass.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.list")

  add(remove_lowpass)

  var remove_exome : RemoveIntersect = new RemoveIntersect

  remove_exome.fileToUnique = remove_lowpass.outputFile

  remove_exome.comparisonFile = g1k_exome_sites.out_list

  remove_exome.outputFile = new File(LIST_DIR+"EOMI_single_sample.nodbsnp.noceu.noeur.noexome.list")

  add(remove_exome)

  // Pull the surviving private sites back out of the EOMI VCF (the chr:pos
  // list is used as an interval list).
  var subset : SelectVariants = new SelectVariants

  subset.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")

  subset.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

  subset.variantVCF = eomi_merged_hg19

  subset.out = new File(VCF_DIR+"EOMI_merged.hg19.private.vcf")

  subset.intervals :+= remove_exome.outputFile

  add(subset)

  // Per-genotype depth distribution of the private variants (step 2 of TODO)
  var varDepths : GetVariantDepths = new GetVariantDepths

  varDepths.inVCF = subset.out

  varDepths.outFile = new File("/humgen/gsa-hphome1/chartl/projects/private_mutations/results/EOMI.private.variant.depths.txt")

  add(varDepths)

  // Single-sample site lists for the exome/low-pass Venn (step 3 of TODO)
  var lowpass_single : Vcf2List = new Vcf2List

  lowpass_single.in_vcf = g1k_lowpass_hg19_merged

  lowpass_single.filter = SINGLE_SAMPLE_EXPRESSION

  lowpass_single.out_list = new File(LIST_DIR+"g1k_lowpass_single_sample.variants.list")

  add(lowpass_single)

  var exome_single : Vcf2List = new Vcf2List

  exome_single.in_vcf = g1k_exome_calls

  exome_single.filter = SINGLE_SAMPLE_EXPRESSION

  exome_single.out_list = new File(LIST_DIR+"g1k_exome_single_sample.variants.list")

  add(exome_single)

  var exome_extract : ExtractSites = new ExtractSites(g1k_exome_calls,new File(VCF_DIR+"g1k_exome.sites.vcf"))

  add(exome_extract)

  // CombineVariants union of exome + low-pass, restricted to the exome
  // single-sample sites (exome side of the Venn)
  var getVennExS : CombineVariants = new CombineVariants

  getVennExS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);

  getVennExS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);

  getVennExS.priority = "exome,lowpass"

  getVennExS.intervals :+= exome_single.out_list

  getVennExS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

  getVennExS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")

  getVennExS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)

  getVennExS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)

  getVennExS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.exome.sites.vcf")

  // NOTE(review): exome-side Venn job is configured but intentionally(?) not added
  //add(getVennExS)

  // Same union restricted to low-pass single-sample sites (low-pass side)
  var getVennLPS : CombineVariants = new CombineVariants

  getVennLPS.rodBind :+= new RodBind("lowpass","VCF",g1k_lowpass_hg19_merged);

  getVennLPS.rodBind :+= new RodBind("exome","VCF",exome_extract.outputVCF);

  getVennLPS.priority = "exome,lowpass"

  getVennLPS.intervals :+= lowpass_single.out_list

  getVennLPS.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

  getVennLPS.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")

  getVennLPS.genotypeMergeOptions = Some(VariantContextUtils.GenotypeMergeType.UNIQUIFY)

  getVennLPS.variantMergeOptions = Some(VariantContextUtils.VariantMergeType.UNION)

  getVennLPS.out = new File(VCF_DIR + "g1k_exome_plus_lowpass.singlesample.merged.lowpass.sites.vcf")

  add(getVennLPS)

  // Samples present in both the exome and low-pass EUR call sets.
  // NOTE(review): g1k_lowpass_chr(1) is the *chr2* file (0-based index) —
  // confirm chr1 (index 0) was not intended; any chromosome's header works
  // for sample-name overlap, so this may be deliberate.
  var getG1KOverlap : GetOverlapSamples = new GetOverlapSamples

  getG1KOverlap.vcf1 = g1k_exome_calls

  getG1KOverlap.vcf2 = g1k_lowpass_chr(1) // EUR only

  getG1KOverlap.samples = new File(LIST_DIR+"g1k_EUR_exome_lowpass_overlap.txt")

  add(getG1KOverlap)

  //callOverlaps(getG1KOverlap.samples,exome_single.out_list, g1k_exome_calls)

}
|
|
|
|
/*
|
|
* defined modules go here
|
|
*/
|
|
|
|
/**
 * Converts a VCF into a flat "chr:pos" site list via egrep + awk.
 * Lines matching `filter` (an egrep pattern, pre-quoted for the shell;
 * default "PASS") have their first two columns joined with ':'.
 */
class Vcf2List extends CommandLineFunction {

  @Input(doc="The vcf file to convert to list",required=true)
  var in_vcf: File = _

  @Argument(doc="An egrep-based filter to apply during conversion",required=false)
  var filter: String = "PASS"

  @Output(doc="The list file to write to", required=true)
  var out_list: File = _

  def commandLine = {
    // awk prints CHROM":"POS for every line the filter lets through
    val template = "egrep %s %s | awk '{print $1\":\"$2}' > %s"
    template.format(filter, in_vcf.getAbsolutePath, out_list.getAbsolutePath)
  }
}
|
|
|
|
/**
 * Writes the sample IDs present in BOTH input VCFs.
 * Scans the first 500 lines of each file for the #CHROM header row, splits
 * the genotype columns (field 10 onward) one-per-line, and keeps names that
 * occur exactly twice (once per file).
 */
class GetOverlapSamples extends CommandLineFunction {

  @Input(doc="vcf1")
  var vcf1: File = _

  @Input(doc="vcf2")
  var vcf2: File = _

  @Output(doc="sample file")
  var samples: File = _

  def commandLine = {
    // assumes the #CHROM line appears within the first 500 lines of each VCF
    val pipeline =
      "head -n 500 %s %s | grep \\\\#CHR | cut -f10- | tr '\\t' '\\n' | sort | uniq -c | awk '{if ($1 == 2) print $2}' > %s"
    pipeline.format(vcf1.getAbsolutePath, vcf2.getAbsolutePath, samples.getAbsolutePath)
  }
}
|
|
|
|
/**
 * Line-oriented set difference: writes the lines of fileToUnique that do NOT
 * appear in comparisonFile.
 *
 * Pass 1: lines occurring in both inputs (uniq -c count == 2) -> tmpFile.
 * Pass 2: lines of fileToUnique absent from tmpFile (count == 1) -> outputFile.
 * Assumes neither input contains duplicate lines, otherwise the counts are
 * off — TODO confirm upstream site lists are deduplicated.
 */
class RemoveIntersect extends CommandLineFunction {

  @Input(doc="The vcf file whose unique entries should be retained",required=true)
  var fileToUnique: File = _

  @Input(doc="The vcf file whose entries overlapping the unique file you would like to remove",required=true)
  var comparisonFile: File = _

  @Output(doc="The file containing only the fileToUnique-unique entries", required=true)
  var outputFile: File = _

  // FIX: previously this temp file was created inside commandLine, so every
  // evaluation minted a fresh path — if Queue rendered the command more than
  // once (e.g. for logging vs. dispatch) the rendered strings disagreed.
  // A lazy val pins one intermediate path per job instance.
  private lazy val tmpFile: File = java.io.File.createTempFile("removeintersect","tmp")

  def commandLine = {
    "cat %s %s | sort | uniq -c | awk '{if ($1==2) print $2}' > %s ; cat %s %s | sort | uniq -c | awk '{if ($1 == 1) print $2}' > %s".format(
      fileToUnique.getAbsolutePath, comparisonFile.getAbsolutePath, tmpFile.getAbsolutePath,
      tmpFile.getAbsolutePath, fileToUnique.getAbsolutePath, outputFile.getAbsolutePath
    )
  }
}
|
|
|
|
/**
 * Produces a sites-only VCF by keeping columns 1-9 (everything up to and
 * including FORMAT), dropping all genotype columns.  Header lines pass
 * through cut unchanged since they contain no tabs to trim.
 */
class ExtractSites(inVCF: File, oVCF: File) extends CommandLineFunction {

  @Input(doc="The vcf file to generate a sites only file from",required=true)
  var inputVCF: File = inVCF

  @Output(doc="The sites-only vcf to write to")
  var outputVCF: File = oVCF

  def commandLine =
    "cut -f1-9 %s > %s".format(inputVCF.getAbsolutePath, outputVCF.getAbsolutePath)
}
|
|
|
|
/**
 * Concatenates several single-contig VCFs into one reference-sorted VCF, in
 * four shell stages:
 *   1. take the header (#-lines) from the FIRST input only;
 *   2. grep -v # across ALL inputs — grep prefixes each line with
 *      "path.vcf:", which the sed 's/.vcf:/\t/g' + cut -f2- strips off;
 *   3. sort the body by reference order with sortByRef.pl against the
 *      b37 .fai;
 *   4. cat header + sorted body into outputVCF.
 * Assumes inputVCFs is non-empty (inputVCFs(0) is dereferenced) and that
 * every input shares a compatible header — TODO confirm for new callers.
 */
class ConcatVCF(in: List[File], out: File) extends CommandLineFunction {

  @Input(doc="The files to concatenate",required=true)
  var inputVCFs : List[File] = in

  @Output(doc="The file to write to", required=true)
  var outputVCF: File = out

  def commandLine = {
    // NOTE(review): temp paths are minted on each commandLine evaluation, so
    // repeated renderings of this command would name different files.
    var header : File = java.io.File.createTempFile("concatVCF","header.tmp")

    var body_unsorted : File = java.io.File.createTempFile("concatVCF","body.unsorted.tmp")

    var body : File = java.io.File.createTempFile("concatVCF","body.tmp")

    "grep \\\\# %s > %s ; grep -v \\\\# %s | sed 's/.vcf:/\\t/g' | cut -f2- > %s ; perl %s -tmp . %s %s > %s ; cat %s %s > %s".format(
      inputVCFs(0).getAbsolutePath,
      header.getAbsolutePath,
      // all input paths, space-joined (note the leading space from the fold)
      inputVCFs.foldLeft[String]("")((x1: String, x2: File ) => x1 + " " + x2.getAbsolutePath ),
      body_unsorted.getAbsolutePath,
      "/humgen/gsa-scr1/chartl/sting/perl/sortByRef.pl",
      body_unsorted.getAbsolutePath,
      "/humgen/1kg/reference/human_g1k_v37.fasta.fai",
      body.getAbsolutePath,
      header.getAbsolutePath,
      body.getAbsolutePath,
      outputVCF.getAbsolutePath
    )
  }
}
|
|
|
|
/**
 * Extracts a single sample's column from a multi-sample VCF, keeping only
 * header lines and PASS records where that sample carries an alt allele.
 * Pipeline: locate the sample's 1-based column number from the #CHROM header
 * (grep -n over the transposed header), feed it to xargs, and run
 * cut -f1-9,<col>; then egrep retains #-lines and PASS rows with 0/1, 1/0,
 * or 1/1 genotypes.  NOTE(review): grep -n on the sample name will also hit
 * samples whose IDs contain this one as a substring — confirm IDs are unique
 * prefixes.  Phased genotypes (0|1 etc.) are NOT retained here, unlike
 * GetVariantDepths — confirm this asymmetry is intended.
 */
class ExtractSample(vcfIn: File, sample: String, vcfOut: File) extends CommandLineFunction {

  @Input(doc="File from which to extract sample")
  var inVCF: File = vcfIn

  @Argument(doc="The sample to extract")
  var inSample: String = sample

  @Output(doc="The VCF file to write to")
  var outVCF: File = vcfOut

  def commandLine = {
    // assumes the #CHROM header appears within the first 500 lines
    "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | grep -n %s | tr ':' '\\t' | cut -f1 | xargs -i cut -f1-9,\\{\\} %s | egrep \"\\\\#|PASS.*0/1|PASS.*1/0|PASS.*1/1\" > %s".format(
      inVCF.getAbsolutePath, inSample, inVCF.getAbsolutePath, outVCF.getAbsolutePath
    )
  }
}
|
|
|
|
/**
 * Emits one depth value per non-reference genotype in a VCF: PASS rows are
 * reduced to their genotype columns, each genotype sub-field is split on ':',
 * and the third sub-field is printed.
 * NOTE(review): assumes the FORMAT string puts depth third (e.g. GT:AD:DP)
 * — confirm against the callers' VCFs.
 */
class GetVariantDepths extends CommandLineFunction {

  @Input(doc="The VCF to get the per-sample depth at variant calls")
  var inVCF : File = _

  @Output(doc="The depth file to write to")
  var outFile : File = _

  def commandLine = {
    val source = inVCF.getAbsolutePath
    val target = outFile.getAbsolutePath
    // egrep keeps both slashed (0/1) and phased (0|1) non-ref genotypes
    "grep PASS %s | cut -f10- | tr '\\t' '\\n' | egrep \"1/0|0/1|1/1|1\\|0|0\\|1|1\\|1\" | tr ':' '\\t' | awk '{print $3}' > %s".format(source, target)
  }
}
|
|
|
|
/**
 * Schedules a sites-only extraction for each per-chromosome VCF, then a
 * ConcatVCF job merging all extracted files into `output`.
 */
def combineAndMergeSites(chrInputs: List[File], output: File) = {
  // one ExtractSites job per input, writing <VCF_DIR>/<name>.sites.vcf
  val perChrSites = chrInputs.map { vcf =>
    new ExtractSites(vcf, new File(VCF_DIR + swapExt(vcf, ".vcf", ".sites.vcf").getName))
  }
  perChrSites.foreach(add(_))
  // trivial merge of the sites-only files into the final output
  add(new ConcatVCF(perChrSites.map(_.outputVCF), output))
}
|
|
|
|
/**
 * Lifts a b36 VCF over to hg19 and removes duplicate records introduced by
 * the liftover: b36vcf -> <b37vcf>.undeduped.vcf -> b37vcf.
 */
def liftoverVCF(b36vcf: File, b37vcf: File) = {

  // Wraps the Broad liftOverVCF.pl script (b36 -> hg19 chain).
  class Liftover(in: File, out: File) extends CommandLineFunction {

    @Input(doc="foo")
    val input: File = in

    @Output(doc="foo")
    val output : File = out

    val sting : String = "/humgen/gsa-scr1/chartl/sting/"

    val chain : String = "/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/b36ToHg19.broad.over.chain"

    def commandLine = {
      "%s/perl/liftOverVCF.pl -vcf %s -chain %s -out %s -gatk %s -newRef %s -oldRef %s -tmp .".format(
        sting, input, chain, output, sting, "/seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19",
        "/humgen/1kg/reference/human_b36_both"
      )
    }
  }
  // FIX: the closing brace above was missing.  Previously RMDupes and the job
  // wiring below were nested inside Liftover's body, so `new Liftover(...)`
  // recursively constructed another Liftover via the `lift` field.

  // Pipes the lifted VCF through filterDupes.py to drop duplicated records.
  class RMDupes(in: File, out:File) extends CommandLineFunction {

    @Input(doc="foo")
    var input : File = in

    @Output(doc="foo")
    var output : File = out

    def commandLine = "cat %s | python /humgen/gsa-scr1/chartl/projects/omni/scripts/filterDupes.py > %s".format(
      input.getAbsolutePath, output.getAbsolutePath
    )
  }

  // lift into an intermediate ".undeduped" file, then de-dupe into b37vcf
  val lift : Liftover = new Liftover(b36vcf, swapExt(b37vcf,".vcf",".undeduped.vcf"))

  add(lift)

  val rmDupes : RMDupes = new RMDupes(lift.output, b37vcf)

  add(rmDupes)
}
|
|
|
|
/**
 * For every sample listed in `samples`, schedules four chained jobs:
 * extract that sample from the exome VCF, convert it to a chr:pos list,
 * look up the sample's low-pass bam list, and re-genotype those sites with
 * the UnifiedGenotyper over the low-pass bams.
 * NOTE(review): the `intervals` parameter is never used in this body.
 */
def callOverlaps(samples: File, intervals: File, wexCalls: File) = {

  // Resolves a sample ID to its low-pass bam list via the release bamLists.
  class GetBamList(sample: String, oList: File) extends CommandLineFunction {

    @Argument(doc="foo")
    var inSample = sample

    @Output(doc="foo")
    var outList = oList

    // dummy input used only to sequence this job after an upstream one
    @Input(doc = "foo")
    var waitForMe: File = _

    def commandLine =
      "grep %s /humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/*.list | tr ':' '\\t' | awk '{print $2}' | sort | uniq > %s".format(
        inSample, outList.getAbsolutePath
      )
  }

  // note: the sample file is read at script-construction time, so it must
  // already exist when this QScript is built
  scala.io.Source.fromFile(samples).getLines.foreach { sampleName =>
    // 1) single-sample exome VCF
    val extract = new ExtractSample(wexCalls, sampleName, new File(VCF_DIR+"g1k_exome.subset.%s.vcf".format(sampleName)))
    add(extract)

    // 2) chr:pos site list for that sample
    val sites = new Vcf2List
    sites.in_vcf = extract.outVCF
    sites.out_list = new File(SCRATCH_DIR+"g1k_exome.subset.%s.variants.list".format(sampleName))
    add(sites)

    // 3) bam list lookup, sequenced after the site list via waitForMe
    val bamList = new GetBamList(sampleName, new File(SCRATCH_DIR+"%s.bams.list".format(sampleName)))
    bamList.waitForMe = sites.out_list
    add(bamList)

    // 4) re-genotype the exome sites from the low-pass bams
    val genotype = new UnifiedGenotyper
    genotype.jarFile = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
    genotype.reference_sequence = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
    genotype.intervals :+= sites.out_list
    genotype.out = new File(VCF_DIR+"g1k_lowpass.%s.exome_sites.vcf".format(sampleName))
    genotype.input_file :+= bamList.outList
    genotype.memoryLimit = Some(3)
    genotype.output_all_callable_bases = true
    add(genotype)
  }
}
|
|
|
|
} |