package core import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction import org.broadinstitute.sting.utils.exceptions.ReviewedStingException import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction import org.broadinstitute.sting.queue.function.JavaCommandLineFunction class GATKResourcesBundle extends QScript { // todo -- update to released version when things stabilize @Argument(doc="gatkJarFile", required=false) var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar") @Argument(doc="liftOverPerl", required=false) var liftOverPerl: File = new File("./perl/liftOverVCF.pl") @Argument(shortName = "svn", doc="The SVN version of this release", required=true) var SVN_VERSION: String = _ @Argument(shortName = "bundleDir", doc="Path to root where resource files will be placed", required=false) val BUNDLE_ROOT = new File("/humgen/gsa-hpprojects/GATK/bundle") @Argument(shortName = "downloadDir", doc="Path to root where resource files will be placed for users to download", required=false) val DOWNLOAD_ROOT = new File("/humgen/gsa-scr1/pub/bundle") @Argument(shortName = "test", doc="", required=false) val TEST = false @Argument(shortName = "phase2", doc="", required=false) val DO_DOWNLOAD = false val SITES_EXT: String = "sites" def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + SVN_VERSION def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + SVN_VERSION // REFERENCES class Reference( val name: String, val file: File ) { } var b37: Reference = _ var hg18: Reference = _ var b36: Reference = _ var exampleFASTA: Reference = _ var refs: List[Reference] = _ class Resource(val file: File, val name: String, val ref: Reference, val useName: Boolean = true, val makeSites: Boolean = true ) { def destname(target: Reference): String = { if ( useName ) return name + "." + target.name + "." + getExtension(file) else return file.getName } } def liftover(in: File, inRef: Reference, out: File, outRef: Reference): CommandLineFunction = { //Console.printf("liftover(%s => %s)%n", inRef.name, outRef.name) (inRef.name, outRef.name) match { case ("b37", "hg19") => return new LiftOverPerl(in, out, new File("chainFiles/b37Tohg19.chain"), inRef, outRef) case ("b37", "hg18") => return new LiftOverPerl(in, out, new File("chainFiles/b37tohg18.chain"), inRef, outRef) case ("b37", "b36") => return new LiftOverPerl(in, out, new File("chainFiles/b37tob36.chain"), inRef, outRef) case _ => return null } } def isVCF(file: File) = file.getName.endsWith(".vcf") def isBAM(file: File) = file.getName.endsWith(".bam") def isFASTA(file: File) = file.getName.endsWith(".fasta") var RESOURCES: List[Resource] = Nil def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES } trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { this.logging_level = "INFO"; this.jarFile = gatkJarFile; this.memoryLimit = 2 } def initializeTestDataFiles() = { // // Standard evaluation files for indels // b37 = new Reference("b37", new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")) hg18 = new Reference("hg18", new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta")) exampleFASTA = new Reference("exampleFASTA", new File("testdata/exampleFASTA.fasta")) refs = List(b37, hg18, exampleFASTA) val DATAROOT = "/Users/depristo/Desktop/broadLocal/localData/" //addResource(new Resource(DATAROOT + "human_g1k_v37.fasta", "human_g1k_v37.fasta", b37, false)) addResource(new Resource(DATAROOT + "1000G.snp.validation.b37.vcf", "1000G.snp.validation", b37)) addResource(new Resource(DATAROOT + "dbsnp_132_b37.vcf", "dbsnp_132", b37, true, false)) addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) addResource(new Resource("testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false)) } def initializeStandardDataFiles() = { // // references // b37 = new Reference("b37", new File("/humgen/1kg/reference/human_g1k_v37.fasta")) hg18 = new Reference("hg18", new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")) b36 = new Reference("b36", new File("/humgen/1kg/reference/human_b36_both.fasta")) exampleFASTA = new Reference("exampleFASTA", new File("testdata/exampleFASTA.fasta")) refs = List(b37, hg18, b36, exampleFASTA) addResource(new Resource(b37.file, "", b37, false)) addResource(new Resource(b36.file, "", b36, false)) addResource(new Resource(hg18.file, "", hg18, false)) // // standard VCF files. Will be lifted to each reference // addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf", "dbsnp_132", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf", "1000G_omni2.5", b37, true, true)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", "hapmap_3.3", b37, true, true)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf", "1000G_indels_for_realignment", b37, true, false)) // // example call set for wiki tutorial // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // // Test BAM file, specific to each reference // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam", "IGNORE", b37, false, false)) // // refGene files specific to each reference // addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/refGene_b37.sorted.txt", "refGene", b37, true, false)) addResource(new Resource("chainFiles/hg18tob37.chain", "", hg18, false, false)) addResource(new Resource("chainFiles/b36tob37.chain", "", b36, false, false)) // todo -- chain files? // todo 1000G SNP and indel call sets? // // exampleFASTA file // addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false)) addResource(new Resource("testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false)) } def createBundleDirectories(dir: File) = { if ( ! dir.exists ) dir.mkdirs() for ( ref <- refs ) { val refDir = new File(dir + "/" + ref.name) if ( ! refDir.exists ) refDir.mkdirs() } } def script = { if ( TEST ) initializeTestDataFiles(); else initializeStandardDataFiles(); if ( ! DO_DOWNLOAD ) { // create destination directory structure createBundleDirectories(BUNDLE_DIR) for ( resource: Resource <- RESOURCES ) { if ( isFASTA(resource.file) ) { val f = copyBundleFile(resource, resource.ref) add(new createDictandFAI(f)) } else if ( isBAM(resource.file) ) { val f = copyBundleFile(resource, resource.ref) add(new IndexBAM(f)) @Output val outvcf: File = swapExt(f.getParent, f, ".bam", ".vcf") add(new UG(resource.file, resource.ref.file, outvcf)) } else if ( isVCF(resource.file) ) { for ( destRef <- refs ) { val out = destFile(BUNDLE_DIR, destRef, resource.destname(destRef)) var continue = true // copy or lift over the original vcf if ( resource.ref == destRef ) { add(new cpFile(resource.file, out)) } else { val clf = liftover(resource.file, resource.ref, out, destRef) if ( clf != null ) { add(clf) } else { continue = false } } if ( continue ) { add(new IndexVCF(out, destRef.file)) if ( resource.makeSites ) { val sites: Resource = new Resource(swapExt(out.getParent, out, ".vcf", "." + SITES_EXT + ".vcf"), "", destRef, false) add(new JustSites(out, sites.file)) add(new IndexVCF(sites.file, destRef.file)) } if ( resource.name.contains("dbsnp") ) { val dbsnp129: Resource = new Resource(swapExt(out.getParent, out, ".vcf", ".excluding_sites_after_129.vcf"), "", destRef, false) add(new MakeDBSNP129(out, destRef.file, dbsnp129.file)) add(new IndexVCF(dbsnp129.file, destRef.file)) } } } } else { //throw new ReviewedStingException("Unknown file type: " + resource) } } } else { createBundleDirectories(DOWNLOAD_DIR) createDownloadsFromBundle(BUNDLE_DIR, DOWNLOAD_DIR) } } def createDownloadsFromBundle(in: File, out: File) { Console.printf("Visiting %s%n", in) if (! in.getName.startsWith(".")) { if ( in.isDirectory ) { out.mkdirs for ( child: File <- in.listFiles ) { createDownloadsFromBundle(child, out + "/" + child.getName) } } else { if ( isBAM(in) ) { add(new cpFile(in, out)) add(new md5sum(out)) } else { add(new GzipFile(in, out + ".gz")) add(new md5sum(out + ".gz")) } } } } def copyBundleFile(res: Resource, ref: Reference): File = { val out = destFile(BUNDLE_DIR, ref, res.destname(ref)) add(new cpFile(res.file, out)) return out } def destFile(dir: File, ref: Reference, f: File): File = { return destFile(dir, ref, f.getName) } def destFile(dir: File, ref: Reference, name: String): File = { return new File(dir + "/" + ref.name + "/" + name) } /** * A command line (cut) that removes all genotyping information from a file */ class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction { def commandLine = "cut -f 1-8 %s > %s".format(in, out) } class GzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { def commandLine = "gzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath) } class cpFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath) } class md5sum(@Input val in: File) extends CommandLineFunction { @Output val o: File = new File(in.getAbsolutePath + ".md5") def commandLine = "md5sum %s > %s".format(in.getAbsolutePath, o) } class IndexBAM(bamIn: File) extends SamtoolsIndexFunction { bamFile = bamIn } class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS { //@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx") this.rodBind :+= RodBind(vcf.getName, "VCF", vcf) this.reference_sequence = ref } class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { this.input_file = List(bam) this.reference_sequence = ref this.out = outVCF } class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS { this.rodBind :+= RodBind("variant", "VCF", dbsnp) this.select ++= List("\"dbSNPBuildID <= 129\"") this.reference_sequence = ref this.out = dbsnp129 } class LiftOverPerl(@Input val in: File, @Output val out: File, @Input val chain: File, oldRef: Reference, newRef: Reference) extends CommandLineFunction { this.memoryLimit = 8 def commandLine = ("%s -vcf %s -chain %s -out %s " + "-gatk ./ -newRef %s -oldRef %s -tmp %s").format(liftOverPerl, in.getAbsolutePath, chain, out.getAbsolutePath, newRef.file.replace(".fasta", ""), oldRef.file.replace(".fasta", ""), jobTempDir) } def getExtension(f: File): String = { val i = f.getName.lastIndexOf('.'); if (i > 0 && i < f.getName.length() - 1) return f.getName.substring(i+1).toLowerCase(); else return ""; } class createDictandFAI (@Input ref: File) extends FastaStats with UNIVERSAL_GATK_ARGS { @Output val outDict: File = swapExt(ref.getParent, ref, ".fasta", ".dict") @Output val outFai: File = swapExt(ref.getParent, ref, ".fasta", ".fasta.fai") @Output val outStats: File = swapExt(ref.getParent, ref, ".fasta", ".stats") this.reference_sequence = ref this.out = outStats } }