gatk-3.8/scala/qscript/core/GATKResourcesBundle.scala

335 lines
13 KiB
Scala
Executable File

package core
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.JavaCommandLineFunction
class GATKResourcesBundle extends QScript {
// todo -- update to released version when things stabilize
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("dist/GenomeAnalysisTK.jar")
@Argument(doc="liftOverPerl", required=false)
var liftOverPerl: File = new File("./perl/liftOverVCF.pl")
@Argument(shortName = "svn", doc="The SVN version of this release", required=true)
var SVN_VERSION: String = _
@Argument(shortName = "bundleDir", doc="Path to root where resource files will be placed", required=false)
val BUNDLE_ROOT = new File("/humgen/gsa-hpprojects/GATK/bundle")
@Argument(shortName = "downloadDir", doc="Path to root where resource files will be placed for users to download", required=false)
val DOWNLOAD_ROOT = new File("/humgen/gsa-scr1/pub/bundle")
@Argument(shortName = "test", doc="", required=false)
val TEST = false
@Argument(shortName = "phase2", doc="", required=false)
val DO_DOWNLOAD = false
val SITES_EXT: String = "sites"
def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + SVN_VERSION
def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + SVN_VERSION
// REFERENCES
class Reference( val name: String, val file: File ) { }
var b37: Reference = _
var hg18: Reference = _
var b36: Reference = _
var exampleFASTA: Reference = _
var refs: List[Reference] = _
class Resource(val file: File, val name: String, val ref: Reference, val useName: Boolean = true, val makeSites: Boolean = true ) {
def destname(target: Reference): String = {
if ( useName )
return name + "." + target.name + "." + getExtension(file)
else
return file.getName
}
}
def liftover(in: File, inRef: Reference, out: File, outRef: Reference): CommandLineFunction = {
//Console.printf("liftover(%s => %s)%n", inRef.name, outRef.name)
(inRef.name, outRef.name) match {
case ("b37", "hg19") =>
return new LiftOverPerl(in, out, new File("chainFiles/b37Tohg19.chain"), inRef, outRef)
case ("b37", "hg18") =>
return new LiftOverPerl(in, out, new File("chainFiles/b37tohg18.chain"), inRef, outRef)
case ("b37", "b36") =>
return new LiftOverPerl(in, out, new File("chainFiles/b37tob36.chain"), inRef, outRef)
case _ => return null
}
}
def isVCF(file: File) = file.getName.endsWith(".vcf")
def isBAM(file: File) = file.getName.endsWith(".bam")
def isFASTA(file: File) = file.getName.endsWith(".fasta")
var RESOURCES: List[Resource] = Nil
def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
this.memoryLimit = 2
}
def initializeTestDataFiles() = {
//
// Standard evaluation files for indels
//
b37 = new Reference("b37", new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"))
hg18 = new Reference("hg18", new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta"))
exampleFASTA = new Reference("exampleFASTA", new File("testdata/exampleFASTA.fasta"))
refs = List(b37, hg18, exampleFASTA)
val DATAROOT = "/Users/depristo/Desktop/broadLocal/localData/"
//addResource(new Resource(DATAROOT + "human_g1k_v37.fasta", "human_g1k_v37.fasta", b37, false))
addResource(new Resource(DATAROOT + "1000G.snp.validation.b37.vcf", "1000G.snp.validation", b37))
addResource(new Resource(DATAROOT + "dbsnp_132_b37.vcf", "dbsnp_132", b37, true, false))
addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false))
addResource(new Resource("testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false))
}
def initializeStandardDataFiles() = {
//
// references
//
b37 = new Reference("b37", new File("/humgen/1kg/reference/human_g1k_v37.fasta"))
hg18 = new Reference("hg18", new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"))
b36 = new Reference("b36", new File("/humgen/1kg/reference/human_b36_both.fasta"))
exampleFASTA = new Reference("exampleFASTA", new File("testdata/exampleFASTA.fasta"))
refs = List(b37, hg18, b36, exampleFASTA)
addResource(new Resource(b37.file, "", b37, false))
addResource(new Resource(b36.file, "", b36, false))
addResource(new Resource(hg18.file, "", hg18, false))
//
// standard VCF files. Will be lifted to each reference
//
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf",
"dbsnp_132", b37, true, false))
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf",
"1000G_omni2.5", b37, true, true))
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf",
"hapmap_3.3", b37, true, true))
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf",
"1000G_indels_for_realignment", b37, true, false))
//
// example call set for wiki tutorial
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf",
"NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true))
//
// Test BAM file, specific to each reference
//
addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam",
"IGNORE", b37, false, false))
//
// refGene files specific to each reference
//
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/refGene_b37.sorted.txt",
"refGene", b37, true, false))
addResource(new Resource("chainFiles/hg18tob37.chain", "", hg18, false, false))
addResource(new Resource("chainFiles/b36tob37.chain", "", b36, false, false))
// todo -- chain files?
// todo 1000G SNP and indel call sets?
//
// exampleFASTA file
//
addResource(new Resource(exampleFASTA.file, "exampleFASTA", exampleFASTA, false))
addResource(new Resource("testdata/exampleBAM.bam", "exampleBAM", exampleFASTA, false))
}
def createBundleDirectories(dir: File) = {
if ( ! dir.exists ) dir.mkdirs()
for ( ref <- refs ) {
val refDir = new File(dir + "/" + ref.name)
if ( ! refDir.exists ) refDir.mkdirs()
}
}
def script = {
if ( TEST )
initializeTestDataFiles();
else
initializeStandardDataFiles();
if ( ! DO_DOWNLOAD ) {
// create destination directory structure
createBundleDirectories(BUNDLE_DIR)
for ( resource: Resource <- RESOURCES ) {
if ( isFASTA(resource.file) ) {
val f = copyBundleFile(resource, resource.ref)
add(new createDictandFAI(f))
} else if ( isBAM(resource.file) ) {
val f = copyBundleFile(resource, resource.ref)
add(new IndexBAM(f))
@Output val outvcf: File = swapExt(f.getParent, f, ".bam", ".vcf")
add(new UG(resource.file, resource.ref.file, outvcf))
} else if ( isVCF(resource.file) ) {
for ( destRef <- refs ) {
val out = destFile(BUNDLE_DIR, destRef, resource.destname(destRef))
var continue = true
// copy or lift over the original vcf
if ( resource.ref == destRef ) {
add(new cpFile(resource.file, out))
} else {
val clf = liftover(resource.file, resource.ref, out, destRef)
if ( clf != null ) {
add(clf)
} else {
continue = false
}
}
if ( continue ) {
add(new IndexVCF(out, destRef.file))
if ( resource.makeSites ) {
val sites: Resource = new Resource(swapExt(out.getParent, out, ".vcf", "." + SITES_EXT + ".vcf"), "", destRef, false)
add(new JustSites(out, sites.file))
add(new IndexVCF(sites.file, destRef.file))
}
if ( resource.name.contains("dbsnp") ) {
val dbsnp129: Resource = new Resource(swapExt(out.getParent, out, ".vcf", ".excluding_sites_after_129.vcf"), "", destRef, false)
add(new MakeDBSNP129(out, destRef.file, dbsnp129.file))
add(new IndexVCF(dbsnp129.file, destRef.file))
}
}
}
} else {
//throw new ReviewedStingException("Unknown file type: " + resource)
}
}
} else {
createBundleDirectories(DOWNLOAD_DIR)
createDownloadsFromBundle(BUNDLE_DIR, DOWNLOAD_DIR)
}
}
def createDownloadsFromBundle(in: File, out: File) {
Console.printf("Visiting %s%n", in)
if (! in.getName.startsWith(".")) {
if ( in.isDirectory ) {
out.mkdirs
for ( child: File <- in.listFiles ) {
createDownloadsFromBundle(child, out + "/" + child.getName)
}
} else {
if ( isBAM(in) ) {
add(new cpFile(in, out))
add(new md5sum(out))
} else {
add(new GzipFile(in, out + ".gz"))
add(new md5sum(out + ".gz"))
}
}
}
}
def copyBundleFile(res: Resource, ref: Reference): File = {
val out = destFile(BUNDLE_DIR, ref, res.destname(ref))
add(new cpFile(res.file, out))
return out
}
def destFile(dir: File, ref: Reference, f: File): File = {
return destFile(dir, ref, f.getName)
}
def destFile(dir: File, ref: Reference, name: String): File = {
return new File(dir + "/" + ref.name + "/" + name)
}
/**
* A command line (cut) that removes all genotyping information from a file
*/
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
def commandLine = "cut -f 1-8 %s > %s".format(in, out)
}
class GzipFile(@Input val in: File, @Output val out: File) extends CommandLineFunction {
def commandLine = "gzip -c %s > %s".format(in.getAbsolutePath, out.getAbsolutePath)
}
class cpFile(@Input val in: File, @Output val out: File) extends CommandLineFunction {
def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath)
}
class md5sum(@Input val in: File) extends CommandLineFunction {
@Output val o: File = new File(in.getAbsolutePath + ".md5")
def commandLine = "md5sum %s > %s".format(in.getAbsolutePath, o)
}
class IndexBAM(bamIn: File) extends SamtoolsIndexFunction {
bamFile = bamIn
}
class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS {
//@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx")
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
this.reference_sequence = ref
}
class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
this.input_file = List(bam)
this.reference_sequence = ref
this.out = outVCF
}
class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("variant", "VCF", dbsnp)
this.select ++= List("\"dbSNPBuildID <= 129\"")
this.reference_sequence = ref
this.out = dbsnp129
}
class LiftOverPerl(@Input val in: File, @Output val out: File, @Input val chain: File, oldRef: Reference, newRef: Reference) extends CommandLineFunction {
this.memoryLimit = 8
def commandLine = ("%s -vcf %s -chain %s -out %s " +
"-gatk ./ -newRef %s -oldRef %s -tmp %s").format(liftOverPerl, in.getAbsolutePath, chain,
out.getAbsolutePath, newRef.file.replace(".fasta", ""),
oldRef.file.replace(".fasta", ""), jobTempDir)
}
def getExtension(f: File): String = {
val i = f.getName.lastIndexOf('.');
if (i > 0 && i < f.getName.length() - 1)
return f.getName.substring(i+1).toLowerCase();
else
return "";
}
class createDictandFAI (@Input ref: File) extends FastaStats with UNIVERSAL_GATK_ARGS {
@Output val outDict: File = swapExt(ref.getParent, ref, ".fasta", ".dict")
@Output val outFai: File = swapExt(ref.getParent, ref, ".fasta", ".fasta.fai")
@Output val outStats: File = swapExt(ref.getParent, ref, ".fasta", ".stats")
this.reference_sequence = ref
this.out = outStats
}
}