From fab31e1d539c6f5d8ba8075f29f9a25a39c036d6 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 8 Apr 2010 16:18:40 +0000 Subject: [PATCH] Check in so I don't lose this code -- spawning of jobs by genes git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3137 348d0f76-0448-11de-a6fe-93d51630548a --- python/JobDispatcher.py | 31 +++++++++++++++- python/RefseqLibrary.py | 80 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 108 insertions(+), 3 deletions(-) diff --git a/python/JobDispatcher.py b/python/JobDispatcher.py index 902a2b436..8c3cdedc8 100644 --- a/python/JobDispatcher.py +++ b/python/JobDispatcher.py @@ -5,6 +5,7 @@ import time import re import unittest import tempfile +import RefseqLibrary MAX_UNNAMED_DEPENDENCIES = 0 class FarmJob: @@ -399,4 +400,32 @@ class GATKDispatcher(JobDispatcher): else: cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt" return cmdToDispatch - + +class GeneDispatcher(GATKDispatcher): + + def __init__(self,geneNames,jarfile,memory,walker,args,output_directory,reference = None, bams = None, + queues = ["long"], limits = dict([["long",500]]), print_only = False, delay = "0:1:0"): + JobDispatcher.GATKDispatcher.__init__(self,jarfile,memory,walker,args,output_directory,reference,bams,None,queues, + limits,print_only,"space",delay) + self.genes = RefseqLibrary.getRefseqGenes(geneNames) + + def dispatchByInterval(self,base_limit): + raise JobDispatchError("Dispatch by interval not permitted from GeneDispatcher") + + def dispatchByGene(self): + dispatchCommand = self.baseCommand + " -R "+self.reference + farmJobs = list() + jobNumber = "" + headerLines = RefseqLibrary.getIntervalHeaderLines() + + if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ): + os.mkdir(self.outputDir+"GATKDispatcher/") + if ( self.bams != None ): + dispatchCommand += " -I "+self.bams + + for gene in self.genes: + jobNumber = "_"+gene.getGeneName() + intervals = gene.getExonIntervals() + farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand)) + + self.dispatchAll_Interval(farmJobs) diff --git a/python/RefseqLibrary.py b/python/RefseqLibrary.py index bd3ce7c2f..9e0c34191 100755 --- a/python/RefseqLibrary.py +++ b/python/RefseqLibrary.py @@ -73,6 +73,9 @@ class Interval: return True return False + def bedFormat(self): + return self.chromosome+"\t"+str(self.start)+"\t"+str(self.stop)+"\t+\ttarget_x" + def __str__(self): return self.chromosome + ":" + str(self.start) + "-" + str(self.stop) @@ -161,6 +164,12 @@ class Gene: size = size + exon.size() return size + def getExonIntervals(self): + intervals = list() + for exon in self.exons: + intervals.append(exon.getInterval()) + return intervals + def getBaseCoverage(self): coverage = 0 for exon in self.exons: @@ -169,9 +178,15 @@ class Gene: def __str__(self): exonString = list() - for exon in exons: + for exon in self.exons: exonString.append(str(exon)) - return name+"\t"+"\t".join(exonString) + return self.name+"\t"+"\t".join(exonString) + + def getGeneName(self): + return self.name + + def setGeneName(self,newName): + self.name = newName class ExonRecord(Exon): def __init__(self,geneName,exonid,chrom,start,stop,prop): @@ -208,3 +223,64 @@ class CoverageRecord: def getInterval(self): return self.interval + +def getRefseqGenes(names): + names = list(names) + refGene = open("/humgen/gsa-hpprojects/GATK/data/refGene.sorted.txt") + refSeq = open("/humgen/gsa-hpprojects/GATK/data/refseq/hg18.ref_gene.cds.bed") + refSeqGeneNames = list() + refNamesToAltNames = dict() + for name in names: + if ( name.startswith("NM_") ): + refSeqGeneNames.append(name) + refNamesToAltNames[name]=name + + if ( len(names) > 0 ): + for line in refGene.readlines(): + spline = line.strip().split("\t") + altName = spline[len(spline)-4] + if ( altName in names ): + if ( not ( altName in refNamesToAltNames.values() ) ): + refSeqGeneNames.append(spline[1]) + refNamesToAltNames[spline[1]]=altName + else: + print("WARNING: multiple transcripts found for gene "+altName+" using first available transcript from refseq export") + + if ( len(names) > len(refSeqGeneNames) ): + for g in refSeqGeneNames: + if ( refNamesToAltNames[g] in names ): + names.remove(refNamesToAltNames[g]) + + raise ValueError("No entry found for genes: "+str(names)) + + # build up the gene list + genes = dict() + for geneName in refSeqGeneNames: + genes[geneName] = Gene(geneName) + + for line in refSeq.readlines(): + spline = line.strip().split("\t") + geneName = spline[3].split("_cds")[0] + if ( geneName in refSeqGeneNames ): + chrom = spline[0] + start = int(spline[1]) + stop = int(spline[2]) + id = "cds_"+spline[3].split("_cds_")[1].split("_")[0] + genes[geneName].addExon(Exon(geneName,id,chrom,start,stop)) + + toReturn = list() + for gene in genes.values(): + gene.setGeneName(refNamesToAltNames[gene.getGeneName()]) + toReturn.append(gene) + return toReturn + + +def getIntervalHeaderLines(): + whole_exome_file = open("/humgen/gsa-hpprojects/GATK/data/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.hg18.interval_list") + header = list() + line = whole_exome_file.readline() + while ( line.startswith("@") ): + header.append(line) + line = whole_exome_file.readline() + whole_exome_file.close() + return header