From 2e4377b1cf84c31ca4cb08dd28888908a96c3fdf Mon Sep 17 00:00:00 2001 From: chartl Date: Wed, 14 Apr 2010 18:17:41 +0000 Subject: [PATCH] Awesome: JobDispatcher can now dispatch jobs by gene from a target .design file found in /seq/references. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3170 348d0f76-0448-11de-a6fe-93d51630548a --- python/JobDispatcher.py | 31 ++++++++++++++++++++----------- python/RefseqLibrary.py | 25 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/python/JobDispatcher.py b/python/JobDispatcher.py index 8c3cdedc8..336dea4be 100644 --- a/python/JobDispatcher.py +++ b/python/JobDispatcher.py @@ -401,18 +401,8 @@ class GATKDispatcher(JobDispatcher): cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt" return cmdToDispatch -class GeneDispatcher(GATKDispatcher): - - def __init__(self,geneNames,jarfile,memory,walker,args,output_directory,reference = None, bams = None, - queues = ["long"], limits = dict([["long",500]]), print_only = False, delay = "0:1:0"): - JobDispatcher.GATKDispatcher.__init__(self,jarfile,memory,walker,args,output_directory,reference,bams,None,queues, - limits,print_only,"space",delay) + def dispatchByGene(self, geneNames): self.genes = RefseqLibrary.getRefseqGenes(geneNames) - - def dispatchByInterval(self,base_limit): - raise JobDispatchError("Dispatch by interval not permitted from GeneDispatcher") - - def dispatchByGene(self): dispatchCommand = self.baseCommand + " -R "+self.reference farmJobs = list() jobNumber = "" @@ -429,3 +419,22 @@ class GeneDispatcher(GATKDispatcher): farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand)) self.dispatchAll_Interval(farmJobs) + + def dispatchByTargetDesign(self,designFile): + self.genes = RefseqLibrary.parseDesignFile(designFile) + dispatchCommand = self.baseCommand + " -R "+self.reference + farmJobs = list() + jobNumber = "" + headerLines = RefseqLibrary.getIntervalHeaderLines() + + if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ): + os.mkdir(self.outputDir+"GATKDispatcher/") + if ( self.bams != None ): + dispatchCommand += " -I "+self.bams + + for gene in self.genes: + jobNumber = "_"+gene.getGeneName() + intervals = gene.getExonIntervals() + farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand)) + + self.dispatchAll_Interval(farmJobs) diff --git a/python/RefseqLibrary.py b/python/RefseqLibrary.py index 9e0c34191..f2e897714 100755 --- a/python/RefseqLibrary.py +++ b/python/RefseqLibrary.py @@ -284,3 +284,28 @@ def getIntervalHeaderLines(): line = whole_exome_file.readline() whole_exome_file.close() return header + +def parseDesignFile(file): + designFile = open(file) + genes = dict() + for line in designFile.readlines(): + if ( line.startswith("TARGET") ): + spline = line.strip().split() + if ( spline[1].startswith("chr") ): + chrom = spline[1] + else: + chrom = "chr"+spline[1] + start = 1 + int(spline[2]) + stop = 1 + int(spline[3]) + gene_name = spline[4].split("#")[1].split("_")[0] + try: + exon_id = spline[4].split(gene_name)[1] + except IndexError: + exon_id = "_".join(spline[5:len(spline)-1]) + exon = Exon(gene_name,exon_id,chrom,start,stop) + if ( gene_name in genes.keys() ): + genes[gene_name].addExon(exon) + else: + genes[gene_name] = Gene(gene_name) + genes[gene_name].addExon(exon) + return genes.values()