Awesome: JobDispatcher can now dispatch jobs by gene from a target .design file found in /seq/references.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3170 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-04-14 18:17:41 +00:00
parent 04909fa6ad
commit 2e4377b1cf
2 changed files with 45 additions and 11 deletions

View File

@ -401,18 +401,8 @@ class GATKDispatcher(JobDispatcher):
cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt"
return cmdToDispatch
class GeneDispatcher(GATKDispatcher):
def __init__(self,geneNames,jarfile,memory,walker,args,output_directory,reference = None, bams = None,
queues = ["long"], limits = dict([["long",500]]), print_only = False, delay = "0:1:0"):
JobDispatcher.GATKDispatcher.__init__(self,jarfile,memory,walker,args,output_directory,reference,bams,None,queues,
limits,print_only,"space",delay)
def dispatchByGene(self, geneNames):
self.genes = RefseqLibrary.getRefseqGenes(geneNames)
def dispatchByInterval(self,base_limit):
raise JobDispatchError("Dispatch by interval not permitted from GeneDispatcher")
def dispatchByGene(self):
dispatchCommand = self.baseCommand + " -R "+self.reference
farmJobs = list()
jobNumber = ""
@ -429,3 +419,22 @@ class GeneDispatcher(GATKDispatcher):
farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand))
self.dispatchAll_Interval(farmJobs)
def dispatchByTargetDesign(self,designFile):
self.genes = RefseqLibrary.parseDesignFile(designFile)
dispatchCommand = self.baseCommand + " -R "+self.reference
farmJobs = list()
jobNumber = ""
headerLines = RefseqLibrary.getIntervalHeaderLines()
if ( not os.path.exists(self.outputDir+"GATKDispatcher/") ):
os.mkdir(self.outputDir+"GATKDispatcher/")
if ( self.bams != None ):
dispatchCommand += " -I "+self.bams
for gene in self.genes:
jobNumber = "_"+gene.getGeneName()
intervals = gene.getExonIntervals()
farmJobs.append(self._buildIntervalJob(jobNumber,headerLines,intervals,dispatchCommand))
self.dispatchAll_Interval(farmJobs)

View File

@ -284,3 +284,28 @@ def getIntervalHeaderLines():
line = whole_exome_file.readline()
whole_exome_file.close()
return header
def parseDesignFile(file):
designFile = open(file)
genes = dict()
for line in designFile.readlines():
if ( line.startswith("TARGET") ):
spline = line.strip().split()
if ( spline[1].startswith("chr") ):
chrom = spline[1]
else:
chrom = "chr"+spline[1]
start = 1 + int(spline[2])
stop = 1 + int(spline[3])
gene_name = spline[4].split("#")[1].split("_")[0]
try:
exon_id = spline[4].split(gene_name)[1]
except IndexError:
exon_id = "_".join(spline[5:len(spline)-1])
exon = Exon(gene_name,exon_id,chrom,start,stop)
if ( gene_name in genes.keys() ):
genes[gene_name].addExon(exon)
else:
genes[gene_name] = Gene(gene_name)
genes[gene_name].addExon(exon)
return genes.values()