Check in so I don't lose this code -- spawning of jobs by genes
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3137 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9f6377f7fb
commit
fab31e1d53
|
|
@ -5,6 +5,7 @@ import time
|
||||||
import re
|
import re
|
||||||
import unittest
|
import unittest
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import RefseqLibrary
|
||||||
|
|
||||||
MAX_UNNAMED_DEPENDENCIES = 0
|
MAX_UNNAMED_DEPENDENCIES = 0
|
||||||
class FarmJob:
|
class FarmJob:
|
||||||
|
|
@ -400,3 +401,31 @@ class GATKDispatcher(JobDispatcher):
|
||||||
cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt"
|
cmdToDispatch = command + " -o "+job_dir+"job"+str(num)+".txt"
|
||||||
return cmdToDispatch
|
return cmdToDispatch
|
||||||
|
|
||||||
|
class GeneDispatcher(GATKDispatcher):
    """GATK dispatcher that spawns one farm job per gene.

    The genes are looked up through ``RefseqLibrary.getRefseqGenes`` at
    construction time; ``dispatchByGene`` then builds one interval job per
    gene from that gene's exon intervals.
    """

    def __init__(self,geneNames,jarfile,memory,walker,args,output_directory,reference = None, bams = None,
                 queues = ["long"], limits = dict([["long",500]]), print_only = False, delay = "0:1:0"):
        """Initialise the underlying GATKDispatcher and resolve ``geneNames``.

        geneNames -- iterable of gene names handed to RefseqLibrary.getRefseqGenes
        All remaining arguments are forwarded unchanged to
        GATKDispatcher.__init__ (see that class for their meaning).
        """
        # Copy the containers so the shared default objects in the signature
        # can never be mutated through an instance.
        queues = list(queues)
        limits = dict(limits)
        # GATKDispatcher is a module-level class here; it is not an attribute
        # of the JobDispatcher class, so it must be referenced directly.
        # NOTE(review): the positional None and "space" are passed through
        # exactly as before -- confirm their meaning against GATKDispatcher.
        GATKDispatcher.__init__(self,jarfile,memory,walker,args,output_directory,reference,bams,None,
                                queues,limits,print_only,"space",delay)
        self.genes = RefseqLibrary.getRefseqGenes(geneNames)

    def dispatchByInterval(self,base_limit):
        """Always raises JobDispatchError: this dispatcher only splits work by gene."""
        raise JobDispatchError("Dispatch by interval not permitted from GeneDispatcher")

    def dispatchByGene(self):
        """Build one interval job per gene and dispatch all of them."""
        dispatchCommand = self.baseCommand + " -R " + self.reference
        headerLines = RefseqLibrary.getIntervalHeaderLines()

        jobDirectory = self.outputDir + "GATKDispatcher/"
        if not os.path.exists(jobDirectory):
            os.mkdir(jobDirectory)
        if self.bams is not None:
            dispatchCommand += " -I " + self.bams

        # The job "number" handed to _buildIntervalJob is the gene name
        # prefixed with an underscore.
        farmJobs = [
            self._buildIntervalJob("_" + gene.getGeneName(), headerLines,
                                   gene.getExonIntervals(), dispatchCommand)
            for gene in self.genes
        ]

        self.dispatchAll_Interval(farmJobs)
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,9 @@ class Interval:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def bedFormat(self):
    """Return this interval as a tab-separated line: chromosome, start, stop, "+", "target_x"."""
    fields = [self.chromosome, str(self.start), str(self.stop), "+", "target_x"]
    return "\t".join(fields)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.chromosome + ":" + str(self.start) + "-" + str(self.stop)
|
return self.chromosome + ":" + str(self.start) + "-" + str(self.stop)
|
||||||
|
|
||||||
|
|
@ -161,6 +164,12 @@ class Gene:
|
||||||
size = size + exon.size()
|
size = size + exon.size()
|
||||||
return size
|
return size
|
||||||
|
|
||||||
|
def getExonIntervals(self):
    """Return a new list holding ``exon.getInterval()`` for every exon in ``self.exons``, in order."""
    return [exon.getInterval() for exon in self.exons]
|
||||||
|
|
||||||
def getBaseCoverage(self):
|
def getBaseCoverage(self):
|
||||||
coverage = 0
|
coverage = 0
|
||||||
for exon in self.exons:
|
for exon in self.exons:
|
||||||
|
|
@ -169,9 +178,15 @@ class Gene:
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
exonString = list()
|
exonString = list()
|
||||||
for exon in exons:
|
for exon in self.exons:
|
||||||
exonString.append(str(exon))
|
exonString.append(str(exon))
|
||||||
return name+"\t"+"\t".join(exonString)
|
return self.name+"\t"+"\t".join(exonString)
|
||||||
|
|
||||||
|
def getGeneName(self):
    """Return the name currently stored on this gene."""
    currentName = self.name
    return currentName
|
||||||
|
|
||||||
|
def setGeneName(self,newName):
    """Replace the name stored on this gene with ``newName``."""
    self.name = newName
|
||||||
|
|
||||||
class ExonRecord(Exon):
|
class ExonRecord(Exon):
|
||||||
def __init__(self,geneName,exonid,chrom,start,stop,prop):
|
def __init__(self,geneName,exonid,chrom,start,stop,prop):
|
||||||
|
|
@ -208,3 +223,64 @@ class CoverageRecord:
|
||||||
|
|
||||||
def getInterval(self):
|
def getInterval(self):
|
||||||
return self.interval
|
return self.interval
|
||||||
|
|
||||||
|
def getRefseqGenes(names,
                   refGenePath="/humgen/gsa-hpprojects/GATK/data/refGene.sorted.txt",
                   refSeqPath="/humgen/gsa-hpprojects/GATK/data/refseq/hg18.ref_gene.cds.bed"):
    """Build Gene objects (with their coding exons) for the requested names.

    names       -- iterable of gene names. Names starting with "NM_" are used
                   directly as transcript identifiers; any other name is
                   looked up in the refGene table. Duplicates are ignored.
    refGenePath -- tab-separated refGene table. Column 1 is taken as the
                   transcript identifier and the fourth-from-last column as
                   the alternate name (presumably the UCSC ``name2`` gene
                   symbol -- confirm against the export).
    refSeqPath  -- BED file whose 4th column has the form
                   ``<transcript>_cds_<n>_...``.

    Returns a list of Gene objects, each renamed to the name the caller asked
    for. Only the first transcript found for a name is used; later ones
    produce a printed warning.

    Raises ValueError if any requested name has no entry.
    """
    # De-duplicate while keeping the caller's order, so that a repeated name
    # cannot be mistaken for a missing gene further down.
    requested = []
    requestedSet = set()
    for name in names:
        if name not in requestedSet:
            requestedSet.add(name)
            requested.append(name)

    if not requested:
        return []

    refSeqGeneNames = []
    refNamesToAltNames = {}
    resolved = set()  # requested names that already have a transcript
    for name in requested:
        if name.startswith("NM_"):
            refSeqGeneNames.append(name)
            refNamesToAltNames[name] = name
            resolved.add(name)

    with open(refGenePath) as refGene:
        for line in refGene:
            spline = line.strip().split("\t")
            if len(spline) < 4:
                # blank or truncated line: no alternate-name column to read
                continue
            altName = spline[len(spline)-4]
            if altName not in requestedSet:
                continue
            if altName in resolved:
                print("WARNING: multiple transcripts found for gene "+altName+" using first available transcript from refseq export")
            else:
                refSeqGeneNames.append(spline[1])
                refNamesToAltNames[spline[1]] = altName
                resolved.add(altName)

    missing = [name for name in requested if name not in resolved]
    if missing:
        raise ValueError("No entry found for genes: "+str(missing))

    # build up the gene list
    genes = {}
    for geneName in refSeqGeneNames:
        genes[geneName] = Gene(geneName)

    with open(refSeqPath) as refSeq:
        for line in refSeq:
            spline = line.strip().split("\t")
            if len(spline) < 4:
                continue
            geneName = spline[3].split("_cds")[0]
            if geneName in genes:
                chrom = spline[0]
                start = int(spline[1])
                stop = int(spline[2])
                exonId = "cds_"+spline[3].split("_cds_")[1].split("_")[0]
                genes[geneName].addExon(Exon(geneName,exonId,chrom,start,stop))

    # Hand the genes back under the names the caller used, not transcript ids.
    toReturn = []
    for gene in genes.values():
        gene.setGeneName(refNamesToAltNames[gene.getGeneName()])
        toReturn.append(gene)
    return toReturn
|
||||||
|
|
||||||
|
|
||||||
|
def getIntervalHeaderLines(intervalListPath="/humgen/gsa-hpprojects/GATK/data/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.hg18.interval_list"):
    """Return the leading "@" header lines of an interval-list file.

    intervalListPath -- file to read; defaults to the whole-exome target list.

    Reading stops at the first line that does not start with "@", so later
    "@" lines are not included. Lines are returned with their trailing
    newline intact.
    """
    header = []
    # The with-block closes the file even if reading raises part-way through.
    with open(intervalListPath) as intervalList:
        for line in intervalList:
            if not line.startswith("@"):
                break
            header.append(line)
    return header
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue