Just some code I want to freeze. If you ever need to estimate the % of bases covered by exon, given an interval list, give it to getTargetedGenes. Not the best name for this function, but I don't expect anyone to use it but me.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3111 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d7880ef7ad
commit
687fd477ff
|
|
@ -197,6 +197,15 @@ class JobDispatcher:
|
|||
else:
|
||||
self.maxJobs = -1
|
||||
self.print_only = print_only
|
||||
self.startDays = 0
|
||||
self.startHours = 0
|
||||
self.startMins = 0
|
||||
|
||||
def setInitialDelay(self,delay):
|
||||
dhm = delay.split(":")
|
||||
self.startDays = int(dhm[0])
|
||||
self.startHours = int(dhm[1])
|
||||
self.startMins = int(dhm[2])
|
||||
|
||||
# external accessor, sorts by dependency and ensures user hasn't exceeded the set limits
|
||||
def dispatchAll(self,farmJobs):
|
||||
|
|
@ -234,9 +243,9 @@ class JobDispatcher:
|
|||
# spaces jobs out by using the delay command to LSF
|
||||
def _dispatchWithSpacing(self,farmJobs):
|
||||
dayHrMin = self.action_string.split(":")
|
||||
days = 0
|
||||
hours = 0
|
||||
mins = 0
|
||||
days = self.startDays
|
||||
hours = self.startHours
|
||||
mins = self.startMins
|
||||
dispatchBuffer = list()
|
||||
while ( len(farmJobs) > 0 ):
|
||||
nextJob = farmJobs.pop(0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,200 @@
|
|||
import math
|
||||
def chr2int(chr):
|
||||
try:
|
||||
chr = chr.split("chr")[1]
|
||||
if ( chr == "M" ):
|
||||
return 0
|
||||
if ( chr == "X" ):
|
||||
return 23
|
||||
if ( chr == "Y" ):
|
||||
return 24
|
||||
return int(chr)
|
||||
except IndexError:
|
||||
print("Index error: "+chr)
|
||||
return -1
|
||||
|
||||
def intervalCompare(int1,int2):
|
||||
chr1 = chr2int(int1.chromosome)
|
||||
chr2 = chr2int(int2.chromosome)
|
||||
if ( chr1 < chr2 ):
|
||||
return -1
|
||||
elif ( chr1 > chr2 ):
|
||||
return 1
|
||||
else:
|
||||
start1 = int1.start
|
||||
start2 = int2.start
|
||||
return start1 - start2
|
||||
|
||||
class Interval:
|
||||
def __init__(self,chrom,start,stop):
|
||||
self.chromosome = chrom
|
||||
self.start = start
|
||||
self.stop = stop
|
||||
if ( chrom == "NONE" ):
|
||||
self.isEmpty = True
|
||||
else:
|
||||
self.isEmpty = False
|
||||
self.basesCovered = None
|
||||
|
||||
def size(self):
|
||||
return self.stop - self.start
|
||||
|
||||
def overlaps(self,other):
|
||||
if ( self.chromosome == other.chromosome ):
|
||||
if ( other.stop < self.stop and not other.start < self.start ):
|
||||
return True
|
||||
if ( other.stop > self.start and not other.start > self.stop ):
|
||||
return True
|
||||
return False
|
||||
|
||||
def intersect(self,other):
|
||||
if ( self.overlaps(other) ):
|
||||
return Interval(self.chromosome, max(self.start,other.start), min(self.stop,other.stop))
|
||||
else:
|
||||
return Interval("NONE",-1,-1)
|
||||
|
||||
def isBefore(self,other):
|
||||
if ( chr2int(self.chromosome) < chr2int(other.chromosome) ):
|
||||
return True
|
||||
elif ( chr2int(self.chromosome) > chr2int(other.chromosome) ):
|
||||
return False
|
||||
else:
|
||||
if ( other.start > self.stop ):
|
||||
return True
|
||||
return False
|
||||
|
||||
def __str__(self):
|
||||
return self.chromosome + ":" + str(self.start) + "-" + str(self.stop)
|
||||
|
||||
def __cmp__(self,other):
|
||||
return intervalCompare(self,other)
|
||||
|
||||
class CoveredInterval(Interval):
|
||||
def __init__(self,chrom,start,stop):
|
||||
Interval.__init__(self,chrom,start,stop)
|
||||
self.overlappingSubIntervals = list()
|
||||
|
||||
def updateCoverage(self,other):
|
||||
if ( other.overlaps(self) ):
|
||||
self.overlappingSubIntervals.append(other)
|
||||
|
||||
def getBaseCoverage(self):
|
||||
if ( self.basesCovered is None ):
|
||||
basesCovered = 0
|
||||
intersects = list()
|
||||
self.overlappingSubIntervals.sort(intervalCompare)
|
||||
for overlap in self.overlappingSubIntervals:
|
||||
ival = self.intersect(overlap)
|
||||
intersects.append(ival)
|
||||
for i in range(len(intersects)):
|
||||
basesCovered = basesCovered + intersects[i].size()
|
||||
for j in range(i+1,len(intersects)):
|
||||
basesCovered = basesCovered - (intersects[i].intersect(intersects[j])).size()
|
||||
|
||||
self.basesCovered = basesCovered
|
||||
return self.basesCovered
|
||||
|
||||
def getOverlappingIntervals(self):
|
||||
return self.overlappingSubIntervals
|
||||
|
||||
class Exon:
|
||||
def __init__(self,geneName,exonid,chrom,start,stop):
|
||||
#print("Adding exon for "+geneName+" with "+chrom+":"+str(start)+"-"+str(stop))
|
||||
self.interval = CoveredInterval(chrom,start,stop)
|
||||
self.gene = geneName
|
||||
self.id = exonid
|
||||
|
||||
def getOverlappingIntervals(self):
|
||||
return self.interval.getOverlappingIntervals()
|
||||
|
||||
def updateCoverage(self,target):
|
||||
self.interval.updateCoverage(target)
|
||||
|
||||
def overlaps(self,target):
|
||||
return self.interval.overlaps(target)
|
||||
|
||||
def isBefore(self,target):
|
||||
return self.interval.isBefore(target)
|
||||
|
||||
def getBaseCoverage(self):
|
||||
return self.interval.getBaseCoverage()
|
||||
|
||||
def size(self):
|
||||
return self.interval.size()
|
||||
|
||||
def getBedEntry(self):
|
||||
return "\t".join([self.interval.chromosome,str(self.interval.start),str(self.interval.stop),self.gene,self.id,str(self.getCoverageProportion())])
|
||||
|
||||
def getCoverageProportion(self):
|
||||
if ( self.size() > 0 ):
|
||||
return float(self.getBaseCoverage())/float(self.size())
|
||||
else:
|
||||
return -1
|
||||
|
||||
def __str__(self):
|
||||
return self.gene+"("+self.id+") "+str(self.interval)
|
||||
|
||||
def getInterval(self):
|
||||
return self.interval
|
||||
|
||||
class Gene:
|
||||
def __init__(self,name):
|
||||
self.name = name
|
||||
self.exons = list()
|
||||
|
||||
def addExon(self,exon):
|
||||
self.exons.append(exon)
|
||||
|
||||
def size(self):
|
||||
size = 0
|
||||
for exon in self.exons:
|
||||
size = size + exon.size()
|
||||
return size
|
||||
|
||||
def getBaseCoverage(self):
|
||||
coverage = 0
|
||||
for exon in self.exons:
|
||||
coverage = coverage + exon.getBaseCoverage()
|
||||
return coverage
|
||||
|
||||
def __str__(self):
|
||||
exonString = list()
|
||||
for exon in exons:
|
||||
exonString.append(str(exon))
|
||||
return name+"\t"+"\t".join(exonString)
|
||||
|
||||
class ExonRecord(Exon):
|
||||
def __init__(self,geneName,exonid,chrom,start,stop,prop):
|
||||
Exon.__init__(self,geneName,exonid,chrom,start,stop)
|
||||
self.coverageProportion = prop
|
||||
self.baseCoverage = math.ceil(prop*self.size())
|
||||
self.records = list()
|
||||
|
||||
def getBaseCoverage(self):
|
||||
return self.baseCoverage
|
||||
|
||||
def getCoverageProportion(self):
|
||||
return self.coverageProportion
|
||||
|
||||
def addRecord(self,record):
|
||||
self.records.append(record)
|
||||
|
||||
def getData(self):
|
||||
toRet = ""
|
||||
for record in self.records:
|
||||
toRet += record.dataString()+"\t"
|
||||
return toRet
|
||||
class CoverageRecord:
|
||||
def __init__(self,chrom,start,stop,sampleName,mean,median,q1,q3):
|
||||
self.interval = Interval(chrom,start,stop) #indexing issues
|
||||
self.sampleName = sampleName
|
||||
self.mean = mean
|
||||
self.median = median
|
||||
self.q1 = q1
|
||||
self.q3 = q3
|
||||
|
||||
def dataString(self):
|
||||
return "\t".join([self.sampleName,str(self.mean),str(self.median),str(self.q1),str(self.q3)])
|
||||
|
||||
def getInterval(self):
|
||||
return self.interval
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
import RefseqLibrary as exLib
|
||||
|
||||
def chr2int(chr):
|
||||
chr = chr.split("chr")[1]
|
||||
if ( chr == "M" ):
|
||||
return 0
|
||||
if ( chr == "X" ):
|
||||
return 23
|
||||
if ( chr == "Y" ):
|
||||
return 24
|
||||
return int(chr)
|
||||
|
||||
def intervalCompareExon(exon1,exon2):
|
||||
int1 = exon1.interval
|
||||
int2 = exon2.interval
|
||||
return intervalCompare(int1,int2)
|
||||
|
||||
def intervalCompareExonTarget(exon1,target):
|
||||
int1 = exon1.interval
|
||||
return intervalCompare(int1,target)
|
||||
|
||||
def intervalCompare(int1,int2):
|
||||
chr1 = chr2int(int1.chromosome)
|
||||
chr2 = chr2int(int2.chromosome)
|
||||
if ( chr1 < chr2 ):
|
||||
return -1
|
||||
elif ( chr1 > chr2 ):
|
||||
return 1
|
||||
else:
|
||||
start1 = int1.start
|
||||
start2 = int2.start
|
||||
return start1 - start2
|
||||
|
||||
test_first = exLib.Interval("chr2",100000,100001)
|
||||
test_second = exLib.Interval("chr2",100001,100004)
|
||||
shouldBeNegative = intervalCompare(test_first,test_second)
|
||||
if ( not shouldBeNegative < 0 ):
|
||||
raise ValueError("Interval compare is not performing properly.")
|
||||
|
||||
test_first = exLib.CoveredInterval("chr2",10400,10800)
|
||||
test_first.updateCoverage(exLib.Interval("chr2",10387,10420))
|
||||
test_first.updateCoverage(exLib.Interval("chr2",10410,10560))
|
||||
test_first.updateCoverage(exLib.Interval("chr2",10600,10820))
|
||||
test_first.updateCoverage(exLib.Interval("chr2",10890,10990))
|
||||
if ( test_first.getBaseCoverage() != 360 ):
|
||||
print("Base coverage of test was "+str(test_first.getBaseCoverage())+" (400 expected)")
|
||||
print("Testing intersection....")
|
||||
g = exLib.Interval("chr2",10387,10420).intersect(exLib.Interval("chr2",10410,10560))
|
||||
if ( g.chromosome != "chr2" or g.start != 10410 or g.stop != 10420 ):
|
||||
print("Bad intersection! "+str(g))
|
||||
if ( g.size() != 10 ):
|
||||
print("Size not performing correctly")
|
||||
raise ValueError("Covered interval is not performing properly")
|
||||
|
||||
refseq_exons = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.cds.bed")
|
||||
refseq_utr3 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr3.bed")
|
||||
refseq_utr5 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr5.bed")
|
||||
cancer_6000 = open("/seq/references/HybSelOligos/tcga_6k_genes.design")
|
||||
|
||||
counter = 0
|
||||
exons = list()
|
||||
def isNotValid(rline):
|
||||
try:
|
||||
n = chr2int(line.split()[0])
|
||||
except ValueError:
|
||||
return True
|
||||
return False
|
||||
|
||||
def parseLine(rline,token):
|
||||
spline = line.strip().split()
|
||||
chrom = spline[0]
|
||||
start = int(spline[1])
|
||||
stop = int(spline[2])
|
||||
longname = spline[3]
|
||||
geneName = longname.split("_"+token)[0]
|
||||
exonID = longname.split("_"+token+"_")[1].split("_chr")[0]+"_"+"_".join([token,longname[len(longname)-1]])
|
||||
return exLib.Exon(geneName,exonID,chrom,start,stop)
|
||||
|
||||
for line in refseq_exons.readlines():
|
||||
if ( isNotValid(line) ):
|
||||
continue
|
||||
counter = 1 + counter
|
||||
exons.append(parseLine(line,"cds"))
|
||||
if ( counter % 25000 == 0 ):
|
||||
print(str(counter)+" exons added")
|
||||
|
||||
for line in refseq_utr3:
|
||||
if ( isNotValid(line) ):
|
||||
continue
|
||||
counter = 1 + counter
|
||||
exons.append(parseLine(line,"utr3"))
|
||||
if ( counter % 25000 == 0 ):
|
||||
print(str(counter)+" exons added")
|
||||
|
||||
for line in refseq_utr5:
|
||||
if ( isNotValid(line) ):
|
||||
continue
|
||||
counter = 1 + counter
|
||||
exons.append(parseLine(line,"utr5"))
|
||||
if ( counter % 25000 == 0 ):
|
||||
print(str(counter)+" exons added")
|
||||
|
||||
for line in cancer_6000:
|
||||
if ( not line.startswith("TARGET") ):
|
||||
continue
|
||||
counter = 1 + counter
|
||||
spline = line.strip().split()
|
||||
chrom = "chr"+spline[1]
|
||||
start = int(spline[2])
|
||||
stop = int(spline[3])
|
||||
longname = spline[4]
|
||||
gene_name = longname.split("_")[0].split("#")[1]
|
||||
exon_id = "tcga_"+longname.split("_")[1]
|
||||
exons.append(exLib.Exon(gene_name,exon_id,chrom,start,stop))
|
||||
if ( counter % 25000 == 0 ):
|
||||
print(str(counter)+" exons added")
|
||||
|
||||
exons.sort(intervalCompareExon)
|
||||
print(str(len(exons))+" exons added")
|
||||
start_index = 0
|
||||
counter = 0
|
||||
import sys
|
||||
interval_list = sys.argv[1]
|
||||
headerlines = list()
|
||||
for line in open(interval_list).readlines():
|
||||
if ( not line.startswith("@") and not line.startswith("#") ):
|
||||
counter = 1 + counter
|
||||
s = line.split()
|
||||
target = exLib.Interval(s[0],int(s[1]),int(s[2]))
|
||||
while ( exons[start_index].isBefore(target) and start_index < len(exons)-1):
|
||||
start_index = 1 + start_index
|
||||
index = start_index
|
||||
while(exons[index].overlaps(target) and index < len(exons) - 1):
|
||||
exons[index].updateCoverage(target)
|
||||
index = 1 + index
|
||||
if ( counter % 25000 == 0 ):
|
||||
print("Read "+str(counter)+" lines from interval list")
|
||||
#if ( counter % 5000 == 0 ):
|
||||
# break
|
||||
else:
|
||||
if ( line.startswith("@") ):
|
||||
headerlines.append(line)
|
||||
|
||||
print("Done reading interval file. Creating genes.")
|
||||
exons.sort(key = lambda x: x.gene)
|
||||
genes = list()
|
||||
counter = 0
|
||||
prevName = "@T#h12_3"
|
||||
nGenes = 0
|
||||
for exon in exons:
|
||||
counter = 1 + counter
|
||||
if ( exon.gene != prevName ):
|
||||
genes.append(exLib.Gene(exon.gene))
|
||||
nGenes = 1 + nGenes
|
||||
genes[nGenes - 1].addExon(exon)
|
||||
prevName = exon.gene
|
||||
else:
|
||||
genes[nGenes - 1].addExon(exon)
|
||||
if ( counter % 20000 == 0 ):
|
||||
print("Processed "+str(counter)+" exons. "+str(len(genes))+" gene records created.")
|
||||
|
||||
def byProportion(gene1,gene2):
|
||||
gene1targets = gene1.size()
|
||||
gene1cvg = gene1.getBaseCoverage()
|
||||
gene1prop = float(gene1cvg)/float(gene1targets)
|
||||
gene2targets = gene2.size()
|
||||
gene2cvg = gene2.getBaseCoverage()
|
||||
gene2prop = float(gene2cvg)/float(gene2targets)
|
||||
if ( gene1prop > gene2prop ):
|
||||
return -1
|
||||
elif ( gene1prop < gene2prop ):
|
||||
return 1
|
||||
return 0
|
||||
|
||||
genes.sort(byProportion)
|
||||
|
||||
print("Writing coverage table...")
|
||||
coverage = open("geneCoverage.txt",'w')
|
||||
counter = 0
|
||||
exonsInWellCoveredGenes = list()
|
||||
for gene in genes:
|
||||
try:
|
||||
coverage.write(gene.name+"\t"+str(float(gene.getBaseCoverage())/float(gene.size()))+"\t"+str(gene.size())+"\t"+str(gene.getBaseCoverage()))
|
||||
if ( float(gene.getBaseCoverage())/float(gene.size()) > 0.8 ):
|
||||
for exon in gene.exons:
|
||||
exonsInWellCoveredGenes.append(exon)
|
||||
coverage.write("\n")
|
||||
except ZeroDivisionError:
|
||||
continue
|
||||
counter = 1 + counter
|
||||
if ( counter % 5000 == 0 ):
|
||||
print("Written: "+str(counter)+" genes.")
|
||||
coverage.close()
|
||||
print("Sorting exons by start location...")
|
||||
exons.sort(intervalCompareExon)
|
||||
exonsInWellCoveredGenes.sort(intervalCompareExon)
|
||||
print("Writing all exon targets...")
|
||||
exon_targets = open("exonTargets.interval_list",'w')
|
||||
exon_targets.write("".join(headerlines))
|
||||
for exon in exons:
|
||||
exon_targets.write(exon.getBedEntry()+"\n")
|
||||
exon_targets.close()
|
||||
print("Writing exons for well-covered genes...")
|
||||
good_exon_targets = open("exonTargets_well_covered_genes.interval_list",'w')
|
||||
good_exon_targets.write("".join(headerlines))
|
||||
for exon in exonsInWellCoveredGenes:
|
||||
good_exon_targets.write(exon.getBedEntry()+"\n")
|
||||
good_exon_targets.close()
|
||||
print("Writing all exons and their overlapping targets...")
|
||||
debug = open("overlapping_targets.txt",'w')
|
||||
for exon in exons:
|
||||
debug.write(exon.id+"\t"+str(exon.getCoverageProportion())+"\t"+str(exon.interval)+"\t")
|
||||
intervals = list()
|
||||
for inter in exon.getOverlappingIntervals():
|
||||
intervals.append(str(inter))
|
||||
debug.write("\t".join(intervals)+"\n")
|
||||
Loading…
Reference in New Issue