Just some code I want to freeze. If you ever need to estimate the % of bases covered by exon, given an interval list, give it to getTargetedGenes. Not the best name for this function, but I don't expect anyone to use it but me.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3111 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-04-01 20:21:50 +00:00
parent d7880ef7ad
commit 687fd477ff
3 changed files with 428 additions and 3 deletions

View File

@ -197,6 +197,15 @@ class JobDispatcher:
else:
self.maxJobs = -1
self.print_only = print_only
self.startDays = 0
self.startHours = 0
self.startMins = 0
def setInitialDelay(self,delay):
dhm = delay.split(":")
self.startDays = int(dhm[0])
self.startHours = int(dhm[1])
self.startMins = int(dhm[2])
# external accessor, sorts by dependency and ensures user hasn't exceeded the set limits
def dispatchAll(self,farmJobs):
@ -234,9 +243,9 @@ class JobDispatcher:
# spaces jobs out by using the delay command to LSF
def _dispatchWithSpacing(self,farmJobs):
dayHrMin = self.action_string.split(":")
days = 0
hours = 0
mins = 0
days = self.startDays
hours = self.startHours
mins = self.startMins
dispatchBuffer = list()
while ( len(farmJobs) > 0 ):
nextJob = farmJobs.pop(0)

View File

@ -0,0 +1,200 @@
import math
def chr2int(chr):
try:
chr = chr.split("chr")[1]
if ( chr == "M" ):
return 0
if ( chr == "X" ):
return 23
if ( chr == "Y" ):
return 24
return int(chr)
except IndexError:
print("Index error: "+chr)
return -1
def intervalCompare(int1,int2):
chr1 = chr2int(int1.chromosome)
chr2 = chr2int(int2.chromosome)
if ( chr1 < chr2 ):
return -1
elif ( chr1 > chr2 ):
return 1
else:
start1 = int1.start
start2 = int2.start
return start1 - start2
class Interval:
def __init__(self,chrom,start,stop):
self.chromosome = chrom
self.start = start
self.stop = stop
if ( chrom == "NONE" ):
self.isEmpty = True
else:
self.isEmpty = False
self.basesCovered = None
def size(self):
return self.stop - self.start
def overlaps(self,other):
if ( self.chromosome == other.chromosome ):
if ( other.stop < self.stop and not other.start < self.start ):
return True
if ( other.stop > self.start and not other.start > self.stop ):
return True
return False
def intersect(self,other):
if ( self.overlaps(other) ):
return Interval(self.chromosome, max(self.start,other.start), min(self.stop,other.stop))
else:
return Interval("NONE",-1,-1)
def isBefore(self,other):
if ( chr2int(self.chromosome) < chr2int(other.chromosome) ):
return True
elif ( chr2int(self.chromosome) > chr2int(other.chromosome) ):
return False
else:
if ( other.start > self.stop ):
return True
return False
def __str__(self):
return self.chromosome + ":" + str(self.start) + "-" + str(self.stop)
def __cmp__(self,other):
return intervalCompare(self,other)
class CoveredInterval(Interval):
def __init__(self,chrom,start,stop):
Interval.__init__(self,chrom,start,stop)
self.overlappingSubIntervals = list()
def updateCoverage(self,other):
if ( other.overlaps(self) ):
self.overlappingSubIntervals.append(other)
def getBaseCoverage(self):
if ( self.basesCovered is None ):
basesCovered = 0
intersects = list()
self.overlappingSubIntervals.sort(intervalCompare)
for overlap in self.overlappingSubIntervals:
ival = self.intersect(overlap)
intersects.append(ival)
for i in range(len(intersects)):
basesCovered = basesCovered + intersects[i].size()
for j in range(i+1,len(intersects)):
basesCovered = basesCovered - (intersects[i].intersect(intersects[j])).size()
self.basesCovered = basesCovered
return self.basesCovered
def getOverlappingIntervals(self):
return self.overlappingSubIntervals
class Exon:
def __init__(self,geneName,exonid,chrom,start,stop):
#print("Adding exon for "+geneName+" with "+chrom+":"+str(start)+"-"+str(stop))
self.interval = CoveredInterval(chrom,start,stop)
self.gene = geneName
self.id = exonid
def getOverlappingIntervals(self):
return self.interval.getOverlappingIntervals()
def updateCoverage(self,target):
self.interval.updateCoverage(target)
def overlaps(self,target):
return self.interval.overlaps(target)
def isBefore(self,target):
return self.interval.isBefore(target)
def getBaseCoverage(self):
return self.interval.getBaseCoverage()
def size(self):
return self.interval.size()
def getBedEntry(self):
return "\t".join([self.interval.chromosome,str(self.interval.start),str(self.interval.stop),self.gene,self.id,str(self.getCoverageProportion())])
def getCoverageProportion(self):
if ( self.size() > 0 ):
return float(self.getBaseCoverage())/float(self.size())
else:
return -1
def __str__(self):
return self.gene+"("+self.id+") "+str(self.interval)
def getInterval(self):
return self.interval
class Gene:
def __init__(self,name):
self.name = name
self.exons = list()
def addExon(self,exon):
self.exons.append(exon)
def size(self):
size = 0
for exon in self.exons:
size = size + exon.size()
return size
def getBaseCoverage(self):
coverage = 0
for exon in self.exons:
coverage = coverage + exon.getBaseCoverage()
return coverage
def __str__(self):
exonString = list()
for exon in exons:
exonString.append(str(exon))
return name+"\t"+"\t".join(exonString)
class ExonRecord(Exon):
def __init__(self,geneName,exonid,chrom,start,stop,prop):
Exon.__init__(self,geneName,exonid,chrom,start,stop)
self.coverageProportion = prop
self.baseCoverage = math.ceil(prop*self.size())
self.records = list()
def getBaseCoverage(self):
return self.baseCoverage
def getCoverageProportion(self):
return self.coverageProportion
def addRecord(self,record):
self.records.append(record)
def getData(self):
toRet = ""
for record in self.records:
toRet += record.dataString()+"\t"
return toRet
class CoverageRecord:
def __init__(self,chrom,start,stop,sampleName,mean,median,q1,q3):
self.interval = Interval(chrom,start,stop) #indexing issues
self.sampleName = sampleName
self.mean = mean
self.median = median
self.q1 = q1
self.q3 = q3
def dataString(self):
return "\t".join([self.sampleName,str(self.mean),str(self.median),str(self.q1),str(self.q3)])
def getInterval(self):
return self.interval

View File

@ -0,0 +1,216 @@
import RefseqLibrary as exLib
def chr2int(chr):
chr = chr.split("chr")[1]
if ( chr == "M" ):
return 0
if ( chr == "X" ):
return 23
if ( chr == "Y" ):
return 24
return int(chr)
def intervalCompareExon(exon1,exon2):
int1 = exon1.interval
int2 = exon2.interval
return intervalCompare(int1,int2)
def intervalCompareExonTarget(exon1,target):
int1 = exon1.interval
return intervalCompare(int1,target)
def intervalCompare(int1,int2):
chr1 = chr2int(int1.chromosome)
chr2 = chr2int(int2.chromosome)
if ( chr1 < chr2 ):
return -1
elif ( chr1 > chr2 ):
return 1
else:
start1 = int1.start
start2 = int2.start
return start1 - start2
test_first = exLib.Interval("chr2",100000,100001)
test_second = exLib.Interval("chr2",100001,100004)
shouldBeNegative = intervalCompare(test_first,test_second)
if ( not shouldBeNegative < 0 ):
raise ValueError("Interval compare is not performing properly.")
test_first = exLib.CoveredInterval("chr2",10400,10800)
test_first.updateCoverage(exLib.Interval("chr2",10387,10420))
test_first.updateCoverage(exLib.Interval("chr2",10410,10560))
test_first.updateCoverage(exLib.Interval("chr2",10600,10820))
test_first.updateCoverage(exLib.Interval("chr2",10890,10990))
if ( test_first.getBaseCoverage() != 360 ):
print("Base coverage of test was "+str(test_first.getBaseCoverage())+" (400 expected)")
print("Testing intersection....")
g = exLib.Interval("chr2",10387,10420).intersect(exLib.Interval("chr2",10410,10560))
if ( g.chromosome != "chr2" or g.start != 10410 or g.stop != 10420 ):
print("Bad intersection! "+str(g))
if ( g.size() != 10 ):
print("Size not performing correctly")
raise ValueError("Covered interval is not performing properly")
refseq_exons = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.cds.bed")
refseq_utr3 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr3.bed")
refseq_utr5 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr5.bed")
cancer_6000 = open("/seq/references/HybSelOligos/tcga_6k_genes.design")
counter = 0
exons = list()
def isNotValid(rline):
try:
n = chr2int(line.split()[0])
except ValueError:
return True
return False
def parseLine(rline,token):
spline = line.strip().split()
chrom = spline[0]
start = int(spline[1])
stop = int(spline[2])
longname = spline[3]
geneName = longname.split("_"+token)[0]
exonID = longname.split("_"+token+"_")[1].split("_chr")[0]+"_"+"_".join([token,longname[len(longname)-1]])
return exLib.Exon(geneName,exonID,chrom,start,stop)
for line in refseq_exons.readlines():
if ( isNotValid(line) ):
continue
counter = 1 + counter
exons.append(parseLine(line,"cds"))
if ( counter % 25000 == 0 ):
print(str(counter)+" exons added")
for line in refseq_utr3:
if ( isNotValid(line) ):
continue
counter = 1 + counter
exons.append(parseLine(line,"utr3"))
if ( counter % 25000 == 0 ):
print(str(counter)+" exons added")
for line in refseq_utr5:
if ( isNotValid(line) ):
continue
counter = 1 + counter
exons.append(parseLine(line,"utr5"))
if ( counter % 25000 == 0 ):
print(str(counter)+" exons added")
for line in cancer_6000:
if ( not line.startswith("TARGET") ):
continue
counter = 1 + counter
spline = line.strip().split()
chrom = "chr"+spline[1]
start = int(spline[2])
stop = int(spline[3])
longname = spline[4]
gene_name = longname.split("_")[0].split("#")[1]
exon_id = "tcga_"+longname.split("_")[1]
exons.append(exLib.Exon(gene_name,exon_id,chrom,start,stop))
if ( counter % 25000 == 0 ):
print(str(counter)+" exons added")
exons.sort(intervalCompareExon)
print(str(len(exons))+" exons added")
start_index = 0
counter = 0
import sys
interval_list = sys.argv[1]
headerlines = list()
for line in open(interval_list).readlines():
if ( not line.startswith("@") and not line.startswith("#") ):
counter = 1 + counter
s = line.split()
target = exLib.Interval(s[0],int(s[1]),int(s[2]))
while ( exons[start_index].isBefore(target) and start_index < len(exons)-1):
start_index = 1 + start_index
index = start_index
while(exons[index].overlaps(target) and index < len(exons) - 1):
exons[index].updateCoverage(target)
index = 1 + index
if ( counter % 25000 == 0 ):
print("Read "+str(counter)+" lines from interval list")
#if ( counter % 5000 == 0 ):
# break
else:
if ( line.startswith("@") ):
headerlines.append(line)
print("Done reading interval file. Creating genes.")
exons.sort(key = lambda x: x.gene)
genes = list()
counter = 0
prevName = "@T#h12_3"
nGenes = 0
for exon in exons:
counter = 1 + counter
if ( exon.gene != prevName ):
genes.append(exLib.Gene(exon.gene))
nGenes = 1 + nGenes
genes[nGenes - 1].addExon(exon)
prevName = exon.gene
else:
genes[nGenes - 1].addExon(exon)
if ( counter % 20000 == 0 ):
print("Processed "+str(counter)+" exons. "+str(len(genes))+" gene records created.")
def byProportion(gene1,gene2):
gene1targets = gene1.size()
gene1cvg = gene1.getBaseCoverage()
gene1prop = float(gene1cvg)/float(gene1targets)
gene2targets = gene2.size()
gene2cvg = gene2.getBaseCoverage()
gene2prop = float(gene2cvg)/float(gene2targets)
if ( gene1prop > gene2prop ):
return -1
elif ( gene1prop < gene2prop ):
return 1
return 0
genes.sort(byProportion)
print("Writing coverage table...")
coverage = open("geneCoverage.txt",'w')
counter = 0
exonsInWellCoveredGenes = list()
for gene in genes:
try:
coverage.write(gene.name+"\t"+str(float(gene.getBaseCoverage())/float(gene.size()))+"\t"+str(gene.size())+"\t"+str(gene.getBaseCoverage()))
if ( float(gene.getBaseCoverage())/float(gene.size()) > 0.8 ):
for exon in gene.exons:
exonsInWellCoveredGenes.append(exon)
coverage.write("\n")
except ZeroDivisionError:
continue
counter = 1 + counter
if ( counter % 5000 == 0 ):
print("Written: "+str(counter)+" genes.")
coverage.close()
print("Sorting exons by start location...")
exons.sort(intervalCompareExon)
exonsInWellCoveredGenes.sort(intervalCompareExon)
print("Writing all exon targets...")
exon_targets = open("exonTargets.interval_list",'w')
exon_targets.write("".join(headerlines))
for exon in exons:
exon_targets.write(exon.getBedEntry()+"\n")
exon_targets.close()
print("Writing exons for well-covered genes...")
good_exon_targets = open("exonTargets_well_covered_genes.interval_list",'w')
good_exon_targets.write("".join(headerlines))
for exon in exonsInWellCoveredGenes:
good_exon_targets.write(exon.getBedEntry()+"\n")
good_exon_targets.close()
print("Writing all exons and their overlapping targets...")
debug = open("overlapping_targets.txt",'w')
for exon in exons:
debug.write(exon.id+"\t"+str(exon.getCoverageProportion())+"\t"+str(exon.interval)+"\t")
intervals = list()
for inter in exon.getOverlappingIntervals():
intervals.append(str(inter))
debug.write("\t".join(intervals)+"\n")