diff --git a/python/JobDispatcher.py b/python/JobDispatcher.py index 16967e7d7..902a2b436 100644 --- a/python/JobDispatcher.py +++ b/python/JobDispatcher.py @@ -197,6 +197,15 @@ class JobDispatcher: else: self.maxJobs = -1 self.print_only = print_only + self.startDays = 0 + self.startHours = 0 + self.startMins = 0 + + def setInitialDelay(self,delay): + dhm = delay.split(":") + self.startDays = int(dhm[0]) + self.startHours = int(dhm[1]) + self.startMins = int(dhm[2]) # external accessor, sorts by dependency and ensures user hasn't exceeded the set limits def dispatchAll(self,farmJobs): @@ -234,9 +243,9 @@ class JobDispatcher: # spaces jobs out by using the delay command to LSF def _dispatchWithSpacing(self,farmJobs): dayHrMin = self.action_string.split(":") - days = 0 - hours = 0 - mins = 0 + days = self.startDays + hours = self.startHours + mins = self.startMins dispatchBuffer = list() while ( len(farmJobs) > 0 ): nextJob = farmJobs.pop(0) diff --git a/python/RefseqLibrary.py b/python/RefseqLibrary.py new file mode 100755 index 000000000..e88ef3a96 --- /dev/null +++ b/python/RefseqLibrary.py @@ -0,0 +1,200 @@ +import math +def chr2int(chr): + try: + chr = chr.split("chr")[1] + if ( chr == "M" ): + return 0 + if ( chr == "X" ): + return 23 + if ( chr == "Y" ): + return 24 + return int(chr) + except IndexError: + print("Index error: "+chr) + return -1 + +def intervalCompare(int1,int2): + chr1 = chr2int(int1.chromosome) + chr2 = chr2int(int2.chromosome) + if ( chr1 < chr2 ): + return -1 + elif ( chr1 > chr2 ): + return 1 + else: + start1 = int1.start + start2 = int2.start + return start1 - start2 + +class Interval: + def __init__(self,chrom,start,stop): + self.chromosome = chrom + self.start = start + self.stop = stop + if ( chrom == "NONE" ): + self.isEmpty = True + else: + self.isEmpty = False + self.basesCovered = None + + def size(self): + return self.stop - self.start + + def overlaps(self,other): + if ( self.chromosome == other.chromosome ): + if ( other.stop < self.stop and not other.start < self.start ): + return True + if ( other.stop > self.start and not other.start > self.stop ): + return True + return False + + def intersect(self,other): + if ( self.overlaps(other) ): + return Interval(self.chromosome, max(self.start,other.start), min(self.stop,other.stop)) + else: + return Interval("NONE",-1,-1) + + def isBefore(self,other): + if ( chr2int(self.chromosome) < chr2int(other.chromosome) ): + return True + elif ( chr2int(self.chromosome) > chr2int(other.chromosome) ): + return False + else: + if ( other.start > self.stop ): + return True + return False + + def __str__(self): + return self.chromosome + ":" + str(self.start) + "-" + str(self.stop) + + def __cmp__(self,other): + return intervalCompare(self,other) + +class CoveredInterval(Interval): + def __init__(self,chrom,start,stop): + Interval.__init__(self,chrom,start,stop) + self.overlappingSubIntervals = list() + + def updateCoverage(self,other): + if ( other.overlaps(self) ): + self.overlappingSubIntervals.append(other) + + def getBaseCoverage(self): + if ( self.basesCovered is None ): + basesCovered = 0 + intersects = list() + self.overlappingSubIntervals.sort(intervalCompare) + for overlap in self.overlappingSubIntervals: + ival = self.intersect(overlap) + intersects.append(ival) + for i in range(len(intersects)): + basesCovered = basesCovered + intersects[i].size() + for j in range(i+1,len(intersects)): + basesCovered = basesCovered - (intersects[i].intersect(intersects[j])).size() + + self.basesCovered = basesCovered + return self.basesCovered + + def getOverlappingIntervals(self): + return self.overlappingSubIntervals + +class Exon: + def __init__(self,geneName,exonid,chrom,start,stop): + #print("Adding exon for "+geneName+" with "+chrom+":"+str(start)+"-"+str(stop)) + self.interval = CoveredInterval(chrom,start,stop) + self.gene = geneName + self.id = exonid + + def getOverlappingIntervals(self): + return self.interval.getOverlappingIntervals() + + def updateCoverage(self,target): + self.interval.updateCoverage(target) + + def overlaps(self,target): + return self.interval.overlaps(target) + + def isBefore(self,target): + return self.interval.isBefore(target) + + def getBaseCoverage(self): + return self.interval.getBaseCoverage() + + def size(self): + return self.interval.size() + + def getBedEntry(self): + return "\t".join([self.interval.chromosome,str(self.interval.start),str(self.interval.stop),self.gene,self.id,str(self.getCoverageProportion())]) + + def getCoverageProportion(self): + if ( self.size() > 0 ): + return float(self.getBaseCoverage())/float(self.size()) + else: + return -1 + + def __str__(self): + return self.gene+"("+self.id+") "+str(self.interval) + + def getInterval(self): + return self.interval + +class Gene: + def __init__(self,name): + self.name = name + self.exons = list() + + def addExon(self,exon): + self.exons.append(exon) + + def size(self): + size = 0 + for exon in self.exons: + size = size + exon.size() + return size + + def getBaseCoverage(self): + coverage = 0 + for exon in self.exons: + coverage = coverage + exon.getBaseCoverage() + return coverage + + def __str__(self): + exonString = list() + for exon in exons: + exonString.append(str(exon)) + return name+"\t"+"\t".join(exonString) + +class ExonRecord(Exon): + def __init__(self,geneName,exonid,chrom,start,stop,prop): + Exon.__init__(self,geneName,exonid,chrom,start,stop) + self.coverageProportion = prop + self.baseCoverage = math.ceil(prop*self.size()) + self.records = list() + + def getBaseCoverage(self): + return self.baseCoverage + + def getCoverageProportion(self): + return self.coverageProportion + + def addRecord(self,record): + self.records.append(record) + + def getData(self): + toRet = "" + for record in self.records: + toRet += record.dataString()+"\t" + return toRet +class CoverageRecord: + def __init__(self,chrom,start,stop,sampleName,mean,median,q1,q3): + self.interval = Interval(chrom,start,stop) #indexing issues + self.sampleName = sampleName + self.mean = mean + self.median = median + self.q1 = q1 + self.q3 = q3 + + def dataString(self): + return "\t".join([self.sampleName,str(self.mean),str(self.median),str(self.q1),str(self.q3)]) + + def getInterval(self): + return self.interval diff --git a/python/getTargetedGenes.py b/python/getTargetedGenes.py new file mode 100755 index 000000000..e7725fba9 --- /dev/null +++ b/python/getTargetedGenes.py @@ -0,0 +1,216 @@ +import RefseqLibrary as exLib + +def chr2int(chr): + chr = chr.split("chr")[1] + if ( chr == "M" ): + return 0 + if ( chr == "X" ): + return 23 + if ( chr == "Y" ): + return 24 + return int(chr) + +def intervalCompareExon(exon1,exon2): + int1 = exon1.interval + int2 = exon2.interval + return intervalCompare(int1,int2) + +def intervalCompareExonTarget(exon1,target): + int1 = exon1.interval + return intervalCompare(int1,target) + +def intervalCompare(int1,int2): + chr1 = chr2int(int1.chromosome) + chr2 = chr2int(int2.chromosome) + if ( chr1 < chr2 ): + return -1 + elif ( chr1 > chr2 ): + return 1 + else: + start1 = int1.start + start2 = int2.start + return start1 - start2 + +test_first = exLib.Interval("chr2",100000,100001) +test_second = exLib.Interval("chr2",100001,100004) +shouldBeNegative = intervalCompare(test_first,test_second) +if ( not shouldBeNegative < 0 ): + raise ValueError("Interval compare is not performing properly.") + +test_first = exLib.CoveredInterval("chr2",10400,10800) +test_first.updateCoverage(exLib.Interval("chr2",10387,10420)) +test_first.updateCoverage(exLib.Interval("chr2",10410,10560)) +test_first.updateCoverage(exLib.Interval("chr2",10600,10820)) +test_first.updateCoverage(exLib.Interval("chr2",10890,10990)) +if ( test_first.getBaseCoverage() != 360 ): + print("Base coverage of test was "+str(test_first.getBaseCoverage())+" (400 expected)") + print("Testing intersection....") + g = exLib.Interval("chr2",10387,10420).intersect(exLib.Interval("chr2",10410,10560)) + if ( g.chromosome != "chr2" or g.start != 10410 or g.stop != 10420 ): + print("Bad intersection! "+str(g)) + if ( g.size() != 10 ): + print("Size not performing correctly") + raise ValueError("Covered interval is not performing properly") + +refseq_exons = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.cds.bed") +refseq_utr3 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr3.bed") +refseq_utr5 = open("/humgen/gsa-hpprojects/exome/gene_interval_lists/refseq_exons/hg18.ref_gene.utr5.bed") +cancer_6000 = open("/seq/references/HybSelOligos/tcga_6k_genes.design") + +counter = 0 +exons = list() +def isNotValid(rline): + try: + n = chr2int(line.split()[0]) + except ValueError: + return True + return False + +def parseLine(rline,token): + spline = line.strip().split() + chrom = spline[0] + start = int(spline[1]) + stop = int(spline[2]) + longname = spline[3] + geneName = longname.split("_"+token)[0] + exonID = longname.split("_"+token+"_")[1].split("_chr")[0]+"_"+"_".join([token,longname[len(longname)-1]]) + return exLib.Exon(geneName,exonID,chrom,start,stop) + +for line in refseq_exons.readlines(): + if ( isNotValid(line) ): + continue + counter = 1 + counter + exons.append(parseLine(line,"cds")) + if ( counter % 25000 == 0 ): + print(str(counter)+" exons added") + +for line in refseq_utr3: + if ( isNotValid(line) ): + continue + counter = 1 + counter + exons.append(parseLine(line,"utr3")) + if ( counter % 25000 == 0 ): + print(str(counter)+" exons added") + +for line in refseq_utr5: + if ( isNotValid(line) ): + continue + counter = 1 + counter + exons.append(parseLine(line,"utr5")) + if ( counter % 25000 == 0 ): + print(str(counter)+" exons added") + +for line in cancer_6000: + if ( not line.startswith("TARGET") ): + continue + counter = 1 + counter + spline = line.strip().split() + chrom = "chr"+spline[1] + start = int(spline[2]) + stop = int(spline[3]) + longname = spline[4] + gene_name = longname.split("_")[0].split("#")[1] + exon_id = "tcga_"+longname.split("_")[1] + exons.append(exLib.Exon(gene_name,exon_id,chrom,start,stop)) + if ( counter % 25000 == 0 ): + print(str(counter)+" exons added") + +exons.sort(intervalCompareExon) +print(str(len(exons))+" exons added") +start_index = 0 +counter = 0 +import sys +interval_list = sys.argv[1] +headerlines = list() +for line in open(interval_list).readlines(): + if ( not line.startswith("@") and not line.startswith("#") ): + counter = 1 + counter + s = line.split() + target = exLib.Interval(s[0],int(s[1]),int(s[2])) + while ( exons[start_index].isBefore(target) and start_index < len(exons)-1): + start_index = 1 + start_index + index = start_index + while(exons[index].overlaps(target) and index < len(exons) - 1): + exons[index].updateCoverage(target) + index = 1 + index + if ( counter % 25000 == 0 ): + print("Read "+str(counter)+" lines from interval list") + #if ( counter % 5000 == 0 ): + # break + else: + if ( line.startswith("@") ): + headerlines.append(line) + +print("Done reading interval file. Creating genes.") +exons.sort(key = lambda x: x.gene) +genes = list() +counter = 0 +prevName = "@T#h12_3" +nGenes = 0 +for exon in exons: + counter = 1 + counter + if ( exon.gene != prevName ): + genes.append(exLib.Gene(exon.gene)) + nGenes = 1 + nGenes + genes[nGenes - 1].addExon(exon) + prevName = exon.gene + else: + genes[nGenes - 1].addExon(exon) + if ( counter % 20000 == 0 ): + print("Processed "+str(counter)+" exons. "+str(len(genes))+" gene records created.") + +def byProportion(gene1,gene2): + gene1targets = gene1.size() + gene1cvg = gene1.getBaseCoverage() + gene1prop = float(gene1cvg)/float(gene1targets) + gene2targets = gene2.size() + gene2cvg = gene2.getBaseCoverage() + gene2prop = float(gene2cvg)/float(gene2targets) + if ( gene1prop > gene2prop ): + return -1 + elif ( gene1prop < gene2prop ): + return 1 + return 0 + +genes.sort(byProportion) + +print("Writing coverage table...") +coverage = open("geneCoverage.txt",'w') +counter = 0 +exonsInWellCoveredGenes = list() +for gene in genes: + try: + coverage.write(gene.name+"\t"+str(float(gene.getBaseCoverage())/float(gene.size()))+"\t"+str(gene.size())+"\t"+str(gene.getBaseCoverage())) + if ( float(gene.getBaseCoverage())/float(gene.size()) > 0.8 ): + for exon in gene.exons: + exonsInWellCoveredGenes.append(exon) + coverage.write("\n") + except ZeroDivisionError: + continue + counter = 1 + counter + if ( counter % 5000 == 0 ): + print("Written: "+str(counter)+" genes.") +coverage.close() +print("Sorting exons by start location...") +exons.sort(intervalCompareExon) +exonsInWellCoveredGenes.sort(intervalCompareExon) +print("Writing all exon targets...") +exon_targets = open("exonTargets.interval_list",'w') +exon_targets.write("".join(headerlines)) +for exon in exons: + exon_targets.write(exon.getBedEntry()+"\n") +exon_targets.close() +print("Writing exons for well-covered genes...") +good_exon_targets = open("exonTargets_well_covered_genes.interval_list",'w') +good_exon_targets.write("".join(headerlines)) +for exon in exonsInWellCoveredGenes: + good_exon_targets.write(exon.getBedEntry()+"\n") +good_exon_targets.close() +print("Writing all exons and their overlapping targets...") +debug = open("overlapping_targets.txt",'w') +for exon in exons: + debug.write(exon.id+"\t"+str(exon.getCoverageProportion())+"\t"+str(exon.interval)+"\t") + intervals = list() + for inter in exon.getOverlappingIntervals(): + intervals.append(str(inter)) + debug.write("\t".join(intervals)+"\n")