155 lines
5.1 KiB
Python
Executable File
155 lines
5.1 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
# merges .tab files produced by EvalMapping.py into a tabular format
|
|
|
|
#from __future__ import print_function # python 2.6 / 3.0 print as a function
|
|
import os,sys,re
|
|
from rpy2 import *
|
|
|
|
def ordered_eval_files(aligner, mutation_type, file_ext):
|
|
tab_files = [f for f in os.listdir(".") if f.endswith("."+aligner+"."+file_ext) and (mutation_type in f or "None_den." in f)]
|
|
|
|
tabs = []
|
|
for f in tab_files:
|
|
num = int(re.search(r'den\.(\d\d?)_', f).groups()[0])
|
|
if "None_den" in f:
|
|
num = 0
|
|
tabs.append( (num,f) )
|
|
tabs.sort()
|
|
return tabs
|
|
|
|
def anchor_hist(filename):
|
|
fin = open(filename)
|
|
offsets = []
|
|
read_poses = []
|
|
|
|
for line in fin:
|
|
id = None
|
|
fields = line.split()
|
|
if len(fields) > 0 and ".read." in fields[0]:
|
|
id = line.split()[0]
|
|
read_poses.append(id.split(".")[-1])
|
|
offsets.append(id.split(":")[1].split(".")[0])
|
|
|
|
#if "read name :" in line:
|
|
# id = line.split()[3]
|
|
|
|
import rpy2.robjects as robj
|
|
if len(read_poses):
|
|
print len(read_poses)
|
|
return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False)
|
|
else:
|
|
print 0
|
|
return robj.IntVector([])
|
|
|
|
def estimateCPUTime(filename):
|
|
times = []
|
|
if os.path.exists(filename):
|
|
regex = re.compile('CPU time\s*:\s+([\d\.]+) sec')
|
|
#print ('Filename', filename)
|
|
def extractTime1(line):
|
|
sobj = regex.search(line)
|
|
if sobj <> None:
|
|
time = float(sobj.group(1))
|
|
#print (line, sobj, sobj.groups(), time)
|
|
return time
|
|
else:
|
|
return None
|
|
times = [time for time in map(extractTime1, open(filename)) if time <> None]
|
|
#print('times', times)
|
|
|
|
return max(times)
|
|
|
|
def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0):
|
|
filebase = os.path.splitext(filename)[0]
|
|
cpuTime = estimateCPUTime(filebase + '.stdout')
|
|
|
|
lines = open(filename).readlines()
|
|
data = None
|
|
if len(lines) > 1:
|
|
da_line = lines[0]
|
|
for line in lines:
|
|
line = line.rstrip()
|
|
fields = line.split("\t")
|
|
try:
|
|
qual_cutoff = int(fields[0])
|
|
except:
|
|
qual_cutoff = 1000
|
|
if qual_cutoff <= min_qual_cutoff:
|
|
break
|
|
else:
|
|
da_line = line
|
|
|
|
#print ("%2d\t" % num_var, file=sys.stderr),
|
|
data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime]
|
|
print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime)
|
|
return data
|
|
|
|
def plot_anchor_hists():
|
|
import rpy2.robjects as robj
|
|
#robj.r.par( mfrow = robj.RVector([2,2]) )
|
|
#robj.r('X11(width=24, height=15)')
|
|
robj.r('png("lotto_histo.png", width=1500, height=900)')
|
|
robj.r('par(mfrow=c(4,6 ), cin=c(3,3))')
|
|
|
|
for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)):
|
|
print aligner, ">"+str(cutoff)
|
|
|
|
num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
|
|
#print (*num_file, sep="\n")
|
|
for num, file in num_file:
|
|
|
|
h = anchor_hist( file )
|
|
title = "{aligner} {num} bp insert".format(aligner=aligner, num=num)
|
|
robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90]))
|
|
|
|
robj.r("dev.off()")
|
|
#raw_input("Press any key to exit...")
|
|
|
|
|
|
def analyzeData( data, mut_type ):
|
|
import rpy2.robjects as robj
|
|
# print name / data name
|
|
aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')]
|
|
|
|
def selectData( key ):
|
|
def keepDataP( data ):
|
|
return data[0] == key and data[1] == mut_type
|
|
return filter( keyDataP, data )
|
|
|
|
def placeRatePlot():
|
|
robj.r('png("x.png", width=1500, height=900)')
|
|
x = robj.FloatVector(mutDensity)
|
|
y = robj.FloatVector(placeRate)
|
|
robj.r.plot(x, y, xlab="Mutation density / size", col="blue")
|
|
robj.r("dev.off()")
|
|
|
|
return 0
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if len(sys.argv) < 2:
|
|
print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)"
|
|
sys.exit(1)
|
|
else:
|
|
mut_type = sys.argv[1]
|
|
|
|
if False:
|
|
plot_anchor_hists()
|
|
else:
|
|
data = []
|
|
for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)):
|
|
name = aligner + ">" + str(cutoff)
|
|
#print (aligner, ">"+str(cutoff))
|
|
|
|
num_file = ordered_eval_files(aligner, mut_type, file_ext="tab")
|
|
#num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
|
|
for num, file in num_file:
|
|
datum = print_eval_tables( name, mut_type, num, file, cutoff )
|
|
if datum <> None:
|
|
data.append(datum)
|
|
#print
|
|
|
|
analyzeData( data, mut_type )
|
|
|