gatk-3.8/playground/python/EvalMapping.py

310 lines
11 KiB
Python
Executable File

#!/usr/bin/env python
# Evaluates mapping results produced my RunMapper.py by comparig
# the result to the original simulated reads in a sam file
import getopt, sys, os, re
from operator import attrgetter, itemgetter
from subprocess import Popen, STDOUT, PIPE
from SAM import SAMIO
from pushback_file import pushback_file
from qltout import qltout_file
from aln_file import aln_file
#!/usr/bin/env python
import string, sys
class SAMRecordWrapper:
def __init__(self, rec):
#print 'Rec', rec
self.rec = rec
def id(self): return self.rec.getQName()
def contig(self): return self.rec.getRname()
def pos(self): return self.rec.getPos()
def offset(self): return self.pos()-1
def map_qual(self): return self.rec.getMapq()
def __str__(self): return str(self.rec)
def SAMIOWrapper( file ):
print 'SAMIOWrapper', file
return SAMIO( file, debugging=True, func=SAMRecordWrapper)
class mapper_info:
def __init__(self, name, ext, file_class):
self.name = name.lower()
self.ext = ext
self.file_class = file_class
mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file),
mapper_info("merlin", "qltout", qltout_file),
mapper_info("swmerlin", "qltout", qltout_file),
mapper_info("bwa", "sam", SAMIOWrapper),
mapper_info("bwa32", "sam", SAMIOWrapper),
mapper_info("maq", "aln.txt", aln_file) ]
mapper_defs = dict( [(m.name, m) for m in mapper_defs] )
def Comp_sam_map(comp_head, mapper, file_output, sam_indel):
# sam_indel is a +/- number corresponding to the size of insertions / deletions
# in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always
# translate to the SAM's cigar, but the full value should be used
file_output_head = comp_head+"."+mapper.name
if file_output:
# if non-false, output to filename in file_output_head
# if "" or False, leave output going to stdout
#saveout = sys.stdout
feval = open(file_output_head+".eval", "w")
#sys.stdout = feval
# create a file with a one-line tabular output of stats
# (for merging into Excel / R input)
fevaltab = open(file_output_head+".tab", "w")
# if maq file, convert aln.map to aln.txt
aln_txt_filename = file_output_head+"."+mapper.ext
if mapper.name == "maq" and not os.path.exists(aln_txt_filename):
maq_exe = "/seq/dirseq/maq-0.7.1/maq"
# SHOULD BE THIS
#cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
print >> sys.stderr, "Executing "+cmd_str
fout = open(aln_txt_filename, "w")
p = Popen( cmd_str , shell=True, stdout=fout)
map_filename = comp_head+"."+mapper.name+"."+mapper.ext
print 'map_filename', map_filename
map_file = mapper.file_class(map_filename)
SAM_filename = comp_head+".sam"
sam_file = SAMIO(SAM_filename, debugging=True)
#sams = [s for s in sam_file]
print "Reading map file..."
maps = [q for q in map_file]
print "Reading SAM file..."
sams = {}
if mapper.name in ["maq", 'bwa', 'bwa32']:
for s in sam_file:
sams[s.getQName()] = s
else:
for i, s in enumerate(sam_file):
sams[i] = s
#maps = {}
#for m in map_file:
# maps[m.id()] = m
#sams = [s for s in sam_file]
class smq_t:
def __init__(self, sam, map, qual):
self.sam = sam # SAM record (exists for all reads)
self.map = map # Mapping record, may be None
self.qual = qual # Quality of mapping
print "Making SMQ records..."
SMQ = []
for maprec in maps:
samrec = sams.get(maprec.id()) # get mapping otherwise None
SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) )
print "Sorting..."
SMQ.sort(key=attrgetter('qual'), reverse=True)
print "Evaluating placements..."
placed = 0
incorrectly_placed = 0
correctly_placed = 0
fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" )
if len(SMQ):
last_qual = SMQ[0].qual # grab 1st qual
for smq in SMQ:
if smq.sam == None:
continue
if smq.qual != last_qual:
total_reads = len(sams)#placed + not_placed
not_placed = len(sams) - placed
fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
#print "Wrote all reads with qual >= "+str(last_qual)
last_qual = smq.qual
placed += 1
# Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases
# inserted / deleted - e.g. +1, -2
#cigar = smq.sam.getCigar()
#indelled_bases = 0
#match = re.search(r'(\d+)I', cigar)
#if match:
# indelled_bases += int(match.groups()[0])
#match = re.search(r'(\d+)D', cigar)
#if match:
# indelled_bases -= int(match.groups()[0])
# We consider a read properly aligned if
# - both start positions are the same OR
# - the end position of the mapping = end position of the samq
#if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos():
try:
map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq
# are generally at the beginning of an alignment spanning the anchor base
except AttributeError:
map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care
# if its sam.py interface doesn't have access to indel data since there are
# no indels in its alignmnets
if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos():
#print >> feval, "Indelled SAM bases: "+str(indelled_bases)
try:
print >> feval, "Indelled map bases: "+str(map_indel_bases)
except AttributeError:
pass
print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos())
print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos())
print >> feval, "SAM record:\n"+str(smq.sam)
print >> feval, "Mapping record:\n"+str(smq.map)+"\n"
incorrectly_placed += 1
else:
correctly_placed +=1
#indexed_maps = [None] * len(sams)
#for q in maps:
# indexed_maps[q.id()] = q
#def f(s,q):
# if q == None:
# return s
# else:
# return False
#missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) )
#for miss in missing:
# print miss
#not_placed = len(missing)
#not_placed = len(sams) - len(maps)
total_reads = len(sams) #placed + not_placed
not_placed = len(sams) - placed
print >> feval, "Total reads : "+str(total_reads)
print >> feval, "+-Placed : "+str(placed)
print >> feval, "| +-Correct : "+str(correctly_placed)
print >> feval, "| +-Incorrect: "+str(incorrectly_placed)
print >> feval, "+-Not placed : "+str(not_placed)
fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
print "Wrote all reads with all quals"
#if file_output_head:
# sys.stdout = saveout
def remove_escape_codes(s):
import re
return re.sub(r'\x1b\[(\d\d)?\w', "", s)
def usage():
print "EvalMapping.py\n"
print "Required arguments:\n"
print " -h HEAD All input and output files start with HEAD"
print " --OR--"
print " <list of *.sam files on STDIN to use as filename templates>"
print " i.e. ls *.sam | EvalMapping.py -m MAPPER"
print
print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)"
print
print "Optional arguments:\n"
print
print " -o Output to screen instead of HEAD.MAPPER.eval"
print
print "Input files:"
print " HEAD.sam"
print
print "Result files expected for Merlin and ILT:"
print " HEAD.MAPPER.qltout"
print
print "Result files expected for MAQ:"
print
sys.exit(2)
if __name__ == "__main__":
opts = None
try:
opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"])
except getopt.GetoptError:
usage()
comp_heads = []
mapper_str = ""
file_output = True
for opt, arg in opts:
if opt in ("-h", "--head"):
comp_heads = [arg]
if opt in ("-m", "--mapper"):
mapper_str = arg
possible_mappers = mapper_defs.keys()
# Check it later now...
#if mapper_str not in possible_mappers:
# print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers)
# sys.exit(2)
if opt in ("-o", "--file-output"):
file_output = False
# Check for required args
if (mapper_str == ""):
usage()
elif (mapper_str == "all"):
mappers = mapper_defs.values()
print mappers
else:
try:
mapper_str_items = mapper_str.split(",")
mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items]
except KeyError:
print "Don't know this mapper given in -m option: "+mapper
print "Try: ilt, merlin, maq, bwa"
sys.exit(1)
# if we didn't already get a single file from -h
if len(comp_heads) == 0:
# and there's a list on STDIN,
if not sys.stdin.isatty(): # redirected from file or pipe
comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ]
comp_heads = [ f for f in comp_heads if len(f) > 0 ]
else:
# no single file AND no STDIN file list
usage()
print "\nFile heads to evaluate:"
print "\n".join(comp_heads)+"\n"
comp_heads.reverse()
for comp_head in comp_heads: #[:1]
for mapper in mappers:
# Guess the original size of simulated indel from filename
m = re.search(r'_den\.(\d+)_', comp_head)
if m:
if ".DELETION_" in comp_head:
indel_size = -1 * int(m.groups()[0])
elif ".INSERTION_" in comp_head:
indel_size = int(m.groups()[0])
else:
indel_size = 0
else:
sys.exit("Couldn't guess simulated read indel size from filename")
print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head
Comp_sam_map(comp_head, mapper, file_output, indel_size)