#!/usr/bin/env python # Evaluates mapping results produced my RunMapper.py by comparig # the result to the original simulated reads in a sam file import getopt, sys, os, re from operator import attrgetter, itemgetter from subprocess import Popen, STDOUT, PIPE from SAM import SAMIO from pushback_file import pushback_file from qltout import qltout_file from aln_file import aln_file #!/usr/bin/env python import string, sys class SAMRecordWrapper: def __init__(self, rec): #print 'Rec', rec self.rec = rec def id(self): return self.rec.getQName() def contig(self): return self.rec.getRname() def pos(self): return self.rec.getPos() def offset(self): return self.pos()-1 def map_qual(self): return self.rec.getMapq() def __str__(self): return str(self.rec) def SAMIOWrapper( file ): print 'SAMIOWrapper', file return SAMIO( file, debugging=True, func=SAMRecordWrapper) class mapper_info: def __init__(self, name, ext, file_class): self.name = name.lower() self.ext = ext self.file_class = file_class mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file), mapper_info("merlin", "qltout", qltout_file), mapper_info("swmerlin", "qltout", qltout_file), mapper_info("bwa", "sam", SAMIOWrapper), mapper_info("bwa32", "sam", SAMIOWrapper), mapper_info("maq", "aln.txt", aln_file) ] mapper_defs = dict( [(m.name, m) for m in mapper_defs] ) def Comp_sam_map(comp_head, mapper, file_output, sam_indel): # sam_indel is a +/- number corresponding to the size of insertions / deletions # in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always # translate to the SAM's cigar, but the full value should be used file_output_head = comp_head+"."+mapper.name if file_output: # if non-false, output to filename in file_output_head # if "" or False, leave output going to stdout #saveout = sys.stdout feval = open(file_output_head+".eval", "w") #sys.stdout = feval # create a file with a one-line tabular output of stats # (for merging into Excel / R input) fevaltab = open(file_output_head+".tab", "w") # if maq file, convert aln.map to aln.txt aln_txt_filename = file_output_head+"."+mapper.ext if mapper.name == "maq" and not os.path.exists(aln_txt_filename): maq_exe = "/seq/dirseq/maq-0.7.1/maq" # SHOULD BE THIS #cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" print >> sys.stderr, "Executing "+cmd_str fout = open(aln_txt_filename, "w") p = Popen( cmd_str , shell=True, stdout=fout) map_filename = comp_head+"."+mapper.name+"."+mapper.ext print 'map_filename', map_filename map_file = mapper.file_class(map_filename) SAM_filename = comp_head+".sam" sam_file = SAMIO(SAM_filename, debugging=True) #sams = [s for s in sam_file] print "Reading map file..." maps = [q for q in map_file] print "Reading SAM file..." sams = {} if mapper.name in ["maq", 'bwa', 'bwa32']: for s in sam_file: sams[s.getQName()] = s else: for i, s in enumerate(sam_file): sams[i] = s #maps = {} #for m in map_file: # maps[m.id()] = m #sams = [s for s in sam_file] class smq_t: def __init__(self, sam, map, qual): self.sam = sam # SAM record (exists for all reads) self.map = map # Mapping record, may be None self.qual = qual # Quality of mapping print "Making SMQ records..." SMQ = [] for maprec in maps: samrec = sams.get(maprec.id()) # get mapping otherwise None SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) ) print "Sorting..." SMQ.sort(key=attrgetter('qual'), reverse=True) print "Evaluating placements..." placed = 0 incorrectly_placed = 0 correctly_placed = 0 fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" ) if len(SMQ): last_qual = SMQ[0].qual # grab 1st qual for smq in SMQ: if smq.sam == None: continue if smq.qual != last_qual: total_reads = len(sams)#placed + not_placed not_placed = len(sams) - placed fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) #print "Wrote all reads with qual >= "+str(last_qual) last_qual = smq.qual placed += 1 # Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases # inserted / deleted - e.g. +1, -2 #cigar = smq.sam.getCigar() #indelled_bases = 0 #match = re.search(r'(\d+)I', cigar) #if match: # indelled_bases += int(match.groups()[0]) #match = re.search(r'(\d+)D', cigar) #if match: # indelled_bases -= int(match.groups()[0]) # We consider a read properly aligned if # - both start positions are the same OR # - the end position of the mapping = end position of the samq #if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos(): try: map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq # are generally at the beginning of an alignment spanning the anchor base except AttributeError: map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care # if its sam.py interface doesn't have access to indel data since there are # no indels in its alignmnets if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos(): #print >> feval, "Indelled SAM bases: "+str(indelled_bases) try: print >> feval, "Indelled map bases: "+str(map_indel_bases) except AttributeError: pass print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos()) print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos()) print >> feval, "SAM record:\n"+str(smq.sam) print >> feval, "Mapping record:\n"+str(smq.map)+"\n" incorrectly_placed += 1 else: correctly_placed +=1 #indexed_maps = [None] * len(sams) #for q in maps: # indexed_maps[q.id()] = q #def f(s,q): # if q == None: # return s # else: # return False #missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) ) #for miss in missing: # print miss #not_placed = len(missing) #not_placed = len(sams) - len(maps) total_reads = len(sams) #placed + not_placed not_placed = len(sams) - placed print >> feval, "Total reads : "+str(total_reads) print >> feval, "+-Placed : "+str(placed) print >> feval, "| +-Correct : "+str(correctly_placed) print >> feval, "| +-Incorrect: "+str(incorrectly_placed) print >> feval, "+-Not placed : "+str(not_placed) fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) print "Wrote all reads with all quals" #if file_output_head: # sys.stdout = saveout def remove_escape_codes(s): import re return re.sub(r'\x1b\[(\d\d)?\w', "", s) def usage(): print "EvalMapping.py\n" print "Required arguments:\n" print " -h HEAD All input and output files start with HEAD" print " --OR--" print " " print " i.e. ls *.sam | EvalMapping.py -m MAPPER" print print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)" print print "Optional arguments:\n" print print " -o Output to screen instead of HEAD.MAPPER.eval" print print "Input files:" print " HEAD.sam" print print "Result files expected for Merlin and ILT:" print " HEAD.MAPPER.qltout" print print "Result files expected for MAQ:" print sys.exit(2) if __name__ == "__main__": opts = None try: opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"]) except getopt.GetoptError: usage() comp_heads = [] mapper_str = "" file_output = True for opt, arg in opts: if opt in ("-h", "--head"): comp_heads = [arg] if opt in ("-m", "--mapper"): mapper_str = arg possible_mappers = mapper_defs.keys() # Check it later now... #if mapper_str not in possible_mappers: # print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers) # sys.exit(2) if opt in ("-o", "--file-output"): file_output = False # Check for required args if (mapper_str == ""): usage() elif (mapper_str == "all"): mappers = mapper_defs.values() print mappers else: try: mapper_str_items = mapper_str.split(",") mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items] except KeyError: print "Don't know this mapper given in -m option: "+mapper print "Try: ilt, merlin, maq, bwa" sys.exit(1) # if we didn't already get a single file from -h if len(comp_heads) == 0: # and there's a list on STDIN, if not sys.stdin.isatty(): # redirected from file or pipe comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ] comp_heads = [ f for f in comp_heads if len(f) > 0 ] else: # no single file AND no STDIN file list usage() print "\nFile heads to evaluate:" print "\n".join(comp_heads)+"\n" comp_heads.reverse() for comp_head in comp_heads: #[:1] for mapper in mappers: # Guess the original size of simulated indel from filename m = re.search(r'_den\.(\d+)_', comp_head) if m: if ".DELETION_" in comp_head: indel_size = -1 * int(m.groups()[0]) elif ".INSERTION_" in comp_head: indel_size = int(m.groups()[0]) else: indel_size = 0 else: sys.exit("Couldn't guess simulated read indel size from filename") print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head Comp_sam_map(comp_head, mapper, file_output, indel_size)