import string import operator from pushback_file import pushback_file # Type TAG Req? Default Description SAMHeaderFields = \ [['HD', 'VN', True, '0.1.0', 'File format version'], \ ['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \ ['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \ ['SQ', 'LN', True, None, 'Sequence length'], \ ['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \ ['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \ ['SQ', 'UR', False, None, 'URI of the sequence'], \ ['SQ', 'SP', False, None, 'Species'], \ ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \ ['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \ ['RG', 'LB', False, None, 'Library'], \ ['RG', 'DS', False, None, 'Description'], \ ['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \ ['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \ ['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \ ['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \ ['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \ ['PG', 'ID', True, 'ProgramA','Program name'], \ ['PG', 'VN', False, None, 'Program version'], \ ['PG', 'CL', False, None, 'Command line']] def SAMHeaderEncode( type, tag ): return type + '.' + tag SAMHeaderDict = dict() for record in SAMHeaderFields: type, tag, req, default, desc = record SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc] # ----------------------------------------------------------------------------------------------- # # A SAM header is a potentially complex object, so we just punt on it. The only really required # outputs that *must* be user specified are the SQ: SN and SQ: LN fields. # Everything else is just split up and stored in our hash table by reading the SAMHeaderFields # data structure (defined above). All required fields are initialized to their default values # # ----------------------------------------------------------------------------------------------- class SAMHeader(dict): def __init__(self, seqName, seqLen, **keys): # setup initial header self.fields = dict() for record in SAMHeaderFields: type, tag, req, default, desc = record self.setField( type, tag, default ) self.setField('SQ', 'SN', seqName ) self.setField('SQ', 'LN', seqLen ) # add keyword arguments for key, value in keys.iteritems(): type, tag = key.split('.') self.setField( type, tag, value ) def isReq( self, type, tag ): return SAMHeaderDict[SAMHeaderEncode(type,tag)][0] def setField( self, type, tag, value ): #print 'Setting', type, tag, value self.fields[SAMHeaderEncode(type, tag)] = value def getField( self, type, tag ): return self.fields[SAMHeaderEncode(type, tag)] def __str__( self ): types = ['HD', 'SQ', 'RG'] def formatType( type ): s = '@' + type + '\t' for record in SAMHeaderFields: qtype, tag, req, default, desc = record if type == qtype and self.isReq(type, tag): value = self.getField(type, tag) if value == None: raise Error('Unset required SAM header ' + type + ' ' + tag) s += tag + ':' + str(value) + '\t' return s return string.join(map( formatType, types ),'\n') SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1 SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1 SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand) SAM_MATESTRAND = 0x0020 # strand of the mate 1 SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2 SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2 SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records) SAM_FLAGS = { SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair', SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1', SAM_UNMAPPED : 'the query sequence itself is unmapped', SAM_MATEUNMAPPED : 'the mate is unmapped 1', SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' , SAM_MATESTRAND : 'strand of the mate 1' , SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' , SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2', SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)' } def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): r = SAMRecord() r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize ) return r def SAMRecordFromString( str ): r = SAMRecord() r.setValuesFromString( str ) return r def SAMFlagValue( flags, testFlag ): return testFlag & flags def SAMFlagIsSet( flags, testFlag ): return SAMFlagValue(flags, testFlag) <> 0 def SAMFlagsDescs( flags ): def keepMe(p): flagKey, flagDesc = p return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc] return sorted(map( keepMe, SAM_FLAGS.iteritems() )) # ----------------------------------------------------------------------------------------------- # # This is really the meat of the SAM I/O system. # # setValuesFromArgs takes the required arguments for a SAM record and stores them in a list # (in SAM ordering) so that writing the values to a string is trivial. Note the conversion # of int quals to ASCII encoded quals. # # ----------------------------------------------------------------------------------------------- class SAMRecord: def __init__( self, vals = []): self.vals = [] def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \ cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ] def setValuesFromString( self, line ): #print 'line is', line formats = [str, int, str, int, int, str, str, int, int, str, str] self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] ) def formatFlags( self, flags ): b = reduce( operator.__or__, flags, 0 ) #print 'FormatFlags', flags, b return b def getQName(self): return self.vals[0] def getFlags(self): return self.vals[1] def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...) def getPos(self): return self.vals[3] def getMapq(self): return self.vals[4] def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I" def getSeq(self): return self.vals[9] def getQuals(self): return self.fromASCII33(self.vals[10]) def toASCII33( self, quals ): return string.join( map( lambda q: chr(q+33), quals ), '' ) def fromASCII33( self, qualStr ): return map( lambda q: ord(q)-33, qualStr) def __str__( self ): return string.join( map( str, self.vals ), '\t') # ----------------------------------------------------------------------------------------------- # # Wrapper class for reading and writing records to a SAM file # # ----------------------------------------------------------------------------------------------- class SAMIO: def __init__( self, fileName, header = None, debugging = False, func = None ): self.header = header self.debugging = debugging if func == None: func = lambda x: x self.func = func if self.header == None: self.fileHandle = pushback_file(fileName, "r") self.readHeader() else: self.fileHandle = file(fileName, "w") self.writeHeader() def close(self): self.fileHandle.close() def readHeader(self): self.header = "" while True: line = self.fileHandle.readline() if line.startswith("@"): self.header += line else: self.fileHandle.pushback(line) if self.debugging: pass #print str(self.header) return True return self.header def RecordGenerator( self ): for line in self.fileHandle: #print 'Reading line', line line = line.rstrip("\n") yield self.func(SAMRecordFromString(line)) raise StopIteration return def __iter__(self): return self.RecordGenerator() def writeRecord( self, recordIterator ): for record in recordIterator: self.writeRecord(record) def writeRecord( self, record ): if self.debugging: print record print >> self.fileHandle, record return True def writeHeader( self ): if self.debugging: print str(self.header) print >> self.fileHandle, str(self.header) return True