2009-02-28 23:28:56 +08:00
|
|
|
import string
|
|
|
|
|
import operator
|
|
|
|
|
from pushback_file import pushback_file
|
|
|
|
|
|
|
|
|
|
# Type TAG Req? Default Description
|
|
|
|
|
SAMHeaderFields = \
|
|
|
|
|
[['HD', 'VN', True, '0.1.0', 'File format version'], \
|
|
|
|
|
['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \
|
|
|
|
|
['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \
|
|
|
|
|
['SQ', 'LN', True, None, 'Sequence length'], \
|
|
|
|
|
['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \
|
|
|
|
|
['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \
|
|
|
|
|
['SQ', 'UR', False, None, 'URI of the sequence'], \
|
|
|
|
|
['SQ', 'SP', False, None, 'Species'], \
|
|
|
|
|
['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \
|
|
|
|
|
['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \
|
|
|
|
|
['RG', 'LB', False, None, 'Library'], \
|
|
|
|
|
['RG', 'DS', False, None, 'Description'], \
|
|
|
|
|
['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \
|
|
|
|
|
['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \
|
|
|
|
|
['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \
|
|
|
|
|
['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \
|
|
|
|
|
['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \
|
|
|
|
|
['PG', 'ID', True, 'ProgramA','Program name'], \
|
|
|
|
|
['PG', 'VN', False, None, 'Program version'], \
|
|
|
|
|
['PG', 'CL', False, None, 'Command line']]
|
|
|
|
|
|
|
|
|
|
def SAMHeaderEncode( type, tag ):
|
|
|
|
|
return type + '.' + tag
|
|
|
|
|
|
|
|
|
|
SAMHeaderDict = dict()
|
|
|
|
|
for record in SAMHeaderFields:
|
|
|
|
|
type, tag, req, default, desc = record
|
|
|
|
|
SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]
|
|
|
|
|
|
|
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
#
|
|
|
|
|
# A SAM header is a potentially complex object, so we just punt on it. The only really required
|
|
|
|
|
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.
|
|
|
|
|
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
|
|
|
|
|
# data structure (defined above). All required fields are initialized to their default values
|
|
|
|
|
#
|
|
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
class SAMHeader(dict):
|
|
|
|
|
def __init__(self, seqName, seqLen, **keys):
|
|
|
|
|
# setup initial header
|
|
|
|
|
self.fields = dict()
|
|
|
|
|
for record in SAMHeaderFields:
|
|
|
|
|
type, tag, req, default, desc = record
|
|
|
|
|
self.setField( type, tag, default )
|
|
|
|
|
|
|
|
|
|
self.setField('SQ', 'SN', seqName )
|
|
|
|
|
self.setField('SQ', 'LN', seqLen )
|
|
|
|
|
|
|
|
|
|
# add keyword arguments
|
|
|
|
|
for key, value in keys.iteritems():
|
|
|
|
|
type, tag = key.split('.')
|
|
|
|
|
self.setField( type, tag, value )
|
|
|
|
|
|
|
|
|
|
def isReq( self, type, tag ):
|
|
|
|
|
return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]
|
|
|
|
|
|
|
|
|
|
def setField( self, type, tag, value ):
|
|
|
|
|
#print 'Setting', type, tag, value
|
|
|
|
|
self.fields[SAMHeaderEncode(type, tag)] = value
|
|
|
|
|
|
|
|
|
|
def getField( self, type, tag ):
|
|
|
|
|
return self.fields[SAMHeaderEncode(type, tag)]
|
|
|
|
|
|
|
|
|
|
def __str__( self ):
|
|
|
|
|
types = ['HD', 'SQ', 'RG']
|
|
|
|
|
|
|
|
|
|
def formatType( type ):
|
|
|
|
|
s = '@' + type + '\t'
|
|
|
|
|
for record in SAMHeaderFields:
|
|
|
|
|
qtype, tag, req, default, desc = record
|
|
|
|
|
if type == qtype and self.isReq(type, tag):
|
|
|
|
|
value = self.getField(type, tag)
|
|
|
|
|
if value == None:
|
|
|
|
|
raise Error('Unset required SAM header ' + type + ' ' + tag)
|
|
|
|
|
s += tag + ':' + str(value) + '\t'
|
|
|
|
|
return s
|
|
|
|
|
return string.join(map( formatType, types ),'\n')
|
|
|
|
|
|
|
|
|
|
SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair
|
|
|
|
|
SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
|
|
|
|
|
SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped
|
|
|
|
|
SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1
|
|
|
|
|
SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand)
|
|
|
|
|
SAM_MATESTRAND = 0x0020 # strand of the mate 1
|
|
|
|
|
SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2
|
|
|
|
|
SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2
|
|
|
|
|
SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records)
|
|
|
|
|
|
2009-05-19 05:18:51 +08:00
|
|
|
SAM_FLAGS = {
|
|
|
|
|
SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair',
|
|
|
|
|
SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1',
|
|
|
|
|
SAM_UNMAPPED : 'the query sequence itself is unmapped',
|
|
|
|
|
SAM_MATEUNMAPPED : 'the mate is unmapped 1',
|
|
|
|
|
SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' ,
|
|
|
|
|
SAM_MATESTRAND : 'strand of the mate 1' ,
|
|
|
|
|
SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' ,
|
|
|
|
|
SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2',
|
|
|
|
|
SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'
|
|
|
|
|
}
|
|
|
|
|
|
2009-02-28 23:28:56 +08:00
|
|
|
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
|
|
|
|
|
r = SAMRecord()
|
|
|
|
|
r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize )
|
|
|
|
|
return r
|
|
|
|
|
|
|
|
|
|
def SAMRecordFromString( str ):
|
|
|
|
|
r = SAMRecord()
|
|
|
|
|
r.setValuesFromString( str )
|
|
|
|
|
return r
|
|
|
|
|
|
|
|
|
|
def SAMFlagValue( flags, testFlag ):
|
|
|
|
|
return testFlag & flags
|
2009-05-19 05:18:51 +08:00
|
|
|
|
2009-02-28 23:28:56 +08:00
|
|
|
def SAMFlagIsSet( flags, testFlag ):
|
|
|
|
|
return SAMFlagValue(flags, testFlag) <> 0
|
|
|
|
|
|
2009-05-19 05:18:51 +08:00
|
|
|
def SAMFlagsDescs( flags ):
|
|
|
|
|
def keepMe(p):
|
|
|
|
|
flagKey, flagDesc = p
|
|
|
|
|
return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]
|
|
|
|
|
return sorted(map( keepMe, SAM_FLAGS.iteritems() ))
|
|
|
|
|
|
2009-02-28 23:28:56 +08:00
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
#
|
|
|
|
|
# This is really the meat of the SAM I/O system.
|
|
|
|
|
#
|
|
|
|
|
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
|
|
|
|
|
# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion
|
|
|
|
|
# of int quals to ASCII encoded quals.
|
|
|
|
|
#
|
|
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
class SAMRecord:
|
|
|
|
|
def __init__( self, vals = []):
|
|
|
|
|
self.vals = []
|
|
|
|
|
|
|
|
|
|
def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
|
|
|
|
|
self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \
|
|
|
|
|
cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]
|
|
|
|
|
|
|
|
|
|
def setValuesFromString( self, line ):
|
|
|
|
|
#print 'line is', line
|
|
|
|
|
formats = [str, int, str, int, int, str, str, int, int, str, str]
|
|
|
|
|
self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )
|
|
|
|
|
|
|
|
|
|
def formatFlags( self, flags ):
|
|
|
|
|
b = reduce( operator.__or__, flags, 0 )
|
|
|
|
|
#print 'FormatFlags', flags, b
|
|
|
|
|
return b
|
|
|
|
|
|
|
|
|
|
def getQName(self): return self.vals[0]
|
|
|
|
|
def getFlags(self): return self.vals[1]
|
|
|
|
|
def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)
|
|
|
|
|
def getPos(self): return self.vals[3]
|
|
|
|
|
def getMapq(self): return self.vals[4]
|
|
|
|
|
def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"
|
|
|
|
|
def getSeq(self): return self.vals[9]
|
|
|
|
|
def getQuals(self): return self.fromASCII33(self.vals[10])
|
|
|
|
|
|
|
|
|
|
def toASCII33( self, quals ):
|
|
|
|
|
return string.join( map( lambda q: chr(q+33), quals ), '' )
|
|
|
|
|
|
|
|
|
|
def fromASCII33( self, qualStr ):
|
|
|
|
|
return map( lambda q: ord(q)-33, qualStr)
|
|
|
|
|
|
|
|
|
|
def __str__( self ):
|
|
|
|
|
return string.join( map( str, self.vals ), '\t')
|
|
|
|
|
|
|
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
#
|
|
|
|
|
# Wrapper class for reading and writing records to a SAM file
|
|
|
|
|
#
|
|
|
|
|
# -----------------------------------------------------------------------------------------------
|
|
|
|
|
class SAMIO:
|
|
|
|
|
def __init__( self, fileName, header = None, debugging = False, func = None ):
|
|
|
|
|
self.header = header
|
|
|
|
|
self.debugging = debugging
|
|
|
|
|
|
|
|
|
|
if func == None:
|
|
|
|
|
func = lambda x: x
|
|
|
|
|
self.func = func
|
|
|
|
|
|
|
|
|
|
if self.header == None:
|
|
|
|
|
self.fileHandle = pushback_file(fileName, "r")
|
|
|
|
|
self.readHeader()
|
|
|
|
|
else:
|
|
|
|
|
self.fileHandle = file(fileName, "w")
|
|
|
|
|
self.writeHeader()
|
|
|
|
|
|
|
|
|
|
def close(self):
|
|
|
|
|
self.fileHandle.close()
|
|
|
|
|
|
|
|
|
|
def readHeader(self):
|
|
|
|
|
self.header = ""
|
|
|
|
|
while True:
|
|
|
|
|
line = self.fileHandle.readline()
|
|
|
|
|
if line.startswith("@"):
|
|
|
|
|
self.header += line
|
|
|
|
|
else:
|
|
|
|
|
self.fileHandle.pushback(line)
|
|
|
|
|
if self.debugging:
|
|
|
|
|
pass #print str(self.header)
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
return self.header
|
|
|
|
|
|
|
|
|
|
def RecordGenerator( self ):
|
|
|
|
|
for line in self.fileHandle:
|
|
|
|
|
#print 'Reading line', line
|
|
|
|
|
line = line.rstrip("\n")
|
|
|
|
|
yield self.func(SAMRecordFromString(line))
|
|
|
|
|
raise StopIteration
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
def __iter__(self):
|
|
|
|
|
return self.RecordGenerator()
|
|
|
|
|
|
|
|
|
|
def writeRecord( self, recordIterator ):
|
|
|
|
|
for record in recordIterator:
|
|
|
|
|
self.writeRecord(record)
|
|
|
|
|
|
|
|
|
|
def writeRecord( self, record ):
|
|
|
|
|
if self.debugging:
|
|
|
|
|
print record
|
|
|
|
|
print >> self.fileHandle, record
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
def writeHeader( self ):
|
|
|
|
|
if self.debugging:
|
|
|
|
|
print str(self.header)
|
|
|
|
|
print >> self.fileHandle, str(self.header)
|
|
|
|
|
return True
|