gatk-3.8/python/SAM.py

219 lines
9.3 KiB
Python
Raw Normal View History

import string
import operator
from pushback_file import pushback_file
# Type TAG Req? Default Description
SAMHeaderFields = \
[['HD', 'VN', True, '0.1.0', 'File format version'], \
['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \
['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \
['SQ', 'LN', True, None, 'Sequence length'], \
['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \
['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \
['SQ', 'UR', False, None, 'URI of the sequence'], \
['SQ', 'SP', False, None, 'Species'], \
['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \
['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \
['RG', 'LB', False, None, 'Library'], \
['RG', 'DS', False, None, 'Description'], \
['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \
['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \
['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \
['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \
['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \
['PG', 'ID', True, 'ProgramA','Program name'], \
['PG', 'VN', False, None, 'Program version'], \
['PG', 'CL', False, None, 'Command line']]
def SAMHeaderEncode( type, tag ):
return type + '.' + tag
SAMHeaderDict = dict()
for record in SAMHeaderFields:
type, tag, req, default, desc = record
SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]
# -----------------------------------------------------------------------------------------------
#
# A SAM header is a potentially complex object, so we just punt on it. The only really required
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
# data structure (defined above). All required fields are initialized to their default values
#
# -----------------------------------------------------------------------------------------------
class SAMHeader(dict):
def __init__(self, seqName, seqLen, **keys):
# setup initial header
self.fields = dict()
for record in SAMHeaderFields:
type, tag, req, default, desc = record
self.setField( type, tag, default )
self.setField('SQ', 'SN', seqName )
self.setField('SQ', 'LN', seqLen )
# add keyword arguments
for key, value in keys.iteritems():
type, tag = key.split('.')
self.setField( type, tag, value )
def isReq( self, type, tag ):
return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]
def setField( self, type, tag, value ):
#print 'Setting', type, tag, value
self.fields[SAMHeaderEncode(type, tag)] = value
def getField( self, type, tag ):
return self.fields[SAMHeaderEncode(type, tag)]
def __str__( self ):
types = ['HD', 'SQ', 'RG']
def formatType( type ):
s = '@' + type + '\t'
for record in SAMHeaderFields:
qtype, tag, req, default, desc = record
if type == qtype and self.isReq(type, tag):
value = self.getField(type, tag)
if value == None:
raise Error('Unset required SAM header ' + type + ' ' + tag)
s += tag + ':' + str(value) + '\t'
return s
return string.join(map( formatType, types ),'\n')
SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair
SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped
SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1
SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand)
SAM_MATESTRAND = 0x0020 # strand of the mate 1
SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2
SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2
SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records)
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
r = SAMRecord()
r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize )
return r
def SAMRecordFromString( str ):
r = SAMRecord()
r.setValuesFromString( str )
return r
def SAMFlagValue( flags, testFlag ):
return testFlag & flags
def SAMFlagIsSet( flags, testFlag ):
return SAMFlagValue(flags, testFlag) <> 0
# -----------------------------------------------------------------------------------------------
#
# This is really the meat of the SAM I/O system.
#
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion
# of int quals to ASCII encoded quals.
#
# -----------------------------------------------------------------------------------------------
class SAMRecord:
def __init__( self, vals = []):
self.vals = []
def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \
cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]
def setValuesFromString( self, line ):
#print 'line is', line
formats = [str, int, str, int, int, str, str, int, int, str, str]
self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )
def formatFlags( self, flags ):
b = reduce( operator.__or__, flags, 0 )
#print 'FormatFlags', flags, b
return b
def getQName(self): return self.vals[0]
def getFlags(self): return self.vals[1]
def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)
def getPos(self): return self.vals[3]
def getMapq(self): return self.vals[4]
def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"
def getSeq(self): return self.vals[9]
def getQuals(self): return self.fromASCII33(self.vals[10])
def toASCII33( self, quals ):
return string.join( map( lambda q: chr(q+33), quals ), '' )
def fromASCII33( self, qualStr ):
return map( lambda q: ord(q)-33, qualStr)
def __str__( self ):
return string.join( map( str, self.vals ), '\t')
# -----------------------------------------------------------------------------------------------
#
# Wrapper class for reading and writing records to a SAM file
#
# -----------------------------------------------------------------------------------------------
class SAMIO:
def __init__( self, fileName, header = None, debugging = False, func = None ):
self.header = header
self.debugging = debugging
if func == None:
func = lambda x: x
self.func = func
if self.header == None:
self.fileHandle = pushback_file(fileName, "r")
self.readHeader()
else:
self.fileHandle = file(fileName, "w")
self.writeHeader()
def close(self):
self.fileHandle.close()
def readHeader(self):
self.header = ""
while True:
line = self.fileHandle.readline()
if line.startswith("@"):
self.header += line
else:
self.fileHandle.pushback(line)
if self.debugging:
pass #print str(self.header)
return True
return self.header
def RecordGenerator( self ):
for line in self.fileHandle:
#print 'Reading line', line
line = line.rstrip("\n")
yield self.func(SAMRecordFromString(line))
raise StopIteration
return
def __iter__(self):
return self.RecordGenerator()
def writeRecord( self, recordIterator ):
for record in recordIterator:
self.writeRecord(record)
def writeRecord( self, record ):
if self.debugging:
print record
print >> self.fileHandle, record
return True
def writeHeader( self ):
if self.debugging:
print str(self.header)
print >> self.fileHandle, str(self.header)
return True