gatk-3.8/python/SAM.py

import string
import operator
from pushback_file import pushback_file

#   Type   TAG   Req?   Default     Description
SAMHeaderFields = \
    [['HD', 'VN', True,  '0.1.0',   'File format version'], \
     ['HD', 'SO', False, None,      'Sort order. Valid values are: unsorted, queryname or coordinate'], \
     ['SQ', 'SN', True,  None,      'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \
     ['SQ', 'LN', True,  None,      'Sequence length'], \
     ['SQ', 'AS', False, None,      'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \
     ['SQ', 'M5', False, None,      'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \
     ['SQ', 'UR', False, None,      'URI of the sequence'],  \
     ['SQ', 'SP', False, None,      'Species'], \
     ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \
     ['RG', 'SM', True, 'SampleA',  'Sample (use pool name where a pool is being sequenced)'], \
     ['RG', 'LB', False, None,      'Library'], \
     ['RG', 'DS', False, None,      'Description'], \
     ['RG', 'PU', False, None,      'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \
     ['RG', 'PI', False, None,      'Predicted median insert size (maybe different from the actual median insert size)'], \
     ['RG', 'CN', False, None,      'Name of sequencing center producing the read'], \
     ['RG', 'DT', False, None,      'Date the run was produced (ISO 8601 date or date/time)'], \
     ['RG', 'PL', False, None,      'Platform/technology used to produce the read'], \
     ['PG', 'ID', True,  'ProgramA','Program name'], \
     ['PG', 'VN', False, None,      'Program version'], \
     ['PG', 'CL', False, None,      'Command line']] 

def SAMHeaderEncode( type, tag ):
    return type + '.' + tag

SAMHeaderDict = dict()
for record in SAMHeaderFields:
    type, tag, req, default, desc = record
    SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]

# -----------------------------------------------------------------------------------------------
#
# A SAM header is a potentially complex object, so we just punt on it.  The only really required
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.  
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
# data structure (defined above).  All required fields are initialized to their default values
#
# -----------------------------------------------------------------------------------------------
class SAMHeader(dict):
    def __init__(self, seqName, seqLen, **keys):
        # setup initial header
        self.fields = dict()
        for record in SAMHeaderFields:
            type, tag, req, default, desc = record
            self.setField( type, tag, default )
        
        self.setField('SQ', 'SN', seqName )
        self.setField('SQ', 'LN', seqLen )
                        
        # add keyword arguments
        for key, value in keys.iteritems():
            type, tag = key.split('.')
            self.setField( type, tag, value )
        
    def isReq( self, type, tag ):
        return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]
            
    def setField( self, type, tag, value ):
        #print 'Setting', type, tag, value
        self.fields[SAMHeaderEncode(type, tag)] = value
        
    def getField( self, type, tag ):
        return self.fields[SAMHeaderEncode(type, tag)]
        
    def __str__( self ):
        types = ['HD', 'SQ', 'RG']
        
        def formatType( type ):
            s = '@' + type + '\t'
            for record in SAMHeaderFields:
                qtype, tag, req, default, desc = record
                if type == qtype and self.isReq(type, tag):
                    value = self.getField(type, tag)
                    if value == None:
                        raise Error('Unset required SAM header ' + type + ' ' + tag)
                    s += tag + ':' + str(value) + '\t'
            return s
        return string.join(map( formatType, types ),'\n')

SAM_SEQPAIRED    = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair
SAM_MAPPAIRED    = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1 
SAM_UNMAPPED     = 0x0004 # the query sequence itself is unmapped 
SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1 
SAM_QUERYSTRAND  = 0x0010 # strand of the query (0 for forward; 1 for reverse strand) 
SAM_MATESTRAND   = 0x0020 # strand of the mate 1 
SAM_ISFIRSTREAD  = 0x0040 # the read is the first read in a pair 1,2 
SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2 
SAM_NOTPRIMARY   = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records) 

SAM_FLAGS = {
    SAM_SEQPAIRED       : 'the read is paired in sequencing, no matter whether it is mapped in a pair',
    SAM_MAPPAIRED       : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1', 
    SAM_UNMAPPED        : 'the query sequence itself is unmapped',
    SAM_MATEUNMAPPED    : 'the mate is unmapped 1', 
    SAM_QUERYSTRAND     : 'strand of the query (0 for forward; 1 for reverse strand)' ,
    SAM_MATESTRAND      : 'strand of the mate 1' ,
    SAM_ISFIRSTREAD     : 'the read is the first read in a pair 1,2' ,
    SAM_ISSECONDREAD    : 'the read is the second read in a pair 1,2',
    SAM_NOTPRIMARY      : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'
    }
   
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
    r = SAMRecord()
    r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize )
    return r

def SAMRecordFromString( str ):
    r = SAMRecord()
    r.setValuesFromString( str )
    return r
    
def SAMFlagValue( flags, testFlag ):
    return testFlag & flags

def SAMFlagIsSet( flags, testFlag ):
    return SAMFlagValue(flags, testFlag) <> 0

def SAMFlagsDescs( flags ):
    def keepMe(p):
        flagKey, flagDesc = p
        return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]
    return sorted(map( keepMe, SAM_FLAGS.iteritems() ))            

# -----------------------------------------------------------------------------------------------
#
# This is really the meat of the SAM I/O system.  
#
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
# (in SAM ordering) so that writing the values to a string is trivial.  Note the conversion
# of int quals to ASCII encoded quals.  
#
# -----------------------------------------------------------------------------------------------
class SAMRecord:
    def __init__( self, vals = []):
        self.vals = []
    
    def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
        self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \
                     cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]

    def setValuesFromString( self, line ):
        #print 'line is', line
        formats = [str, int, str, int, int, str, str, int, int, str, str]
        self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )

    def formatFlags( self, flags ):
        b = reduce( operator.__or__, flags, 0 )
        #print 'FormatFlags', flags, b
        return b
        
    def getQName(self): return self.vals[0]
    def getFlags(self): return self.vals[1]
    def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)
    def getPos(self): return self.vals[3]
    def getMapq(self): return self.vals[4]
    def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"
    def getSeq(self): return self.vals[9]
    def getQuals(self): return self.fromASCII33(self.vals[10])

    def toASCII33( self, quals ):
        return string.join( map( lambda q: chr(q+33), quals ), '' )

    def fromASCII33( self, qualStr ):
        return map( lambda q: ord(q)-33, qualStr) 

    def __str__( self ):
        return string.join( map( str, self.vals ), '\t')

# -----------------------------------------------------------------------------------------------
#
# Wrapper class for reading and writing records to a SAM file
#
# -----------------------------------------------------------------------------------------------
class SAMIO:
    def __init__( self, fileName, header = None, debugging = False, func = None ):
        self.header = header
        self.debugging = debugging
        
        if func == None:
            func = lambda x: x
        self.func = func
        
        if self.header == None:
            self.fileHandle = pushback_file(fileName, "r")
            self.readHeader()
        else:
            self.fileHandle = file(fileName, "w")
            self.writeHeader()
            
    def close(self):
        self.fileHandle.close()
        
    def readHeader(self):
        self.header = ""
        while True:
            line = self.fileHandle.readline()
            if line.startswith("@"):
                self.header += line
            else:
                self.fileHandle.pushback(line)
                if self.debugging:
                    pass #print str(self.header)
                return True

        return self.header

    def RecordGenerator( self ):
        for line in self.fileHandle:
            #print 'Reading line', line
            line = line.rstrip("\n")
            yield self.func(SAMRecordFromString(line))
        raise StopIteration
        return
        
    def __iter__(self):
        return self.RecordGenerator()
        
    def writeRecord( self, recordIterator ):
        for record in recordIterator:
            self.writeRecord(record)
            
    def writeRecord( self, record ):
        if self.debugging:
            print record
        print >> self.fileHandle, record
        return True
        
    def writeHeader( self ):
        if self.debugging:
            print str(self.header)
        print >> self.fileHandle, str(self.header)
        return True
Basic reorganization of tree git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@8 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-28 23:28:56 +08:00			`import string`
			`import operator`
			`from pushback_file import pushback_file`

			`# Type TAG Req? Default Description`
			`SAMHeaderFields = \`
			`[['HD', 'VN', True, '0.1.0', 'File format version'], \`
			`['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \`
			`['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \`
			`['SQ', 'LN', True, None, 'Sequence length'], \`
			`['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \`
			`['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \`
			`['SQ', 'UR', False, None, 'URI of the sequence'], \`
			`['SQ', 'SP', False, None, 'Species'], \`
			`['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \`
			`['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \`
			`['RG', 'LB', False, None, 'Library'], \`
			`['RG', 'DS', False, None, 'Description'], \`
			`['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \`
			`['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \`
			`['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \`
			`['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \`
			`['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \`
			`['PG', 'ID', True, 'ProgramA','Program name'], \`
			`['PG', 'VN', False, None, 'Program version'], \`
			`['PG', 'CL', False, None, 'Command line']]`

			`def SAMHeaderEncode( type, tag ):`
			`return type + '.' + tag`

			`SAMHeaderDict = dict()`
			`for record in SAMHeaderFields:`
			`type, tag, req, default, desc = record`
			`SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]`

			`# -----------------------------------------------------------------------------------------------`
			`#`
			`# A SAM header is a potentially complex object, so we just punt on it. The only really required`
			`# outputs that must be user specified are the SQ: SN and SQ: LN fields.`
			`# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields`
			`# data structure (defined above). All required fields are initialized to their default values`
			`#`
			`# -----------------------------------------------------------------------------------------------`
			`class SAMHeader(dict):`
			`def __init__(self, seqName, seqLen, **keys):`
			`# setup initial header`
			`self.fields = dict()`
			`for record in SAMHeaderFields:`
			`type, tag, req, default, desc = record`
			`self.setField( type, tag, default )`

			`self.setField('SQ', 'SN', seqName )`
			`self.setField('SQ', 'LN', seqLen )`

			`# add keyword arguments`
			`for key, value in keys.iteritems():`
			`type, tag = key.split('.')`
			`self.setField( type, tag, value )`

			`def isReq( self, type, tag ):`
			`return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]`

			`def setField( self, type, tag, value ):`
			`#print 'Setting', type, tag, value`
			`self.fields[SAMHeaderEncode(type, tag)] = value`

			`def getField( self, type, tag ):`
			`return self.fields[SAMHeaderEncode(type, tag)]`

			`def __str__( self ):`
			`types = ['HD', 'SQ', 'RG']`

			`def formatType( type ):`
			`s = '@' + type + '\t'`
			`for record in SAMHeaderFields:`
			`qtype, tag, req, default, desc = record`
			`if type == qtype and self.isReq(type, tag):`
			`value = self.getField(type, tag)`
			`if value == None:`
			`raise Error('Unset required SAM header ' + type + ' ' + tag)`
			`s += tag + ':' + str(value) + '\t'`
			`return s`
			`return string.join(map( formatType, types ),'\n')`

			`SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair`
			`SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1`
			`SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped`
			`SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1`
			`SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand)`
			`SAM_MATESTRAND = 0x0020 # strand of the mate 1`
			`SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2`
			`SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2`
			`SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records)`

Better merge support git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@748 348d0f76-0448-11de-a6fe-93d51630548a 2009-05-19 05:18:51 +08:00			`SAM_FLAGS = {`
			`SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair',`
			`SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1',`
			`SAM_UNMAPPED : 'the query sequence itself is unmapped',`
			`SAM_MATEUNMAPPED : 'the mate is unmapped 1',`
			`SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' ,`
			`SAM_MATESTRAND : 'strand of the mate 1' ,`
			`SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' ,`
			`SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2',`
			`SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'`
			`}`

Basic reorganization of tree git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@8 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-28 23:28:56 +08:00			`def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):`
			`r = SAMRecord()`
			`r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize )`
			`return r`

			`def SAMRecordFromString( str ):`
			`r = SAMRecord()`
			`r.setValuesFromString( str )`
			`return r`

			`def SAMFlagValue( flags, testFlag ):`
			`return testFlag & flags`
Better merge support git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@748 348d0f76-0448-11de-a6fe-93d51630548a 2009-05-19 05:18:51 +08:00
Basic reorganization of tree git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@8 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-28 23:28:56 +08:00			`def SAMFlagIsSet( flags, testFlag ):`
			`return SAMFlagValue(flags, testFlag) <> 0`

Better merge support git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@748 348d0f76-0448-11de-a6fe-93d51630548a 2009-05-19 05:18:51 +08:00			`def SAMFlagsDescs( flags ):`
			`def keepMe(p):`
			`flagKey, flagDesc = p`
			`return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]`
			`return sorted(map( keepMe, SAM_FLAGS.iteritems() ))`

Basic reorganization of tree git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@8 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-28 23:28:56 +08:00			`# -----------------------------------------------------------------------------------------------`
			`#`
			`# This is really the meat of the SAM I/O system.`
			`#`
			`# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list`
			`# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion`
			`# of int quals to ASCII encoded quals.`
			`#`
			`# -----------------------------------------------------------------------------------------------`
			`class SAMRecord:`
			`def __init__( self, vals = []):`
			`self.vals = []`

			`def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):`
			`self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \`
			`cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]`

			`def setValuesFromString( self, line ):`
			`#print 'line is', line`
			`formats = [str, int, str, int, int, str, str, int, int, str, str]`
			`self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )`

			`def formatFlags( self, flags ):`
			`b = reduce( operator.__or__, flags, 0 )`
			`#print 'FormatFlags', flags, b`
			`return b`

			`def getQName(self): return self.vals[0]`
			`def getFlags(self): return self.vals[1]`
			`def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)`
			`def getPos(self): return self.vals[3]`
			`def getMapq(self): return self.vals[4]`
			`def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"`
			`def getSeq(self): return self.vals[9]`
			`def getQuals(self): return self.fromASCII33(self.vals[10])`

			`def toASCII33( self, quals ):`
			`return string.join( map( lambda q: chr(q+33), quals ), '' )`

			`def fromASCII33( self, qualStr ):`
			`return map( lambda q: ord(q)-33, qualStr)`

			`def __str__( self ):`
			`return string.join( map( str, self.vals ), '\t')`

			`# -----------------------------------------------------------------------------------------------`
			`#`
			`# Wrapper class for reading and writing records to a SAM file`
			`#`
			`# -----------------------------------------------------------------------------------------------`
			`class SAMIO:`
			`def __init__( self, fileName, header = None, debugging = False, func = None ):`
			`self.header = header`
			`self.debugging = debugging`

			`if func == None:`
			`func = lambda x: x`
			`self.func = func`

			`if self.header == None:`
			`self.fileHandle = pushback_file(fileName, "r")`
			`self.readHeader()`
			`else:`
			`self.fileHandle = file(fileName, "w")`
			`self.writeHeader()`

			`def close(self):`
			`self.fileHandle.close()`

			`def readHeader(self):`
			`self.header = ""`
			`while True:`
			`line = self.fileHandle.readline()`
			`if line.startswith("@"):`
			`self.header += line`
			`else:`
			`self.fileHandle.pushback(line)`
			`if self.debugging:`
			`pass #print str(self.header)`
			`return True`

			`return self.header`

			`def RecordGenerator( self ):`
			`for line in self.fileHandle:`
			`#print 'Reading line', line`
			`line = line.rstrip("\n")`
			`yield self.func(SAMRecordFromString(line))`
			`raise StopIteration`
			`return`

			`def __iter__(self):`
			`return self.RecordGenerator()`

			`def writeRecord( self, recordIterator ):`
			`for record in recordIterator:`
			`self.writeRecord(record)`

			`def writeRecord( self, record ):`
			`if self.debugging:`
			`print record`
			`print >> self.fileHandle, record`
			`return True`

			`def writeHeader( self ):`
			`if self.debugging:`
			`print str(self.header)`
			`print >> self.fileHandle, str(self.header)`
			`return True`