gatk-3.8/playground/python/SimulateReads.py

754 lines
29 KiB
Python
Executable File

#!/util/bin/python
from optparse import OptionParser
import os
import sys
#import set
import random
import string
from SAM import *
def all(iterable):
for element in iterable:
if not element:
return False
return True
def any(iterable):
for element in iterable:
if element:
return True
return False
OPTIONS = None
bases = set(['a', 't', 'g', 'c'])
class Mutation:
"""Representation of a change from one sequence to another"""
SNP = "SNP"
INSERTION = "Insertion"
DELETION = "Deletion"
TYPES = set([SNP, INSERTION, DELETION])
CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }
def __init__( self, contig, pos, type, original, mutated ):
# example: chr20 123 SNP "a" "t" <- a to t at base 123 on chr20
# example: chr20 123 INSERTION "" "atg" <- insertion of 3 bases starting at pos 123
# example: chr20 123 DELETION "atg" "" <- deletion of 3 bases starting at pos 123
self.contig = contig # contig of the mutation
self._pos = pos # position of the change in contig coordinates
self._type = type # one of the TYPES
assert(self._type in Mutation.TYPES)
self.original = original # String of the original genotype
self.mutated = mutated # String of the mutated genotype
def pos(self, offset=0): return self._pos - offset
def type(self): return self._type
def __len__(self):
return max( len(self.mutated), len(self.original) )
def mutString( self ):
if self._type == Mutation.SNP:
return 'P:%s->%s' % (self.original, self.mutated)
elif self._type == Mutation.INSERTION:
return 'I:%s' % (self.mutated)
elif self._type == Mutation.DELETION:
return 'D:%s' % (self.original)
else:
raise Exception('Unexpected mutation type ' + self._type)
def CIGARChar( self ):
return Mutation.CIGARChars[self.type()]
def __str__( self ):
return '%s:%d %s' % (self.contig, self._pos, self.mutString())
def __repr__( self ):
return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())
def execute( self, offset, refseq, mutseq ):
p = self.pos(offset)
if self.type() == Mutation.SNP:
mutseq[p] = self.mutated
elif self.type() == Mutation.INSERTION:
refseq[p:p] = string.join(['-'] * len(self),'')
mutseq[p:p] = self.mutated
elif self.type() == Mutation.DELETION:
mutseq[p:p+len(self)] = string.join(['-'] * len(self), '')
refseq.extend(self.mutated)
mutseq.extend(self.mutated)
#print refseq, mutseq
return refseq, mutseq
# ---------------------------------------------------------------------------
#
# Code for dealing with CIGAR strings
#
# ---------------------------------------------------------------------------
def flatten(l):
out = []
for item in l:
if isinstance(item, (list, tuple)):
out.extend(flatten(item))
else:
out.append(item)
return out
def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
if len(mutations) == 1 and mutations[0].type() <> Mutation.SNP:
mut = mutations[0]
internalPos = mut.pos() - alignStart
leftLen = max(internalPos - readStart,0)
mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
rightLen = readLen - leftLen - mutLen
#print mut, readStart, alignStart, internalPos, leftLen, mutLen, rightLen
#print
l = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ]
l = filter( lambda x: x[0] > 0, l )
return string.join(map(str, flatten(l)), '')
if not all( map( lambda mut: mut.type() == Mutation.SNP, mutations ) ):
# bail
raise Exception('Currently only supports multiple SNPs CIGARS')
return str(readLen) + 'M'
# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ):
# sortedMutations = sorted(mutations, key=Mutation.pos)
# CIGAR = []
# pos = readStart
#
# for mutation in sortedMutations:
# internalMutationStart = mutation.pos() - refStart
# if internalMutationStart >= readStart and internalMutationStart < readStop
# delta = mutation.pos() - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if mutation.type() <> Mutation.SNP and delta > 0:
# CIGAR.append([delta, 'M'])
# pos = mutation.pos()
#
# if mutation.type == Mutation.INSERTION:
# CIGAR.append(['I', len(mutation)])
# if mutation.type == Mutation.DELETION:
# CIGAR.append(['D', len(mutation)])
#
# delta = refSeqPos + refLen - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if delta > 0:
# CIGAR.append([delta, 'M'])
#
# s = string.join(map( str, flatten(CIGAR) ), '')
# print 'CIGAR:', CIGAR, s
# return s
# ---------------------------------------------------------------------------
#
# mutating the reference
#
# ---------------------------------------------------------------------------
def executeMutations( mutations, seqIndex, refseq, mutseq ):
for mutation in mutations:
internalSite = mutation.pos() - seqIndex
refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
return refseq, mutseq
def isBadSequenceForSim(seq):
if any( map( lambda b: b not in bases, seq.tostring().lower() ) ):
return True
else:
return False
def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
#print 'mutateReference'
# start and stop
refStartIndex = mutSite - nBasesToPad + 1
internalStartIndex = nBasesToPad - 1
if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
refStopIndex = mutSite + nBasesToPad
else:
refStopIndex = mutSite + nBasesToPad - 1
refsub = ref[refStartIndex : refStopIndex]
mutseq = refsub.tomutable()
refsub = refsub.tomutable()
mutations = []
def success():
return True, mutations, refsub, mutseq
def fail():
return False, [], None, None
if refStartIndex < 0:
return fail()
if isBadSequenceForSim(refsub):
#print 'rejecting seq', refsub.tostring()
#print map( lambda b: b not in bases, refsub.tostring().lower() )
return fail()
if not OPTIONS.quiet:
print ref
print refsub
otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
# print 'otherSites', otherSites
for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
internalSite = site - refStartIndex
if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
pass
elif mutType == 'SNP':
mutBase = ref[site].lower()
otherBases = bases - set(mutBase)
otherBase = random.choice(list(otherBases))
assert mutBase <> otherBase
if not OPTIONS.quiet:
print mutBase, ' => ', otherBase
mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
elif mutType == 'INSERTION':
inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
mutations.append( mutation )
elif mutType == 'DELETION':
inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
mutations.append( mutation )
else:
raise Exception('Unexpected mutation type', mutType)
# process the mutations
refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )
return success()
# version 1.0 -- prototype
# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad):
# #print 'mutateReference'
# refStartIndex = mutSite - nBasesToPad + 1
# refStopIndex = mutSite + nBasesToPad
# internalStartIndex = nBasesToPad - 1
#
# if refStartIndex < 0:
# return False, None, None
#
# refsub = ref[refStartIndex : refStopIndex]
# print ref
# print refsub
#
# if mutType == 'SNP' or mutType == 'None':
# #print ' In IF'
# mutBase = ref[mutSite]
# bases = set(['A', 'T', 'G', 'C'])
# if mutBase in bases:
# otherBases = bases - set(mutBase)
# otherBase = random.choice(list(otherBases))
# assert mutBase <> otherBase
# mutseq = refsub.tomutable()
#
# if mutType == 'SNP':
# print mutBase, ' => ', otherBase
# mutseq[internalStartIndex] = otherBase
#
# print refsub
# print mutseq
# align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-'))
# align.add_sequence("ref", refsub.tostring())
# align.add_sequence("mut", mutseq.tostring())
# print str(align)
#
# return True, refsub, mutseq
#
# return False, None, None
# ---------------------------------------------------------------------------
#
# sample reads from alignments
#
# ---------------------------------------------------------------------------
def accumulateBases( mutSeq, start, readLen ):
count = 0
for stop in range(start, len(mutSeq)):
if notDash(mutSeq[stop]):
count += 1
#print stop, count
if count == readLen:
break
stop += 1
read = string.join(filter( notDash, mutSeq[start:stop] ), '')
return read, stop
# doesn't support paired reads
#
# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations):
# #print 'REF:', refSeq.tostring()
# #print 'MUT:', mutSeq.tostring()
# #print ''
#
# # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
# # right in the middle
# lastGoodStartSite = readLen - 1
# if not OPTIONS.quiet:
# print refSeq.tostring(), 'ref'
# print mutSeq.tostring(), 'mut'
#
# # we can potentially start at any site in mutSeq
# nPotentialStarts = len(mutSeq) - readLen + 1
# potentialStarts = range(nPotentialStarts)
#
# if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
# if OPTIONS.uniqueReadsOnly:
# starts = potentialStarts
# else:
# starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
# else:
# starts = random.sample(potentialStarts, nPotentialStarts)
#
# #print 'potentialStarts', potentialStarts
# #print 'Starts', starts
#
# def sampleRead( start ):
# if mutSeq[start] <> '-':
# read, stop = accumulateBases( mutSeq, start, readLen )
# remaining = len(mutSeq) - stop
# refForRead = refSeq[start:stop]
# #print 'XXXX', start, stop, refForRead, read
# cigar = mutationsCIGAR( alignStart, readLen, mutations )
# if not OPTIONS.quiet:
# leftPads = string.join(start * ['-'], '')
# rightPads = string.join(remaining * ['-'], '')
# print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar
# return filter(notDash, read), alignStart + start, cigar
# else:
# return False, False, False
#
# reads = map( sampleRead, starts )
# return [read for read in reads if read[0] <> False]
class refSite:
def __init__( self, refSeq, mutSeq, alignStart, mutations ):
self.refSeq = refSeq
self.mutSeq = mutSeq
self.alignStart = alignStart
self.mutations = mutations
def sample1Read( refSite, start, readLen, reverseComplementP = False ):
if refSite.mutSeq[start] <> '-':
read, stop = accumulateBases( refSite.mutSeq, start, readLen )
mutSubSeq = refSite.mutSeq[start:stop]
if reverseComplementP:
s = Seq( read, IUPAC.unambiguous_dna )
#print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
read = s.reverse_complement().tostring()
mutSubSeq.complement()
remaining = len(refSite.mutSeq) - stop
refForRead = refSite.refSeq[start:stop]
#print 'XXXX', start, stop, refForRead, read
cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
leftPads = string.join(start * ['-'], '')
rightPads = string.join(remaining * ['-'], '')
ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
else:
return False, None, None, None, None
def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
#print 'REF:', refSeq.tostring()
#print 'MUT:', mutSeq.tostring()
#print ''
# pairedSite, mutations2, refSeq2, mutSeq2 can all be None
refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )
#print refSite1.alignStart, refSite2.alignStart, pairedOffset
# we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
# right in the middle
lastGoodStartSite = readLen - 1
if not OPTIONS.quiet:
if OPTIONS.generatePairedEnds:
r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
print r.tostring(), 'ref'
print r.complement().tostring(), 'ref, complement'
else:
print refSeq.tostring(), 'ref'
print mutSeq.tostring(), 'mut'
# we can potentially start at any site in mutSeq
nPotentialStarts = len(mutSeq) - readLen + 1
potentialStarts = range(nPotentialStarts)
#print 'potentialStarts', potentialStarts
if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
if OPTIONS.uniqueReadsOnly:
starts = potentialStarts
else:
starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
else:
starts = random.sample(potentialStarts, int(nReads))
#print 'Starts', starts
def sampleRead( start ):
good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )
if OPTIONS.generatePairedEnds:
good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
if not OPTIONS.quiet:
offsetDashes = string.join(['-'] * (pairedOffset), '')
print
print ppstr1 + offsetDashes, pos1, cigar1
print offsetDashes + ppstr2, pos2, cigar2
return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
else:
if not OPTIONS.quiet:
print ppstr1, pos1, cigar1
return [(read1, pos1, cigar1), (None, 0, None)]
reads = map( sampleRead, starts )
return [read for read in reads if read[0][0] <> None]
def notDash( c ):
return c <> '-'
def fakeQuals( seq ):
return len(seq) * [30]
#return range(len(seq))
def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
rname = fastaID
mapq = 40
quals = fakeQuals( read )
qname = '%s:%d.read.%d' % (fastaID, siteID, readID)
if not OPTIONS.generatePairedEnds:
# not in paired mode
flags = []
record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
record2 = None
else:
flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
insertSize = pos2 - pos - len(read)
record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )
#print 'read1, read2', record1.getSeq(), record2.getSeq()
return [record1, record2]
# ---------------------------------------------------------------------------
#
# paired end reads
#
# ---------------------------------------------------------------------------
def pairedReadSite( ref, leftMutSite, readLen ):
pairedOffset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))
def fail(): return False, None
if pairedOffset < 0:
return fail()
pairedSite = pairedOffset + leftMutSite + readLen
#print 'pairedSite', pairedOffset, leftMutSite, pairedSite
if pairedSite + readLen >= len(ref):
return fail()
refsub = ref[pairedSite - readLen:pairedSite + readLen + 1]
if isBadSequenceForSim( refsub ):
return fail()
return True, pairedSite
# ---------------------------------------------------------------------------
#
# build allowed regions and sampling starts from region
#
# ---------------------------------------------------------------------------
def parseSamplingRange( arg, ref, readLen ):
# returns a list of the allowed regions to sample in the reference
def one(x):
elts = x.split()
#print 'elts', elts
if len(elts) > 0:
elts1 = elts[0].split(':')
#print 'elts1', elts1
return map( int, elts1 )
else:
return False
if arg <> None:
try:
return [ one(arg) ]
except:
# try to read it as a file
return filter( None, map( one, open(arg).readlines() ) )
else:
return [[0, len(ref.seq) - readLen]]
def sampleStartSites( sampleRanges, nsites ):
print 'sampleStartSites', sampleRanges, nsites
# build a set of potential sites
if len(sampleRanges) > 1:
potentialSites = set()
for start, end in sampleRanges:
#print 'potentialSites', start, end, potentialSites
potentialSites = potentialSites.union(set(xrange(start, end)))
else:
start, end = sampleRanges[0]
potentialSites = xrange(start, end)
#print 'potentialSites', potentialSites
# choose sites from potentials
if len(potentialSites) <= nsites:
# tile every site
sites = potentialSites
else:
# we need to sample from the potential sites
sites = random.sample( potentialSites, nsites )
# print 'Sites', sites
print 'Number of potential start sites:', len(potentialSites)
print 'Number of start sites that will be generated:', len(sites)
return sites
def readRef(referenceFasta):
handle = open(referenceFasta)
for seq_record in SeqIO.parse(handle, "fasta"):
print seq_record.id
print seq_record.name
#print repr(seq_record.seq)
print len(seq_record.seq)
yield seq_record
handle.close()
import os.path
def outputFilename():
root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0]
if OPTIONS.generatePairedEnds:
pairedP = 'Yes'
else:
pairedP = 'No'
params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \
['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]]
filename = root + '__' + string.join(map( lambda p: string.join(map(str, p),'.'), params), '_')
root = os.path.join(OPTIONS.outputPrefix, filename)
return root, root + '.sam'
def main():
global OPTIONS, ROOT
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-r", "--ref", dest="reference",
type="string", default=None,
help="Reference DNA seqeunce in FASTA format")
parser.add_option("-o", "--outputPrefix", dest="outputPrefix",
type="string", default='',
help="Output Prefix SAM file")
parser.add_option("-c", "--coverage", dest="coverage",
type="string", default='1',
help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.")
parser.add_option("-n", "--nsites", dest="nSites",
type=int, default=1,
help="Number of sites to generate mutations in the reference")
parser.add_option("-l", "--readlen", dest="readLen",
type=int, default=36,
help="Length of reads to generate")
parser.add_option("-s", "--seed", dest="seed",
type=int, default=123456789,
help="Random number seed. If 0, then system clock is used")
parser.add_option("-t", "--muttype", dest="mutationType",
type="string", default='None',
help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion")
parser.add_option("-e", "--mutdensity", dest="mutationDensity",
type=int, default=1,
help="Number of mutations to introduce from the reference")
parser.add_option("-x", "--range", dest="range",
type="string", default=None,
help="Sampling range restriction, in the form of x:y without a space")
parser.add_option("-u", "--unique", dest="uniqueReadsOnly",
action='store_true', default=True,
help="Assumes that the user wants unique reads generated. If the coverage is greater than the number of unique reads at the site, the region is just tiled")
parser.add_option("-i", "--indelsize", dest="indelSize",
type=int, default=0,
help="Size in bp of the insertion or deletion to introduce")
# Paired end options
parser.add_option("", "--paired", dest="generatePairedEnds",
action='store_true', default=False,
help="Should we generate paired end reads?")
parser.add_option("", "--insertSize", dest="insertSize",
type=int, default=280,
help="Size in bp of the paired library insertion")
parser.add_option("", "--insertDev", dest="insertDev",
type=int, default=5,
help="Standard deviation in the size in bp of the paired library insertion")
parser.add_option("-q", "--quiet", dest="quiet",
action='store_true', default=False,
help="Be quiet when generating output")
parser.add_option("-d", "--debug", dest="debug",
action='store_true', default=False,
help="Verbose debugging output?")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
root, outputSAM = outputFilename()
if OPTIONS.seed == 0:
random.seed(None)
else:
random.seed(OPTIONS.seed)
OPTIONS.mutationType = OPTIONS.mutationType.upper()
print '------------------------------------------------------------'
print 'program:', sys.argv[0]
print ''
print ' ref: ', OPTIONS.reference
print ' output: ', outputSAM
print ''
print ' mutation type: ', OPTIONS.mutationType
print ' mutation density: ', OPTIONS.mutationDensity
print ' indel size: ', OPTIONS.indelSize
print ' readLen: ', OPTIONS.readLen
print ' nSites: ', OPTIONS.nSites
print ' coverage: ', OPTIONS.coverage
print ' range restriction: ', OPTIONS.range
print ''
print ' paired ends?: ', OPTIONS.generatePairedEnds
print ' insert size: ', OPTIONS.insertSize
print ' insert stddev: ', OPTIONS.insertDev
print ''
print ' Debugging?: ', OPTIONS.debug
print '------------------------------------------------------------'
if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1:
raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType)
readLen = OPTIONS.readLen
header = SAMHeader( "MarkD", readLen )
SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug )
mutationsout = open(root + '.mutations.txt', 'w')
qualsout = open(root + '.quals.txt', 'w')
refout = open(root + '.ref', 'w')
fastaout = open(root + '.fasta', 'w')
if OPTIONS.generatePairedEnds:
fastaout2 = open(root + '.pairs.fasta', 'w')
qualsout2 = open(root + '.pairs.quals.txt', 'w')
counter = 0
refLen = 0
for ref in readRef(OPTIONS.reference):
refLen = len(ref.seq)
# write the crazy ref file info needed by samtools
print >> refout, ref.id + '\t' + str(refLen)
sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen )
sites = sampleStartSites( sampleRanges, OPTIONS.nSites )
for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ):
#print 'Sampling site:', mutSite, refLen
#mutSite = readLen-1
nSitesSelected = max(int(round(len(sites)/100.0)), 1)
#print siteCounter, len(sites), nSitesSelected)
if siteCounter % nSitesSelected == 0:
print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites))
good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen)
pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None
if good and OPTIONS.generatePairedEnds:
if OPTIONS.mutationType == 'SNP':
pairedMutType = 'SNP'
else:
pairedMutType = 'NONE'
good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen )
print 'pairedReadSite', good, pairedSite, good
if good:
good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen)
#print 'Good', good, mutations, refSeq, mutSeq
if good:
print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t')
#print 'Mutations', mutations
refSeqPos = mutSite - readLen + 1
readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2)
for i in range(len(readPairs)):
counter += 1
read1, read2 = readPairs[i]
seq, pos, cigar = read1
seq2, pos2, cigar2 = read2
# write out the sam and fasta files
def write1( record, recordi ):
if record <> None:
SAMout.writeRecord( record )
sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')]
#print sequences
if recordi % 2 == 0:
localFastaOut, localQualsOut = fastaout, qualsout
else:
localFastaOut, localQualsOut = fastaout2, qualsout2
SeqIO.write(sequences, localFastaOut, "fasta")
print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ')
#print i, read1, read2
records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 )
map( write1, records, range( len(records) ) )
SAMout.close()
qualsout.close()
fastaout.close()
refout.close()
mutationsout.close()
if OPTIONS.generatePairedEnds:
qualsout2.close()
fastaout2.close()
# for record in SAMIO( outputSAM ):
# print record
if __name__ == "__main__":
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC, Gapped
main()