gatk-3.8/playground/python/SimulateReads.py

#!/util/bin/python

from optparse import OptionParser
import os
import sys
#import set
import random
import string

from SAM import *

def all(iterable):
    for element in iterable:
        if not element:
            return False
    return True

def any(iterable):
    for element in iterable:
        if element:
            return True
    return False

OPTIONS = None

bases = set(['a', 't', 'g', 'c'])

class Mutation:
    """Representation of a change from one sequence to another"""

    SNP = "SNP"
    INSERTION = "Insertion"
    DELETION = "Deletion"
    TYPES = set([SNP, INSERTION, DELETION])

    CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }

    def __init__( self, contig, pos, type, original, mutated ):
        # example: chr20 123 SNP "a" "t"        <- a to t at base 123 on chr20
        # example: chr20 123 INSERTION "" "atg" <- insertion of 3 bases starting at pos 123
        # example: chr20 123 DELETION  "atg" "" <- deletion of 3 bases starting at pos 123
        self.contig = contig    # contig of the mutation
        self._pos = pos          # position of the change in contig coordinates
        self._type = type        # one of the TYPES
        assert(self._type in Mutation.TYPES)
        self.original = original    # String of the original genotype
        self.mutated  = mutated     # String of the mutated genotype

    def pos(self, offset=0): return self._pos - offset
    def type(self): return self._type
    def __len__(self):
        return max( len(self.mutated), len(self.original) )

    def mutString( self ):
        if self._type == Mutation.SNP:
            return 'P:%s->%s' % (self.original, self.mutated)
        elif self._type == Mutation.INSERTION:
            return 'I:%s' % (self.mutated)
        elif self._type == Mutation.DELETION:
            return 'D:%s' % (self.original)
        else:
            raise Exception('Unexpected mutation type ' + self._type)

    def CIGARChar( self ):
        return Mutation.CIGARChars[self.type()]

    def __str__( self ):
        return '%s:%d %s' % (self.contig, self._pos, self.mutString())

    def __repr__( self ):
        return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())

    def execute( self, offset, refseq, mutseq ):
        p = self.pos(offset)
        if self.type() == Mutation.SNP:
            mutseq[p] = self.mutated
        elif self.type() == Mutation.INSERTION:
            refseq[p:p] = string.join(['-'] * len(self),'')
            mutseq[p:p] = self.mutated
        elif self.type() == Mutation.DELETION:
            mutseq[p:p+len(self)] = string.join(['-'] * len(self), '')
            refseq.extend(self.mutated)
            mutseq.extend(self.mutated)

        #print refseq, mutseq
        return refseq, mutseq

# ---------------------------------------------------------------------------
#
# Code for dealing with CIGAR strings
#
# ---------------------------------------------------------------------------
def flatten(l):
    out = []
    for item in l:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out

def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
    if len(mutations) == 1 and mutations[0].type() <> Mutation.SNP:
        mut = mutations[0]
        internalPos = mut.pos() - alignStart

        leftLen = max(internalPos - readStart,0)
        mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
        rightLen = readLen - leftLen - mutLen
        #print mut, readStart, alignStart, internalPos, leftLen, mutLen, rightLen
        #print

        l = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ]
        l = filter( lambda x: x[0] > 0, l )
        return string.join(map(str, flatten(l)), '')

    if not all( map( lambda mut: mut.type() == Mutation.SNP, mutations ) ):
        # bail
        raise Exception('Currently only supports multiple SNPs CIGARS')

    return str(readLen) + 'M'

# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ):
#     sortedMutations = sorted(mutations, key=Mutation.pos)
#     CIGAR = []
#     pos = readStart
#
#     for mutation in sortedMutations:
#         internalMutationStart = mutation.pos() - refStart
#         if internalMutationStart >= readStart and internalMutationStart < readStop
#             delta = mutation.pos() - pos
#             if not OPTIONS.quiet:
#                 print 'mutationsCIGAR', pos, CIGAR, delta, mutation
#             if mutation.type() <> Mutation.SNP and delta > 0:
#                 CIGAR.append([delta, 'M'])
#                 pos = mutation.pos()
#
#             if mutation.type == Mutation.INSERTION:
#                 CIGAR.append(['I', len(mutation)])
#             if mutation.type == Mutation.DELETION:
#                 CIGAR.append(['D', len(mutation)])
#
#     delta = refSeqPos + refLen - pos
#     if not OPTIONS.quiet:
#         print 'mutationsCIGAR', pos, CIGAR, delta, mutation
#     if delta > 0:
#         CIGAR.append([delta, 'M'])
#
#     s = string.join(map( str, flatten(CIGAR) ), '')
#     print 'CIGAR:', CIGAR, s
#     return s

# ---------------------------------------------------------------------------
#
# mutating the reference
#
# ---------------------------------------------------------------------------
def executeMutations( mutations, seqIndex, refseq, mutseq ):
    for mutation in mutations:
        internalSite = mutation.pos() - seqIndex
        refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
    return refseq, mutseq

def isBadSequenceForSim(seq):
    if any( map( lambda b: b not in bases, seq.tostring().lower() ) ):
        return True
    else:
        return False


def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
    #print 'mutateReference'

    # start and stop
    refStartIndex = mutSite - nBasesToPad + 1
    internalStartIndex = nBasesToPad - 1

    if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
        refStopIndex = mutSite + nBasesToPad
    else:
        refStopIndex = mutSite + nBasesToPad - 1

    refsub = ref[refStartIndex : refStopIndex]
    mutseq = refsub.tomutable()
    refsub = refsub.tomutable()
    mutations = []

    def success():
        return True, mutations, refsub, mutseq
    def fail():
        return False, [], None, None

    if refStartIndex < 0:
        return fail()
    if isBadSequenceForSim(refsub):
        #print 'rejecting seq', refsub.tostring()
        #print map( lambda b: b not in bases, refsub.tostring().lower() )
        return fail()

    if not OPTIONS.quiet:
        print ref
        print refsub

    otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
    # print 'otherSites', otherSites
    for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
        internalSite = site - refStartIndex
        if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
            pass
        elif mutType == 'SNP':
            mutBase = ref[site].lower()
            otherBases = bases - set(mutBase)
            otherBase = random.choice(list(otherBases))
            assert mutBase <> otherBase
            if not OPTIONS.quiet:
                print mutBase, ' => ', otherBase
            mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
        elif mutType == 'INSERTION':
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
            mutations.append( mutation )
        elif mutType == 'DELETION':
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
            mutations.append( mutation )
        else:
            raise Exception('Unexpected mutation type', mutType)

    # process the mutations
    refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )

    return success()

# version 1.0 -- prototype
# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad):
#     #print 'mutateReference'
#     refStartIndex = mutSite - nBasesToPad + 1
#     refStopIndex = mutSite + nBasesToPad
#     internalStartIndex = nBasesToPad - 1
#
#     if refStartIndex < 0:
#         return False, None, None
#
#     refsub = ref[refStartIndex : refStopIndex]
#     print ref
#     print refsub
#
#     if mutType == 'SNP' or mutType == 'None':
#         #print '  In IF'
#         mutBase = ref[mutSite]
#         bases = set(['A', 'T', 'G', 'C'])
#         if mutBase in bases:
#             otherBases = bases - set(mutBase)
#             otherBase = random.choice(list(otherBases))
#             assert mutBase <> otherBase
#             mutseq = refsub.tomutable()
#
#             if mutType == 'SNP':
#                 print mutBase, ' => ', otherBase
#                 mutseq[internalStartIndex] = otherBase
#
#             print refsub
#             print mutseq
#             align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-'))
#             align.add_sequence("ref", refsub.tostring())
#             align.add_sequence("mut", mutseq.tostring())
#             print str(align)
#
#             return True, refsub, mutseq
#
#     return False, None, None

# ---------------------------------------------------------------------------
#
# sample reads from alignments
#
# ---------------------------------------------------------------------------
def accumulateBases( mutSeq, start, readLen ):
    count = 0
    for stop in range(start, len(mutSeq)):
        if notDash(mutSeq[stop]):
            count += 1
        #print stop, count
        if count == readLen:
            break

    stop += 1
    read = string.join(filter( notDash, mutSeq[start:stop] ), '')

    return read, stop

# doesn't support paired reads
#
# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations):
#     #print 'REF:', refSeq.tostring()
#     #print 'MUT:', mutSeq.tostring()
#     #print ''
#
#     # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
#     # right in the middle
#     lastGoodStartSite = readLen - 1
#     if not OPTIONS.quiet:
#         print refSeq.tostring(), 'ref'
#         print mutSeq.tostring(), 'mut'
#
#     # we can potentially start at any site in mutSeq
#     nPotentialStarts = len(mutSeq) - readLen + 1
#     potentialStarts = range(nPotentialStarts)
#
#     if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
#         if OPTIONS.uniqueReadsOnly:
#             starts = potentialStarts
#         else:
#             starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
#     else:
#         starts = random.sample(potentialStarts, nPotentialStarts)
#
#     #print 'potentialStarts', potentialStarts
#     #print 'Starts', starts
#
#     def sampleRead( start ):
#         if mutSeq[start] <> '-':
#             read, stop = accumulateBases( mutSeq, start, readLen )
#             remaining = len(mutSeq) - stop
#             refForRead = refSeq[start:stop]
#             #print 'XXXX', start, stop, refForRead, read
#             cigar = mutationsCIGAR( alignStart, readLen, mutations )
#             if not OPTIONS.quiet:
#                 leftPads = string.join(start * ['-'], '')
#                 rightPads = string.join(remaining * ['-'], '')
#                 print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar
#             return filter(notDash, read), alignStart + start, cigar
#         else:
#             return False, False, False
#
#     reads = map( sampleRead, starts )
#     return [read for read in reads if read[0] <> False]

class refSite:
    def __init__( self, refSeq, mutSeq, alignStart, mutations ):
        self.refSeq = refSeq
        self.mutSeq = mutSeq
        self.alignStart = alignStart
        self.mutations = mutations

def sample1Read( refSite, start, readLen, reverseComplementP = False ):
    if refSite.mutSeq[start] <> '-':
        read, stop = accumulateBases( refSite.mutSeq, start, readLen )

        mutSubSeq = refSite.mutSeq[start:stop]
        if reverseComplementP:
            s = Seq( read, IUPAC.unambiguous_dna )
            #print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
            read = s.reverse_complement().tostring()
            mutSubSeq.complement()

        remaining = len(refSite.mutSeq) - stop
        refForRead = refSite.refSeq[start:stop]
        #print 'XXXX', start, stop, refForRead, read
        cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
        leftPads = string.join(start * ['-'], '')
        rightPads = string.join(remaining * ['-'], '')
        ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
        return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
    else:
        return False, None, None, None, None

def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
    #print 'REF:', refSeq.tostring()
    #print 'MUT:', mutSeq.tostring()
    #print ''

    # pairedSite, mutations2, refSeq2, mutSeq2 can all be None

    refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
    refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )

    #print refSite1.alignStart, refSite2.alignStart, pairedOffset

    # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
    # right in the middle
    lastGoodStartSite = readLen - 1
    if not OPTIONS.quiet:
        if OPTIONS.generatePairedEnds:
            r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
            print r.tostring(), 'ref'
            print r.complement().tostring(), 'ref, complement'
        else:
            print refSeq.tostring(), 'ref'
            print mutSeq.tostring(), 'mut'

    # we can potentially start at any site in mutSeq
    nPotentialStarts = len(mutSeq) - readLen + 1
    potentialStarts = range(nPotentialStarts)
    #print 'potentialStarts', potentialStarts

    if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
        if OPTIONS.uniqueReadsOnly:
            starts = potentialStarts
        else:
            starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
    else:
        starts = random.sample(potentialStarts, int(nReads))

    #print 'Starts', starts

    def sampleRead( start ):
        good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )

        if OPTIONS.generatePairedEnds:
            good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
            if not OPTIONS.quiet:
                offsetDashes = string.join(['-'] * (pairedOffset), '')
                print
                print ppstr1 + offsetDashes, pos1, cigar1
                print offsetDashes + ppstr2, pos2, cigar2
            return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
        else:
            if not OPTIONS.quiet:
                print ppstr1, pos1, cigar1
            return [(read1, pos1, cigar1), (None, 0, None)]

    reads = map( sampleRead, starts )
    return [read for read in reads if read[0][0] <> None]

def notDash( c ):
    return c <> '-'

def fakeQuals( seq ):
    return len(seq) * [30]
    #return range(len(seq))

def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
    rname = fastaID
    mapq = 40
    quals = fakeQuals( read )

    qname = '%s:%d.read.%d' % (fastaID, siteID, readID)

    if not OPTIONS.generatePairedEnds:
        # not in paired mode
        flags = []
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
        record2 = None
    else:
        flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
        insertSize = pos2 - pos - len(read)
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
        record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )

        #print 'read1, read2', record1.getSeq(), record2.getSeq()

    return [record1, record2]

# ---------------------------------------------------------------------------
#
# paired end reads
#
# ---------------------------------------------------------------------------
def pairedReadSite( ref, leftMutSite, readLen ):
    pairedOffset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))

    def fail(): return False, None

    if pairedOffset < 0:
        return fail()

    pairedSite = pairedOffset + leftMutSite + readLen
    #print 'pairedSite', pairedOffset, leftMutSite, pairedSite

    if pairedSite + readLen >= len(ref):
        return fail()
    refsub = ref[pairedSite - readLen:pairedSite + readLen + 1]
    if isBadSequenceForSim( refsub ):
        return fail()

    return True, pairedSite


# ---------------------------------------------------------------------------
#
# build allowed regions and sampling starts from region
#
# ---------------------------------------------------------------------------
def parseSamplingRange( arg, ref, readLen ):
    # returns a list of the allowed regions to sample in the reference

    def one(x):
        elts = x.split()
        #print 'elts', elts
        if len(elts) > 0:
            elts1 = elts[0].split(':')
            #print 'elts1', elts1
            return map( int, elts1 )
        else:
            return False

    if arg <> None:
        try:
            return [ one(arg) ]
        except:
            # try to read it as a file
            return filter( None, map( one, open(arg).readlines() ) )
    else:
        return [[0, len(ref.seq) - readLen]]

def sampleStartSites( sampleRanges, nsites ):
    print 'sampleStartSites', sampleRanges, nsites

    # build a set of potential sites
    if len(sampleRanges) > 1:
        potentialSites = set()
        for start, end in sampleRanges:
            #print 'potentialSites', start, end, potentialSites
            potentialSites = potentialSites.union(set(xrange(start, end)))
    else:
        start, end = sampleRanges[0]
        potentialSites = xrange(start, end)

    #print 'potentialSites', potentialSites

    # choose sites from potentials
    if len(potentialSites) <= nsites:
        # tile every site
        sites = potentialSites
    else:
        # we need to sample from the potential sites
        sites = random.sample( potentialSites, nsites )

    # print 'Sites', sites
    print 'Number of potential start sites:', len(potentialSites)
    print 'Number of start sites that will be generated:', len(sites)
    return sites

def readRef(referenceFasta):
	handle = open(referenceFasta)
	for seq_record in SeqIO.parse(handle, "fasta"):
		print seq_record.id
		print seq_record.name
		#print repr(seq_record.seq)
		print len(seq_record.seq)
		yield seq_record
	handle.close()

import os.path
def outputFilename():
    root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0]
    if OPTIONS.generatePairedEnds:
        pairedP = 'Yes'
    else:
        pairedP = 'No'

    params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \
              ['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]]
    filename = root + '__' + string.join(map( lambda p: string.join(map(str, p),'.'), params), '_')
    root = os.path.join(OPTIONS.outputPrefix, filename)
    return root, root + '.sam'

def main():
    global OPTIONS, ROOT

    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                        type="string", default=None,
                        help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-o", "--outputPrefix", dest="outputPrefix",
                        type="string", default='',
                        help="Output Prefix SAM file")
    parser.add_option("-c", "--coverage", dest="coverage",
                        type="string", default='1',
                        help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.")
    parser.add_option("-n", "--nsites", dest="nSites",
                        type=int, default=1,
                        help="Number of sites to generate mutations in the reference")
    parser.add_option("-l", "--readlen", dest="readLen",
                        type=int, default=36,
                        help="Length of reads to generate")
    parser.add_option("-s", "--seed", dest="seed",
                        type=int, default=123456789,
                        help="Random number seed.  If 0, then system clock is used")
    parser.add_option("-t", "--muttype", dest="mutationType",
                        type="string", default='None',
                        help="Type of mutation to introduce from reference.  Can be None, SNP, insertion, or deletion")
    parser.add_option("-e", "--mutdensity", dest="mutationDensity",
                        type=int, default=1,
                        help="Number of mutations to introduce from the reference")
    parser.add_option("-x", "--range", dest="range",
                        type="string", default=None,
                        help="Sampling range restriction, in the form of x:y without a space")
    parser.add_option("-u", "--unique", dest="uniqueReadsOnly",
                        action='store_true', default=True,
                        help="Assumes that the user wants unique reads generated.  If the coverage is greater than the number of unique reads at the site, the region is just tiled")
    parser.add_option("-i", "--indelsize", dest="indelSize",
                        type=int, default=0,
                        help="Size in bp of the insertion or deletion to introduce")

    # Paired end options
    parser.add_option("", "--paired", dest="generatePairedEnds",
                        action='store_true', default=False,
                        help="Should we generate paired end reads?")
    parser.add_option("", "--insertSize", dest="insertSize",
                        type=int, default=280,
                        help="Size in bp of the paired library insertion")
    parser.add_option("", "--insertDev", dest="insertDev",
                        type=int, default=5,
                        help="Standard deviation in the size in bp of the paired library insertion")

    parser.add_option("-q", "--quiet", dest="quiet",
                        action='store_true', default=False,
                        help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                        action='store_true', default=False,
                        help="Verbose debugging output?")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")

    root, outputSAM = outputFilename()

    if OPTIONS.seed == 0:
        random.seed(None)
    else:
        random.seed(OPTIONS.seed)

    OPTIONS.mutationType = OPTIONS.mutationType.upper()

    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print '  ref:               ', OPTIONS.reference
    print '  output:            ', outputSAM
    print ''
    print '  mutation type:     ', OPTIONS.mutationType
    print '  mutation density:  ', OPTIONS.mutationDensity
    print '  indel size:        ', OPTIONS.indelSize
    print '  readLen:           ', OPTIONS.readLen
    print '  nSites:            ', OPTIONS.nSites
    print '  coverage:          ', OPTIONS.coverage
    print '  range restriction: ', OPTIONS.range
    print ''
    print '  paired ends?:      ', OPTIONS.generatePairedEnds
    print '  insert size:       ', OPTIONS.insertSize
    print '  insert stddev:     ', OPTIONS.insertDev
    print ''
    print '  Debugging?:        ', OPTIONS.debug
    print '------------------------------------------------------------'

    if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1:
        raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType)

    readLen = OPTIONS.readLen
    header = SAMHeader( "MarkD", readLen )
    SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug )

    mutationsout = open(root + '.mutations.txt', 'w')
    qualsout = open(root + '.quals.txt', 'w')
    refout = open(root + '.ref', 'w')

    fastaout = open(root + '.fasta', 'w')
    if OPTIONS.generatePairedEnds:
        fastaout2 = open(root + '.pairs.fasta', 'w')
        qualsout2 = open(root + '.pairs.quals.txt', 'w')

    counter = 0
    refLen = 0
    for ref in readRef(OPTIONS.reference):
        refLen = len(ref.seq)

        # write the crazy ref file info needed by samtools
        print >> refout, ref.id + '\t' + str(refLen)

        sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen )
        sites = sampleStartSites( sampleRanges, OPTIONS.nSites )

        for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ):
            #print 'Sampling site:', mutSite, refLen
            #mutSite = readLen-1
            nSitesSelected = max(int(round(len(sites)/100.0)), 1)
            #print siteCounter, len(sites), nSitesSelected)
            if siteCounter % nSitesSelected == 0:
                print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites))

            good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen)

            pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None
            if good and OPTIONS.generatePairedEnds:
                if OPTIONS.mutationType == 'SNP':
                    pairedMutType = 'SNP'
                else:
                    pairedMutType = 'NONE'

                good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen )
                print 'pairedReadSite', good, pairedSite, good
                if good:
                    good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen)

            #print 'Good', good, mutations, refSeq, mutSeq
            if good:
                print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t')
                #print 'Mutations', mutations

                refSeqPos = mutSite - readLen + 1
                readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2)
                for i in range(len(readPairs)):
                    counter += 1

                    read1, read2 = readPairs[i]
                    seq, pos, cigar = read1
                    seq2, pos2, cigar2 = read2

                    # write out the sam and fasta files
                    def write1( record, recordi ):
                        if record <> None:
                            SAMout.writeRecord( record )
                            sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')]
                            #print sequences

                            if recordi % 2 == 0:
                                localFastaOut, localQualsOut = fastaout, qualsout
                            else:
                                localFastaOut, localQualsOut = fastaout2, qualsout2

                            SeqIO.write(sequences, localFastaOut, "fasta")
                            print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ')

                    #print i, read1, read2
                    records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 )
                    map( write1, records, range( len(records) ) )

    SAMout.close()
    qualsout.close()
    fastaout.close()
    refout.close()
    mutationsout.close()

    if OPTIONS.generatePairedEnds:
        qualsout2.close()
        fastaout2.close()

#    for record in SAMIO( outputSAM ):
#        print record

if __name__ == "__main__":
    import Bio
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped
    main()