gatk-3.8/python/compSNPCalls.py

#!/usr/bin/env python

import sys, string
import os
import re
from itertools import *
from optparse import OptionParser
from memo import DiskMemoize, time_func

class ref_genome:
    """Reads a reference genome in FASTA format into a list of sequences.

    The sequences are stored on the *class* attribute ``chr_offset`` (so the
    most recently constructed instance wins) and are reached through
    ``genome[index]``.  Every stored sequence starts with a single space so
    that base positions are 1-indexed.

    Slot layout: a leading ``>chrM`` record does not open a new slot, so its
    bases land in slot 0; every other header closes the current slot and
    opens the next one.  If the first header is not ``>chrM``, slot 0 holds
    only the padding space.
    NOTE(review): a ``>chrM`` header that is *not* the first record is still
    skipped, so its bases are appended to the preceding contig -- this
    mirrors the historical behaviour and assumes chrM comes first.
    """

    # Minimum number of slots kept in chr_offset; unused slots hold [].
    MIN_SLOTS = 45

    def __init__(self, ref_genome_file):
        """Load ``ref_genome_file`` and echo each FASTA header line to stdout."""
        sequences = []
        # Leading space makes every contig (including a leading chrM)
        # 1-indexed.
        chunks = [" "]
        with open(ref_genome_file) as fasta:
            for line in fasta:
                if line.startswith(">"):
                    # Header lines keep their own newline, so write verbatim.
                    sys.stdout.write(line[1:])
                    if line.startswith(">chrM"):  # skip first > line
                        continue
                    sequences.append("".join(chunks))
                    chunks = [" "]
                else:
                    chunks.append(line.rstrip().upper())
        sequences.append("".join(chunks))

        # Keep at least MIN_SLOTS entries; the list now grows past that
        # instead of raising IndexError on references with more contigs.
        while len(sequences) < ref_genome.MIN_SLOTS:
            sequences.append([])
        ref_genome.chr_offset = sequences

    def __getitem__(self, key):
        """Return the (1-indexed, space-padded) sequence stored in slot ``key``."""
        return ref_genome.chr_offset[key]

# Maps a standard chromosome name ('1'..'22', 'MT', 'X', 'Y') to the integer
# slot used to index the reference genome.
AffyChr2Index = dict((str(number), number) for number in range(1, 23))
AffyChr2Index.update({'MT': 0, 'X': 23, 'Y': 24})

class GenotypeCall:
    """A genotype call at a single genomic site.

    Attributes:
        chr:      chromosome name as given by the caller.
        pos:      position, coerced to int.
        genotype: upper-cased bases, sorted and joined with '/', e.g. 'A/C'.
        lod:      confidence score supplied by the caller.

    Two calls hash and compare equal when they refer to the same site
    ("chr:pos"), regardless of genotype.

    ``refbase()`` (and therefore ``ref_het_hom()`` for homozygous calls)
    requires the class attribute ``GenotypeCall.ref`` to be set to an object
    indexable as ``ref[chromosome_index][position]``; the assignment below is
    commented out, so callers must provide it.
    """
    #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")

    def __init__( self, chr, pos, genotype, snpP, lod ):
        self.chr = chr
        self.pos = int(pos)
        self._site = chr + ':' + str(self.pos)
        self._isSNP = snpP
        # Upper-case *before* sorting so mixed-case input ('aT') yields the
        # same canonical string ('A/T') as upper-case input.
        self.genotype = '/'.join(sorted(base.upper() for base in genotype))
        self.lod = lod

    def refbase(self):
        """Return the reference base at this site (needs GenotypeCall.ref)."""
        return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]

    def __hash__(self):
        return hash(self._site)

    def __eq__(self, other):
        if not isinstance(other, GenotypeCall):
            return NotImplemented
        return self._site == other._site

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def site(self): return self._site
    def isSNP(self): return self._isSNP

    def ref_het_hom(self):
        """Classify the call: 0 = hom-ref, 1 = het, 2 = hom non-ref."""
        if self.isHET():
            return 1 # het(erozygous non-ref)
        # homozygous something; refbase must be *called* -- comparing against
        # the bound method itself is always False.
        if self.genotype[0] == self.refbase():
            return 0 # ref
        return 2 # hom(ozygous non-ref)

    # genotype is 'X/Y...', so indices 0 and 2 are the first two bases.
    def isHET(self): return self.genotype[0] != self.genotype[2]
    def isHOM(self): return self.genotype[0] == self.genotype[2]

    def __str__(self):
        return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
        
# One-letter MAQ consensus code -> list of bases it stands for.  Plain bases
# denote a homozygous call (both alleles listed); the remaining IUB codes
# list every base the code can represent.
MAQGenotypeEncoding = dict(
    (code, list(bases)) for code, bases in (
        ('A', 'AA'),
        ('C', 'CC'),
        ('T', 'TT'),
        ('G', 'GG'),
        ('M', 'AC'),
        ('K', 'GT'),
        ('Y', 'CT'),
        ('R', 'AG'),
        ('W', 'AT'),
        ('S', 'CG'),
        ('D', 'AGT'),
        ('B', 'CGT'),
        ('H', 'ACT'),
        ('V', 'ACG'),
        ('N', 'ACGT'),
    )
)

# MAQ chromosome name ('chr1'..'chr22', 'chrM', 'chrX', 'chrY') -> standard
# name ('1'..'22', 'MT', 'X', 'Y').
MAQ2STDChr = dict(('chr' + str(number), str(number)) for number in range(1, 23))
MAQ2STDChr.update({'chrM': 'MT', 'chrX': 'X', 'chrY': 'Y'})

def convertMAQChr(maqChr):
    """Return the standard chromosome name for a MAQ name, or '?' if unknown."""
    return MAQ2STDChr.get(maqChr, '?')

def convertMAQGenotype( oneBaseCode ):
    """Expand a one-letter MAQ consensus code into its list of bases.

    Raises KeyError for a code that is not in MAQGenotypeEncoding.
    """
    bases = MAQGenotypeEncoding[oneBaseCode]
    return bases

def internalReadSNPFile( parse1, filename ):
    """Parse ``filename`` line by line and collect the accepted genotype calls.

    parse1:   callable taking one text line and returning a call object, or a
              falsy value when the line should be dropped.
    filename: path of the SNP file to read.

    Reading stops once more than OPTIONS.debug_lines calls have been
    collected.  Prints the number of calls kept and returns them as a list.
    """
    result = []
    # 'with' guarantees the handle is closed, including on the early break
    # and when parse1 raises.
    with open(filename) as snpFile:
        for line in snpFile:
            snp = parse1(line)
            if snp:
                result.append(snp)
            if len(result) > OPTIONS.debug_lines:
                break

    print("%d genotypes extracted" % len(result))
    return result

def snpMAP( snps ):
    """Index calls by site: returns {snp.site(): snp}.

    When several calls share a site, the last one in ``snps`` is kept.
    """
    return dict((call.site(), call) for call in snps)

def overlappingSites( snps1, snps2 ):
    """Return the set of sites present in both call lists.

    Also prints the number of distinct sites in each list, the number of
    shared sites, and then every call in ``snps1`` (one per line).
    """
    bySite1 = snpMAP(snps1)
    bySite2 = snpMAP(snps2)
    common = set(bySite1).intersection(bySite2)
    print('Number of snp1 records %d' % len(bySite1))
    print('Number of snp2 records %d' % len(bySite2))
    print('Number of shared sites %d' % len(common))
    print("\n".join([str(call) for call in snps1]))
    return common

def readMAQSNPs(filename):
    """Read confident genotype calls from a MAQ cns2snp output file.

    A line is kept when its consensus quality is at least 10 * OPTIONS.lod;
    the stored LOD is that quality divided by 10.  Blank lines are skipped.
    Returns the list produced by internalReadSNPFile.
    """
    # Each line consists of:
    #  chromosome
    #  position
    #  reference base
    #  consensus base
    #  Phred-like consensus quality
    #  read depth
    #  the average number of hits of reads covering this position
    #  the highest mapping quality of the reads covering the position
    #  the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
    #  the second best call
    #  log likelihood ratio of the second best and the third best call
    #  and the third best call.
    #
    # Also, note that:
    #
    #      What do those "S", "M" and so on mean in the cns2snp output?
    #      They are IUB codes for heterozygotes. Briefly:
    #
    #      M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
    def read1(line):
        fields = line.split()
        if not fields:
            # Blank line: drop it instead of crashing while converting
            # missing columns.
            return False

        chrom = fields[0]
        pos = int(fields[1])
        refBase = fields[2]
        consensus = fields[3]
        consensusQual = int(fields[4])

        if consensusQual < 10 * OPTIONS.lod:
            return False
        return GenotypeCall( convertMAQChr(chrom), pos, convertMAQGenotype(consensus),
                             refBase != consensus, consensusQual / 10.0 )

    return internalReadSNPFile( read1, filename )

# Parsed command-line options shared by the reader and concordance functions;
# main() rebinds this to the optparse result before any of them run.
OPTIONS = None

def MerlinChr( index ):
    """Convert a Merlin chromosome index to its standard name.

    0 -> 'MT', 23 -> 'X', 24 -> 'Y'; any other index is returned as str(index).
    """
    specialNames = {0: 'MT', 23: 'X', 24: 'Y'}
    if index in specialNames:
        return specialNames[index]
    return str(index)
    
def readMerlinSNPs(filename):
    """Read confident SNP calls from a Merlin genotype file.

    A line is kept when its class column contains '-SNP' and its bestVsRef
    score is at least OPTIONS.lod; that score becomes the call's LOD.
    Blank lines are skipped.  Returns the list produced by
    internalReadSNPFile.
    """
    # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
    # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
    # chr:pos ref genotype bestVsRef bestVsNextBest class ...
    def read1(line):
        fields = line.split()
        if not fields:
            # Blank line: drop it instead of crashing while converting
            # missing columns.
            return False

        site = fields[0].split(':')
        genotype = sorted(fields[2])
        bestVsRef = float(fields[3])
        isSNP = '-SNP' in fields[5]

        if not (bestVsRef >= OPTIONS.lod and isSNP):
            return False
        # Position is shifted by one -- presumably Merlin positions are
        # 0-based while GenotypeCall sites are 1-based; TODO confirm.
        return GenotypeCall( MerlinChr(int(site[0])), int(site[1]) + 1, genotype, isSNP, bestVsRef )

    return internalReadSNPFile( read1, filename )
    
def readSNPfile( filename, format ):
    """Read a SNP file with the reader matching ``format`` (case-insensitive).

    Supported formats are 'merlin' and 'maq'; anything else raises Exception.
    Returns the calls as a list.
    """
    readers = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs }
    reader = readers.get(format.lower())
    if reader is None:
        raise Exception('Unknown SNP file format ' + format)
    return list(reader(filename))
    
def readAffyFile(filename):
    """Read Affy truth genotypes and return them as a list of GenotypeCall.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored.  Every call is created with isSNP=False and a LOD of
    100.  Reading stops once the 0-based line index exceeds
    OPTIONS.debug_lines; the index is printed every 10000 lines as a
    progress indicator.
    """
    # chrom	position	genotype	probe_set_id	dbsnp_id
    # 1	84647761		TC	SNP_A-1780419	rs6576700
    # 5	156323558		GG	SNP_A-1780418	rs17054099
    def read1(line):
        fields = line.split()
        position = int(fields[1])
        genotype = sorted(fields[2])

        try:
            # Numeric chromosome names are normalised ('01' -> '1').
            chrom = str(int(fields[0]))
        except ValueError:
            # NOTE(review): non-numeric names go through the MAQ table, so
            # 'chrX' becomes 'X' but a bare 'X' becomes '?' -- confirm which
            # naming the Affy files actually use.
            chrom = convertMAQChr(fields[0])
        return GenotypeCall( chrom, position, genotype, False, 100 )

    affyData = []
    # 'with' closes the handle even when a line fails to parse.
    with open(filename) as affyFile:
        affyFile.readline()                                 # skip header
        for index, line in enumerate(affyFile):
            if not line.strip():
                continue                                    # ignore blank lines
            affyData.append(read1(line))
            if index > OPTIONS.debug_lines:
                break
            if index % 10000 == 0:
                print(index)
    # Give a chance to use list before creating dictionary
    return affyData

    #print "1111111"
    #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )

def equalSNPs( snp1, snp2 ):
    """Return True when both calls carry the same genotype string."""
    first, second = snp1.genotype, snp2.genotype
    return first == second

# def concordance( truthSet, testSet, includeVector = None ):
#     # calculates a bunch of useful stats about the two 
#     # data genotype call sets above
#     
#     states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
#     counts = dict(states)
#     
#     def incShared(state, equalP ):
#         counts[state] += 1
#         if equalP:
#             counts[state + '-equal'] += 1
#     
#     nTestSites = 0
#     for i, testSNP in izip( count(), testSet ):
#         if includeVector <> None and not includeVector[i]:
#             # we are skiping this site
#             continue
# 
#         nTestSites += 1
# 
#         if testSNP.isSNP():
#             counts['test-snp'] += 1
# 
#         #print testSNP.site()
#         if testSNP.site() in truthSet:
#             truth = truthSet[testSNP.site()]
#             eql = equalSNPs( testSNP, truth )
#             
#             incShared( 'shared', eql )
#             if testSNP.isSNP():
#                 if truth.isHOM(): incShared( 'hom-snp', eql )
#                 else: incShared( 'het-snp', eql )
#             else:
#                 if truth.isHOM(): incShared( 'hom-ref', eql )
#                 else: incShared( 'het-ref', eql )
#             
#         if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
#             #print nTestSites, nSharedSites, nEqualSites
#             print nTestSites, counts 
# 
#     counts['tested'] = nTestedSites
#             
#     return counts

class ConcordanceData:
    """Running concordance tally between a truth set and a test set.

    Attributes:
        name:         label used when reporting ('all', 'shared', ...).
        nFile1Sites:  number of sites in file 1.
        nFile2Sites:  number of sites in file 2.
        nSharedSites: number of SNP pairs that map to the same position.
        nEqualSites:  number of those pairs with the same genotype.
    """

    def __init__(self, name, file1count, file2count):
        self.name = name
        self.nFile1Sites, self.nFile2Sites = file1count, file2count
        self.nSharedSites = 0
        self.nEqualSites = 0

    def inc( self, truthSNP, testSNP ):
        """Record one shared site, counting it as equal if genotypes match."""
        self.nSharedSites += 1
        if not equalSNPs( testSNP, truthSNP ):
            return
        self.nEqualSites += 1

    def rate(self):
        """Percentage of shared sites with equal genotypes (0.0 when none)."""
        denominator = max(self.nSharedSites, 1)   # avoid dividing by zero
        return (100.0 * self.nEqualSites) / denominator

    def __str__(self):
        summary = ( self.nSharedSites, self.nEqualSites, self.rate() )
        return '%d %d %.2f' % summary

def concordance( truthSet, testSet, sharedSites = None ):
    """Tally genotype agreement between a truth map and a list of test calls.

    truthSet:    dict mapping site -> truth call.
    testSet:     iterable of test calls (each exposing site()).
    sharedSites: optional collection of sites; when given, every test call
                 found in truthSet is additionally counted as 'shared' (its
                 site is in sharedSites) or 'unique' (it is not).

    The 2 calls in main work like this:
        affy, snp1, snp1_snp2_shared
        affy, snp2, snp1_snp2_shared

    Returns (nTestSites, allData, sharedData, uniqueData) where the three
    ConcordanceData objects cover, respectively: all test sites that are in
    the truth set, those also in sharedSites, and those not in sharedSites.
    """
    nTruth, nTest = len(truthSet), len(testSet)
    allData = ConcordanceData('all', nTruth, nTest)
    sharedData = ConcordanceData('shared', nTruth, nTest)
    uniqueData = ConcordanceData('unique', nTruth, nTest)
    splitBySharing = sharedSites is not None

    nTestSites = 0
    for testSNP in testSet:
        nTestSites += 1
        site = testSNP.site()
        if site in truthSet:
            truthSNP = truthSet[site]
            allData.inc( truthSNP, testSNP )
            if splitBySharing:
                if site in sharedSites:
                    bucket = sharedData
                else:
                    bucket = uniqueData
                bucket.inc( truthSNP, testSNP )

        # Periodic progress report in verbose mode.
        if OPTIONS.verbose and nTestSites % 100000 == 0:
            print("%s %s %s %s" % (nTestSites, allData, sharedData, uniqueData))

    return nTestSites, allData, sharedData, uniqueData

# def concordance( truthSet, testSet, includeVector = None ):
#     # calculates a bunch of useful stats about the two 
#     # data genotype call sets above
#     
#     states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
#     counts = dict(states)
#     
#     nTestSites = 0
#     nSharedSites = 0
#     nEqualSites = 0
#     for i, testSNP in izip( count(), testSet ):
#         nTestSites += 1
#         #print testSNP.site()
#         if testSNP.site() in truthSet:
#             nSharedSites += 1
#             if equalSNPs( testSNP, truthSet[testSNP.site()] ):
#                 nEqualSites += 1
#             #else:
#             #    print '~', testSNP, truthSet[testSNP.site()] 
#         if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
#             #print nTestSites, nSharedSites, nEqualSites
#             print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
#             
#     return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]


def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
    """Print a concordance report for one truth/test file pair.

    filename1:      name of the first (truth) genotype file.
    filename2:      name of the second (test) genotype file.
    results:        tuple (nTestSites, allData, sharedData, uniqueData) as
                    returned by concordance().
    hasSharedSites: when true, the 'shared' and 'unique' breakdowns are
                    printed after the 'all' report.
    """
    nTestSites, allData, sharedData, uniqueData = results

    def percentOf(part, whole):
        # An empty file would otherwise raise ZeroDivisionError.
        if not whole:
            return 0.0
        return part / (0.01 * whole)

    def print1(data):
        print('------------------------------------------------------------')
        print('Concordance results %s sites' % (data.name,))
        print('  -> Genotype file1 %s' % (filename1,))
        print('  -> Genotype file2 %s' % (filename2,))
        print('  -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites))
        print('  -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites))
        print('  ->   Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, percentOf(data.nSharedSites, data.nFile1Sites)))
        # This line describes file2, so it is labelled with filename2.
        print('  ->   Percent sites in file2 shared (%s %s): %.2f' % (filename2, data.name, percentOf(data.nSharedSites, data.nFile2Sites)))
        print('  -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites))
        print('  -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()))

    print1(allData)
    if hasSharedSites:
        print1( sharedData ) # shared between SNP1 and SNP2
        print1( uniqueData ) # unique to SNP1 or to SNP2 only

def dump_shared_snps( affys, snp_list1, snp_list2 ):
    """Write one row per site called in both SNP lists and present in Affy.

    affys:     dict mapping site -> Affy truth call.
    snp_list1: list of calls from the first SNP file.
    snp_list2: list of calls from the second SNP file.

    Prints set-size summaries to stdout and writes the table to "snp.tab" in
    the current directory.  Each row holds both LODs and their ratio, the
    three genotypes, mismatch flags (1 = differs) for snp1-vs-Affy,
    snp2-vs-Affy and snp1-vs-snp2, a combined "meta" call with its mismatch
    flag, and the Affy call's ref/het/hom class and reference base.

    NOTE(review): the last column calls affy.refbase(), which needs
    GenotypeCall.ref to be populated with a reference genome.
    """
    print("%d %d %d" % (len(affys), len(snp_list1), len(snp_list2)))
    snps1 = snpMAP(snp_list1)
    snps2 = snpMAP(snp_list2)
    snp1_sites = set(snps1)
    print("SNP1s: %d" % len(snp1_sites))
    snp2_sites = set(snps2)
    print("SNP2s: %d" % len(snp2_sites))
    affy_sites = set(affys)
    print("Affys: %d" % len(affy_sites))
    snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
    snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
    print("SNP 1 or 2 and Affy:  %d" % len(snp1or2_affy_sites))
    print("SNP 1 and 2 and Affy: %d" % len(snp1and2_affy_sites))

    # 'with' makes sure the table is flushed and closed, even on error.
    with open("snp.tab", "w") as fsnp:
        fsnp.write("site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase\n")
        for site in snp1and2_affy_sites:
            snp1 = snps1[site]
            snp2 = snps2[site]
            affy = affys[site]

            try:
                snp1div2 = snp1.lod / snp2.lod
            except ZeroDivisionError:
                snp1div2 = 1000

            # Calculate meta_lod from the two lods: agreeing calls add their
            # LODs, otherwise the call with the higher LOD wins (ties go to
            # snp2).
            if snp1.genotype == snp2.genotype:
                meta_lod = snp1.lod + snp2.lod
                meta_genotype = snp1.genotype
            elif snp1.lod > snp2.lod:
                meta_lod = snp1.lod
                meta_genotype = snp1.genotype
            else:
                meta_lod = snp2.lod
                meta_genotype = snp2.genotype
            meta_inc = meta_genotype != affy.genotype

            columns = [
                "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
                "%5.2f" % snp1div2,
                snp1.genotype,
                snp2.genotype,
                affy.genotype,
                "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2, affy), not equalSNPs(snp1, snp2)),
                "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
                str(affy.ref_het_hom()),
                str(affy.refbase()),
            ]
            fsnp.write(" ".join(columns) + "\n")

def intersection_union_snps( affy, snps1, snps2 ):
    """Scatter-plot the LOD pairs of sites called in both SNP lists and Affy.

    affy:  dict mapping site -> Affy truth call.
    snps1: list of calls from the first SNP file.
    snps2: list of calls from the second SNP file.

    Each shared site is placed in one of four groups depending on whether
    the snp1 and snp2 genotypes match the Affy genotype, and each group is
    drawn in its own colour through rpy2 (imported inside the function).
    The function waits for the user to press enter and then returns None.

    NOTE(review): requires rpy2 and opens an X11 device; the set statistics
    after the bare ``return`` are unreachable.
    """
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    # Sites called in both SNP lists but absent from Affy.  The resulting
    # LOD pairs are only referenced by the first (overwritten) `plots`
    # assignment below, so they are currently never drawn.
    shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
    nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
    # Sites present in all three data sets.
    shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())

    #shared = []
    #for site in shared_sites:
    #    shared.append((map1[site], map2[site]))
    #print "Shared:",len( shared )
    #shared_x = [s[0].lod for s in shared]
    #shared_y = [s[1].lod for s in shared]

    both_corr = []
    snp1_corr = []
    snp2_corr = []
    neither_corr = []
    # given two bools telling whether snp1 and snp2 are correct, 
    # return the correct object
    #
    #             snp2 incorrect, snp2 correct
    truth_list = [[neither_corr, snp2_corr], # snp1 incorrect
                  [snp1_corr, both_corr]]    # snp1 correct

    for site in shared_affy_sites:
        snp1_true = equalSNPs(map1[site], affy[site])
        snp2_true = equalSNPs(map2[site], affy[site])
        # The booleans index the 2x2 table directly (False -> 0, True -> 1).
        truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )

    print "Beginning plot..."
    import rpy2.robjects as robj
    robj.r('X11(width=15, height=15)')
    XY_MAX = 25
    # NOTE(review): the first assignment is dead -- it is immediately
    # replaced by the second, which omits the non-Affy (gray45) series.
    plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    # Initial plot call with cex = 0.0 -- presumably only to set up the axes,
    # labels and limits before the points are added; TODO confirm.
    robj.r.plot([0, XY_MAX], [0, XY_MAX], \
                xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
                ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
                main = "Shared SNP site LODs", \
                xlim = robj.FloatVector([0,XY_MAX]), \
                ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
                pch = 19, \
                cex = 0.0)
    # Add each group as (snp1 LOD, snp2 LOD) points in its own colour.
    for xy, color in plots:
        print "X"
        robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
        print "Color:",color
        print "Len:",len(xy)
        #print "\n".join(["%25s %25s" % pt for pt in xy])

    #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
    #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
    # Block until the user presses enter.
    raw_input("Press enter to continue")
    
    return

    # NOTE(review): everything below is unreachable because of the bare
    # return above.
    ss1 = set(snps1)
    ss2 = set(snps2)
    print "Shared:",len( ss1.intersection(ss2) )
    print "Snp1 only:",len( ss1.difference(ss2) )
    print "Snp2 only:",len( ss2.difference(ss1) )
    print "Snp1 total:",len( ss1 )
    print "Snp2 total:",len( ss2 )

def count_het_sites(snp_list):
    """Print how many calls in ``snp_list`` are heterozygous.

    Output format: "<hets> hets, <total> total, <percent>" with the
    percentage shown to one decimal place.  An empty list reports 0.0
    instead of raising ZeroDivisionError.
    """
    hets = sum(1 for snp in snp_list if snp.isHET())
    total = len(snp_list)
    if total:
        percent = float(hets) / total * 100
    else:
        percent = 0.0
    print("%d hets, %d total, %.1f" % (hets, total, percent))

def main(argv):
    """Command-line entry point: compare SNP call files against Affy truth.

    argv: full argument vector (program name first), e.g. sys.argv.  When it
          is empty or None, optparse falls back to sys.argv[1:].

    Reads the Affy truth file and one or two SNP call files, then prints
    concordance reports.  With two SNP files it also writes the shared-site
    table (see dump_shared_snps) and splits each report into sites shared by
    both call sets and sites unique to one.
    """
    global OPTIONS

    usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--truth", dest="affy",
                        type="string", default=None,
                        help="Affy truth file")
    parser.add_option("-1", "--snp1", dest="snp1",
                        type="string", default=None,
                        help="First SNP call file")
    parser.add_option("-2", "--snp2", dest="snp2",
                        type="string", default=None,
                        help="Second SNP call file (optional)")
    parser.add_option("", "--f1", dest="format1",
                        type="string", default=None,
                        help="File type of snpfile1")
    parser.add_option("", "--f2", dest="format2",
                        type="string", default=None,
                        help="File type of snpfile2")
    parser.add_option("-l", "--lod", dest="lod",
                        type="float", default=5.0,
                        help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
    parser.add_option("-v", "--verbose", dest="verbose",
                        action='store_true', default=False,
                        help="Verbose output")
    parser.add_option("-d", "--debug_lines", dest="debug_lines",
                        type='float', default=sys.maxint,
                        help="Number of input data lines to process for debugging")

    # Honour the argv we were given; None makes optparse use sys.argv[1:].
    cmdline = None
    if argv:
        cmdline = argv[1:]
    (OPTIONS, args) = parser.parse_args(cmdline)
    if len(args) != 0:
        parser.error("incorrect number of arguments")

    if OPTIONS.affy is None:
        parser.error("No affy data specified")

    # Without this check a missing --snp1 only surfaced later as a failure
    # to open the file.
    if OPTIONS.snp1 is None:
        parser.error("No snp1 file specified")

    if OPTIONS.format1 is None:
        parser.error("First format cannot be none")

    if OPTIONS.snp2 is not None and OPTIONS.format2 is None:
        parser.error("snp2 file given but format was not specified")

    print("Reading Affy truth data...")
    readAffyFile2 = time_func(readAffyFile)
    affy_list = readAffyFile2( filename=OPTIONS.affy )
    # Index the truth calls by site; later duplicates replace earlier ones.
    affy = snpMAP(affy_list)
    print('Read affy truth data:')
    print('  -> number of genotyped loci %d' % len(affy))

    readSNPfile2 = time_func(readSNPfile)
    print("Reading SNPs 1 file...")
    snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )

    if OPTIONS.snp2 is not None:
        print("Reading SNPs 2 file...")
        snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )

        dump_shared_snps( affy, snps1, snps2 )

        sharedSites = overlappingSites( snps1, snps2 )
        results1 = concordance( affy, snps1, sharedSites )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
        results2 = concordance( affy, snps2, sharedSites )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )
    else:
        results = concordance( affy, snps1 )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, False )

# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":       
    main(sys.argv)
Basic reorganization of tree git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@8 348d0f76-0448-11de-a6fe-93d51630548a 2009-02-28 23:28:56 +08:00			`#!/usr/bin/env python`

			`import sys, string`
			`import os`
			`import re`
			`from itertools import *`
			`from optparse import OptionParser`
			`from memo import DiskMemoize, time_func`

			`class ref_genome:`
			`"""Reads reference genome in FASTA format into a dict"""`

			`def __init__(self, ref_genome_file):`
			`ref_genome.chr_offset = [[] for i in range(45)]`
			`chr_id = 0`
			`seq = ""`
			`for line in open(ref_genome_file):`
			`if line.startswith(">"):`
			`print line[1:],`
			`if line.startswith(">chrM"): # skip first > line`
			`continue`
			`ref_genome.chr_offset[chr_id] = seq`
			`chr_id += 1`
			`seq = " " # make it 1 indexed instead of 0 indexed`
			`#if chr_id > 2:`
			`# break`
			`else:`
			`seq += line.rstrip().upper()`
			`ref_genome.chr_offset[chr_id] = seq`

			`def __getitem__(self, key):`
			`return ref_genome.chr_offset[key]`

			`AffyChr2Index = dict()`
			`for i in range(1,23):`
			`AffyChr2Index[str(i)] = i`
			`AffyChr2Index['MT'] = 0`
			`AffyChr2Index['X'] = 23`
			`AffyChr2Index['Y'] = 24`

			`class GenotypeCall:`
			`#ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")`
			`def __init__( self, chr, pos, genotype, snpP, lod ):`
			`self.chr = chr`
			`self.pos = int(pos)`
			`self._site = chr + ':' + str(self.pos)`
			`self._isSNP = snpP`
			`self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position`
			`self.lod = lod`

			`def refbase(self):`
			`return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]`

			`def __hash__(self):`
			`return hash(self._site)`

			`def __eq__(self, other):`
			`return self._site == other._site`

			`def site(self): return self._site`
			`def isSNP(self): return self._isSNP`

			`def ref_het_hom(self):`
			`if self.genotype[0] <> self.genotype[2]:`
			`return 1 # het(erozygous non-ref)`
			`else:`
			`# homozygous something`
			`if self.genotype[0] == self.refbase:`
			`return 0 # ref`
			`else:`
			`return 2 # hom(ozygous non-ref)`

			`def isHET(self): return self.genotype[0] <> self.genotype[2]`
			`def isHOM(self): return self.genotype[0] == self.genotype[2]`

			`def __str__(self):`
			`return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)`

			`MAQGenotypeEncoding = {`
			`'A' : ['A', 'A'],`
			`'C' : ['C', 'C'],`
			`'T' : ['T', 'T'],`
			`'G' : ['G', 'G'],`
			`"M" : ['A', 'C'],`
			`'K' : ['G', 'T'],`
			`'Y' : ['C', 'T'],`
			`'R' : ['A', 'G'],`
			`'W' : ['A', 'T'],`
			`'S' : ['C', 'G'],`
			`'D' : ['A', 'G', 'T'],`
			`'B' : ['C', 'G', 'T'],`
			`'H' : ['A', 'C', 'T'],`
			`'V' : ['A', 'C', 'G'],`
			`'N' : ['A', 'C', 'G', 'T'] }`

			`MAQ2STDChr = dict()`
			`for i in range(1,23):`
			`MAQ2STDChr['chr'+str(i)] = str(i)`
			`MAQ2STDChr['chrM'] = 'MT'`
			`MAQ2STDChr['chrX'] = 'X'`
			`MAQ2STDChr['chrY'] = 'Y'`

			`def convertMAQChr(maqChr):`
			`#print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr]`
			`if maqChr in MAQ2STDChr:`
			`return MAQ2STDChr[maqChr]`
			`else:`
			`return '?'`

			`def convertMAQGenotype( oneBaseCode ):`
			`return MAQGenotypeEncoding[oneBaseCode]`

			`def internalReadSNPFile( parse1, filename ):`
			`result = []`
			`snps_extracted = 0`
			`for snp in imap( parse1, open(filename) ):`
			`if snp:`
			`result.append(snp)`
			`snps_extracted += 1`
			`if snps_extracted > OPTIONS.debug_lines:`
			`break`

			`print len(result),"genotypes extracted"`
			`return result`

			`def snpMAP( snps ):`
			`#d = dict( map( lambda x: [x.site(), x], snps ) )`
			`d = dict()`
			`for snp in snps:`
			`d[snp.site()] = snp#d`

			`#print 'snps', snps, d`
			`return d`

			`def overlappingSites( snps1, snps2 ):`
			`map1 = snpMAP(snps1)`
			`map2 = snpMAP(snps2)`
			`shared = set(map1.keys()) & set(map2.keys())`
			`print 'Number of snp1 records', len(map1)`
			`print 'Number of snp2 records', len(map2)`
			`print 'Number of shared sites', len(shared)`
			`print "\n".join(map(str,snps1))`
			`return shared`

			`def readMAQSNPs(filename):`
			`# Each line consists of:`
			`# chromosome`
			`# position`
			`# reference base`
			`# consensus base`
			`# Phred-like consensus quality`
			`# read depth`
			`# the average number of hits of reads covering this position`
			`# the highest mapping quality of the reads covering the position`
			`# the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)`
			`# the second best call`
			`# log likelihood ratio of the second best and the third best call`
			`# and the third best call.`
			`#`
			`# Also, note that:`
			`#`
			`# What do those "S", "M" and so on mean in the cns2snp output?`
			`# They are IUB codes for heterozygotes. Briefly:`
			`#`
			`# M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T`
			`def read1(line):`
			`formats = [str, int, str, str, int, int]`
			`vals = map( lambda f, x: f(x), formats, line.split()[0:6] )`
			`alignQual = vals[4]`
			`if alignQual >= (10*OPTIONS.lod):`
			`return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )`
			`else:`
			`#print 'Filtering', alignQual, vals`
			`return False`

			`return internalReadSNPFile( read1, filename )`

			`OPTIONS = None`

			`def MerlinChr( index ):`
			`if index == 0:`
			`return 'MT'`
			`elif index == 23:`
			`return 'X'`
			`elif index == 24:`
			`return 'Y'`
			`else:`
			`return str(index)`

			`def readMerlinSNPs(filename):`
			`# 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40`
			`# 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20`
			`# chr:pos ref genotype bestVsRef bestVsNextBest class ...`
			`def read1(line):`
			`formats = [lambda x: x.split(':'), str, sorted, float, float, str]`
			`vals = map( lambda f, x: f(x), formats, line.split()[0:6] )`
			`bestVsRef, bestVsNext = vals[3:5]`
			`isSNP = vals[5].find('-SNP') <> -1`
			`if bestVsRef >= OPTIONS.lod and isSNP:`
			`return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )`
			`else:`
			`return False`

			`return internalReadSNPFile( read1, filename )`

			`def readSNPfile( filename, format ):`
			`formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs }`
			`if format.lower() in formats:`
			`return list(formats[format.lower()](filename))`
			`else:`
			`raise Exception('Unknown SNP file format ' + format)`

			`def readAffyFile(filename):`
			`# chrom position genotype probe_set_id dbsnp_id`
			`# 1 84647761 TC SNP_A-1780419 rs6576700`
			`# 5 156323558 GG SNP_A-1780418 rs17054099`
			`def read1(line):`
			`formats = [str, int, sorted, str, str]`
			`vals = map( lambda f, x: f(x), formats, line.split() )`

			`try:`
			`chr = str(int(vals[0]))`
			`except:`
			`chr = convertMAQChr(vals[0])`
			`#print 'CHR', chr, vals[0]`
			`return GenotypeCall( chr, vals[1], vals[2], False, 100 )`

			`file = open(filename)`
			`file.readline() # skip header`
			`#affyData = map( read1, file )`
			`affyData = []`
			`for index, line in enumerate(file):`
			`affyData.append(read1(line))`
			`if index > OPTIONS.debug_lines:`
			`break`
			`if index % 10000 == 0:`
			`print index`
			`# Give a chance to use list before creating dictionary`
			`return affyData`

			`#print "1111111"`
			`#return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )`

			`def equalSNPs( snp1, snp2 ):`
			`return snp1.genotype == snp2.genotype`

			`# def concordance( truthSet, testSet, includeVector = None ):`
			`# # calculates a bunch of useful stats about the two`
			`# # data genotype call sets above`
			`#`
			`# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]`
			`# counts = dict(states)`
			`#`
			`# def incShared(state, equalP ):`
			`# counts[state] += 1`
			`# if equalP:`
			`# counts[state + '-equal'] += 1`
			`#`
			`# nTestSites = 0`
			`# for i, testSNP in izip( count(), testSet ):`
			`# if includeVector <> None and not includeVector[i]:`
			`# # we are skiping this site`
			`# continue`
			`#`
			`# nTestSites += 1`
			`#`
			`# if testSNP.isSNP():`
			`# counts['test-snp'] += 1`
			`#`
			`# #print testSNP.site()`
			`# if testSNP.site() in truthSet:`
			`# truth = truthSet[testSNP.site()]`
			`# eql = equalSNPs( testSNP, truth )`
			`#`
			`# incShared( 'shared', eql )`
			`# if testSNP.isSNP():`
			`# if truth.isHOM(): incShared( 'hom-snp', eql )`
			`# else: incShared( 'het-snp', eql )`
			`# else:`
			`# if truth.isHOM(): incShared( 'hom-ref', eql )`
			`# else: incShared( 'het-ref', eql )`
			`#`
			`# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:`
			`# #print nTestSites, nSharedSites, nEqualSites`
			`# print nTestSites, counts`
			`#`
			`# counts['tested'] = nTestedSites`
			`#`
			`# return counts`

			class ConcordanceData:
			    """Running tally of genotype agreement between two call sets.

			    Attributes:
			      name         -- label attached to this tally
			      nFile1Sites  -- number of sites in file 1
			      nFile2Sites  -- number of sites in file 2
			      nSharedSites -- SNP pairs that map to the same genome position
			      nEqualSites  -- shared SNP pairs that have the same genotype
			    """

			    def __init__(self, name, file1count, file2count):
			        self.name = name
			        self.nFile1Sites, self.nFile2Sites = file1count, file2count
			        self.nSharedSites = self.nEqualSites = 0

			    def inc( self, truthSNP, testSNP ):
			        """Count one shared site; also count it as equal if genotypes match."""
			        self.nSharedSites += 1
			        genotypes_match = equalSNPs( testSNP, truthSNP )
			        if not genotypes_match:
			            return
			        self.nEqualSites += 1

			    def rate(self):
			        """Percentage of shared sites with equal genotypes (0.0 when none are shared)."""
			        # Clamp the denominator so an empty tally does not divide by zero.
			        denominator = max(self.nSharedSites, 1)
			        return (100.0 * self.nEqualSites) / denominator

			    def __str__(self):
			        summary = ( self.nSharedSites, self.nEqualSites, self.rate() )
			        return '%d %d %.2f' % summary

			`def concordance( truthSet, testSet, sharedSites = None ):`
			`# calculates a bunch of useful stats about the two`
			`# data genotype call sets above`
			`#`
			`# The 2 calls in main work like this:`
			`# affy, snp1, snp1_snp2_shared`
			`# affy, snp2, snp1_snp2_shared`

			`nTestSites = 0`

			`# Now for each of the calls to concordance, we generate 3 sets:`
			`# - allData: all SNP1 sites that are also in Affy`
			`allData = ConcordanceData('all', len(truthSet), len(testSet))`
			`# - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy`
			`sharedData = ConcordanceData('shared', len(truthSet), len(testSet))`
			`# - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy`
			`uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))`
			`for i, testSNP in izip( count(), testSet ):`
			`nTestSites += 1`
			`if testSNP.site() in truthSet:`
			`truthSNP = truthSet[testSNP.site()]`

			`allData.inc( truthSNP, testSNP )`
			`if sharedSites <> None:`
			`if testSNP.site() in sharedSites:`
			`sharedData.inc( truthSNP, testSNP )`
			`else:`
			`uniqueData.inc( truthSNP, testSNP )`

			`if OPTIONS.verbose and nTestSites % 100000 == 0:`
			`#print nTestSites, nSharedSites, nEqualSites`
			`print nTestSites, allData, sharedData, uniqueData`

			`return nTestSites, allData, sharedData, uniqueData`

			`# def concordance( truthSet, testSet, includeVector = None ):`
			`# # calculates a bunch of useful stats about the two`
			`# # data genotype call sets above`
			`#`
			`# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]`
			`# counts = dict(states)`
			`#`
			`# nTestSites = 0`
			`# nSharedSites = 0`
			`# nEqualSites = 0`
			`# for i, testSNP in izip( count(), testSet ):`
			`# nTestSites += 1`
			`# #print testSNP.site()`
			`# if testSNP.site() in truthSet:`
			`# nSharedSites += 1`
			`# if equalSNPs( testSNP, truthSet[testSNP.site()] ):`
			`# nEqualSites += 1`
			`# #else:`
			`# # print '~', testSNP, truthSet[testSNP.site()]`
			`# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:`
			`# #print nTestSites, nSharedSites, nEqualSites`
			`# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites`
			`#`
			`# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]`


			def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
			    """Print a human-readable concordance report to stdout.

			    filename1, filename2 -- names of the two genotype files, used as labels
			    results              -- the (nTestSites, allData, sharedData, uniqueData)
			                            tuple returned by concordance()
			    hasSharedSites       -- when true, the 'shared' and 'unique' tallies are
			                            reported in addition to the 'all' tally
			    """
			    nTestSites, allData, sharedData, uniqueData = results

			    def percent( part, whole ):
			        # A file with no sites shares nothing: report 0% instead of
			        # dividing by zero.
			        if not whole:
			            return 0.0
			        return part / (0.01 * whole)

			    def print1(data):
			        # Each line is a single pre-formatted string, so the parenthesised
			        # print prints the same text the multi-item print statements did.
			        print('------------------------------------------------------------')
			        print('Concordance results %s sites' % (data.name,))
			        print(' -> Genotype file1 %s' % (filename1,))
			        print(' -> Genotype file2 %s' % (filename2,))
			        print(' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites))
			        print(' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites))
			        print(' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, percent(data.nSharedSites, data.nFile1Sites)))
			        # The file2 percentage is labelled with filename2 (it used to print filename1).
			        print(' -> Percent sites in file2 shared (%s %s): %.2f' % (filename2, data.name, percent(data.nSharedSites, data.nFile2Sites)))
			        print(' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites))
			        print(' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()))

			    print1(allData)
			    if hasSharedSites:
			        print1( sharedData ) # shared between SNP1 and SNP2
			        print1( uniqueData ) # unique to SNP1 or to SNP2 only

			`def dump_shared_snps( affys, snp_list1, snp_list2 ):`
			`print len(affys), len(snp_list1), len(snp_list2)`
			`snps1 = snpMAP(snp_list1)`
			`snps2 = snpMAP(snp_list2)`
			`snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)`
			`snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)`
			`affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)`
			`snp1or2_affy_sites = (snp1_sites \| snp2_sites) & affy_sites`
			`snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites`
			`print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)`
			`print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)`

			`fsnp = open ("snp.tab","w")`
			`print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"`
			`for site in snp1and2_affy_sites:`
			`snp1 = snps1[site]`
			`snp2 = snps2[site]`
			`affy = affys[site]`

			`print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),`
			`try:`
			`snp1div2 = snp1.lod / snp2.lod`
			`except ZeroDivisionError:`
			`snp1div2 = 1000`
			`print >>fsnp, "%5.2f" % snp1div2,`
			`print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,`
			`print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),`

			`# Calculte meta_lod from the two lods`
			`if snp1.genotype == snp2.genotype:`
			`meta_lod = snp1.lod + snp2.lod`
			`meta_genotype = snp1.genotype`
			`else:`
			`if snp1.lod > snp2.lod:`
			`meta_lod = snp1.lod`
			`meta_genotype = snp1.genotype`
			`else:`
			`meta_lod = snp2.lod`
			`meta_genotype = snp2.genotype`
			`meta_inc = meta_genotype != affy.genotype`
			`print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),`
			`print >>fsnp, affy.ref_het_hom(),`
			`print >>fsnp, affy.refbase()`

			`def intersection_union_snps( affy, snps1, snps2 ):`
			`map1 = snpMAP(snps1)`
			`map2 = snpMAP(snps2)`
			`shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())`
			`nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]`
			`shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())`

			`#shared = []`
			`#for site in shared_sites:`
			`# shared.append((map1[site], map2[site]))`
			`#print "Shared:",len( shared )`
			`#shared_x = [s[0].lod for s in shared]`
			`#shared_y = [s[1].lod for s in shared]`

			`both_corr = []`
			`snp1_corr = []`
			`snp2_corr = []`
			`neither_corr = []`
			`# given two bools telling whether snp1 and snp2 are correct,`
			`# return the correct object`
			`#`
			`# snp2 incorrect, snp2 correct`
			`truth_list = [[neither_corr, snp2_corr], # snp1 incorrect`
			`[snp1_corr, both_corr]] # snp1 correct`

			`for site in shared_affy_sites:`
			`snp1_true = equalSNPs(map1[site], affy[site])`
			`snp2_true = equalSNPs(map2[site], affy[site])`
			`truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )`

			`print "Beginning plot..."`
			`import rpy2.robjects as robj`
			`robj.r('X11(width=15, height=15)')`
			`XY_MAX = 25`
			`plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))`
			`plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))`
			`robj.r.plot([0, XY_MAX], [0, XY_MAX], \`
			`xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \`
			`ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \`
			`main = "Shared SNP site LODs", \`
			`xlim = robj.FloatVector([0,XY_MAX]), \`
			`ylim = robj.FloatVector([0,min(XY_MAX,25)]), \`
			`pch = 19, \`
			`cex = 0.0)`
			`for xy, color in plots:`
			`print "X"`
			`robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)`
			`print "Color:",color`
			`print "Len:",len(xy)`
			`#print "\n".join(["%25s %25s" % pt for pt in xy])`

			`#print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])`
			`#robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)`
			`raw_input("Press enter to continue")`

			`return`

			`ss1 = set(snps1)`
			`ss2 = set(snps2)`
			`print "Shared:",len( ss1.intersection(ss2) )`
			`print "Snp1 only:",len( ss1.difference(ss2) )`
			`print "Snp2 only:",len( ss2.difference(ss1) )`
			`print "Snp1 total:",len( ss1 )`
			`print "Snp2 total:",len( ss2 )`

			def count_het_sites(snp_list):
			    """Print how many calls in snp_list are heterozygous, and the percentage.

			    snp_list -- sequence of calls, each providing isHET()

			    Output format: "<hets> hets, <total> total, <percent>". An empty list
			    reports 0.0 percent instead of raising ZeroDivisionError.
			    """
			    total = len(snp_list)
			    hets = sum(1 for snp in snp_list if snp.isHET())
			    if total:
			        percent_het = float(hets) / total * 100
			    else:
			        percent_het = 0.0
			    print("%d hets, %d total, %.1f" % (hets, total, percent_het))

			`def main(argv):`
			`global OPTIONS, ROOT`

			`usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"`
			`parser = OptionParser(usage=usage)`
			`parser.add_option("-t", "--truth", dest="affy",`
			`type="string", default=None,`
			`help="Affy truth file")`
			`parser.add_option("-1", "--snp1", dest="snp1",`
			`type="string", default=None,`
			`help="Affy truth file")`
			`parser.add_option("-2", "--snp2", dest="snp2",`
			`type="string", default=None,`
			`help="Affy truth file")`
			`parser.add_option("", "--f1", dest="format1",`
			`type="string", default=None,`
			`help="File type of snpfile1")`
			`parser.add_option("", "--f2", dest="format2",`
			`type="string", default=None,`
			`help="File type of snpfile2")`
			`parser.add_option("-l", "--lod", dest="lod",`
			`type="float", default=5.0,`
			`help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")`
			`parser.add_option("-v", "--verbose", dest="verbose",`
			`action='store_true', default=False,`
			`help="Verbose output")`
			`parser.add_option("-d", "--debug_lines", dest="debug_lines",`
			`type='float', default=sys.maxint,`
			`help="Number of input data lines to process for debugging")`

			`(OPTIONS, args) = parser.parse_args()`
			`if len(args) != 0:`
			`parser.error("incorrect number of arguments")`

			`if OPTIONS.affy == None:`
			`parser.error("No affy data specified")`

			`if OPTIONS.format1 == None:`
			`parser.error("First format cannot be none")`

			`if OPTIONS.snp2 <> None and OPTIONS.format2 == None:`
			`parser.error("snp2 file given but format was not specified")`

			`# Load reference genome`
			`#ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")`
			`#sys.exit()`

			`if 1:`
			`print "Reading Affy truth data..."`
			`#readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )`
			`readAffyFile2 = time_func(readAffyFile)`
			`affy_list = readAffyFile2( filename=OPTIONS.affy )`
			`#count_het_sites(affy_list)`
			`#sys.exit()`
			`affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )`
			`print 'Read affy truth data:'`
			`print ' -> number of genotyped loci', len(affy)`

			`#readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )`
			`readSNPfile2 = time_func(readSNPfile)`
			`print "Reading SNPs 1 file..."`
			`snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )`
			`if OPTIONS.snp2 <> None:`
			`print "Reading SNPs 2 file..."`
			`snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )`

			`dump_shared_snps( affy, snps1, snps2 )`
			`#intersection_union_snps( affy, snps1, snps2 )`
			`#sys.exit()`

			`sharedSites = None`
			`if OPTIONS.snp2 <> None:`
			`sharedSites = overlappingSites( snps1, snps2 )`
			`results1 = concordance( affy, snps1, sharedSites )`
			`printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )`
			`results2 = concordance( affy, snps2, sharedSites )`
			`printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )`

			`else:`
			`results = concordance( affy, snps1 )`
			`printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )`

			`# Run the comparison only when this file is executed as a script.`
			`if __name__ == "__main__":`
			`main(sys.argv)`