#!/usr/bin/env python import sys, string import os import re from itertools import * from optparse import OptionParser from memo import DiskMemoize, time_func class ref_genome: """Reads reference genome in FASTA format into a dict""" def __init__(self, ref_genome_file): ref_genome.chr_offset = [[] for i in range(45)] chr_id = 0 seq = "" for line in open(ref_genome_file): if line.startswith(">"): print line[1:], if line.startswith(">chrM"): # skip first > line continue ref_genome.chr_offset[chr_id] = seq chr_id += 1 seq = " " # make it 1 indexed instead of 0 indexed #if chr_id > 2: # break else: seq += line.rstrip().upper() ref_genome.chr_offset[chr_id] = seq def __getitem__(self, key): return ref_genome.chr_offset[key] AffyChr2Index = dict() for i in range(1,23): AffyChr2Index[str(i)] = i AffyChr2Index['MT'] = 0 AffyChr2Index['X'] = 23 AffyChr2Index['Y'] = 24 class GenotypeCall: #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") def __init__( self, chr, pos, genotype, snpP, lod ): self.chr = chr self.pos = int(pos) self._site = chr + ':' + str(self.pos) self._isSNP = snpP self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position self.lod = lod def refbase(self): return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos] def __hash__(self): return hash(self._site) def __eq__(self, other): return self._site == other._site def site(self): return self._site def isSNP(self): return self._isSNP def ref_het_hom(self): if self.genotype[0] <> self.genotype[2]: return 1 # het(erozygous non-ref) else: # homozygous something if self.genotype[0] == self.refbase: return 0 # ref else: return 2 # hom(ozygous non-ref) def isHET(self): return self.genotype[0] <> self.genotype[2] def isHOM(self): return self.genotype[0] == self.genotype[2] def __str__(self): return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod) MAQGenotypeEncoding = { 'A' : ['A', 'A'], 'C' : ['C', 'C'], 'T' : ['T', 'T'], 'G' : ['G', 'G'], "M" : ['A', 'C'], 'K' : ['G', 'T'], 'Y' : ['C', 'T'], 'R' : ['A', 'G'], 'W' : ['A', 'T'], 'S' : ['C', 'G'], 'D' : ['A', 'G', 'T'], 'B' : ['C', 'G', 'T'], 'H' : ['A', 'C', 'T'], 'V' : ['A', 'C', 'G'], 'N' : ['A', 'C', 'G', 'T'] } MAQ2STDChr = dict() for i in range(1,23): MAQ2STDChr['chr'+str(i)] = str(i) MAQ2STDChr['chrM'] = 'MT' MAQ2STDChr['chrX'] = 'X' MAQ2STDChr['chrY'] = 'Y' def convertMAQChr(maqChr): #print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr] if maqChr in MAQ2STDChr: return MAQ2STDChr[maqChr] else: return '?' def convertMAQGenotype( oneBaseCode ): return MAQGenotypeEncoding[oneBaseCode] def internalReadSNPFile( parse1, filename ): result = [] snps_extracted = 0 for snp in imap( parse1, open(filename) ): if snp: result.append(snp) snps_extracted += 1 if snps_extracted > OPTIONS.debug_lines: break print len(result),"genotypes extracted" return result def snpMAP( snps ): #d = dict( map( lambda x: [x.site(), x], snps ) ) d = dict() for snp in snps: d[snp.site()] = snp#d #print 'snps', snps, d return d def overlappingSites( snps1, snps2 ): map1 = snpMAP(snps1) map2 = snpMAP(snps2) shared = set(map1.keys()) & set(map2.keys()) print 'Number of snp1 records', len(map1) print 'Number of snp2 records', len(map2) print 'Number of shared sites', len(shared) print "\n".join(map(str,snps1)) return shared def readMAQSNPs(filename): # Each line consists of: # chromosome # position # reference base # consensus base # Phred-like consensus quality # read depth # the average number of hits of reads covering this position # the highest mapping quality of the reads covering the position # the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total) # the second best call # log likelihood ratio of the second best and the third best call # and the third best call. # # Also, note that: # # What do those "S", "M" and so on mean in the cns2snp output? # They are IUB codes for heterozygotes. Briefly: # # M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T def read1(line): formats = [str, int, str, str, int, int] vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) alignQual = vals[4] if alignQual >= (10*OPTIONS.lod): return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 ) else: #print 'Filtering', alignQual, vals return False return internalReadSNPFile( read1, filename ) OPTIONS = None def MerlinChr( index ): if index == 0: return 'MT' elif index == 23: return 'X' elif index == 24: return 'Y' else: return str(index) def readMerlinSNPs(filename): # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40 # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20 # chr:pos ref genotype bestVsRef bestVsNextBest class ... def read1(line): formats = [lambda x: x.split(':'), str, sorted, float, float, str] vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) bestVsRef, bestVsNext = vals[3:5] isSNP = vals[5].find('-SNP') <> -1 if bestVsRef >= OPTIONS.lod and isSNP: return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef ) else: return False return internalReadSNPFile( read1, filename ) def readSNPfile( filename, format ): formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs } if format.lower() in formats: return list(formats[format.lower()](filename)) else: raise Exception('Unknown SNP file format ' + format) def readAffyFile(filename): # chrom position genotype probe_set_id dbsnp_id # 1 84647761 TC SNP_A-1780419 rs6576700 # 5 156323558 GG SNP_A-1780418 rs17054099 def read1(line): formats = [str, int, sorted, str, str] vals = map( lambda f, x: f(x), formats, line.split() ) try: chr = str(int(vals[0])) except: chr = convertMAQChr(vals[0]) #print 'CHR', chr, vals[0] return GenotypeCall( chr, vals[1], vals[2], False, 100 ) file = open(filename) file.readline() # skip header #affyData = map( read1, file ) affyData = [] for index, line in enumerate(file): affyData.append(read1(line)) if index > OPTIONS.debug_lines: break if index % 10000 == 0: print index # Give a chance to use list before creating dictionary return affyData #print "1111111" #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) ) def equalSNPs( snp1, snp2 ): return snp1.genotype == snp2.genotype # def concordance( truthSet, testSet, includeVector = None ): # # calculates a bunch of useful stats about the two # # data genotype call sets above # # states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']] # counts = dict(states) # # def incShared(state, equalP ): # counts[state] += 1 # if equalP: # counts[state + '-equal'] += 1 # # nTestSites = 0 # for i, testSNP in izip( count(), testSet ): # if includeVector <> None and not includeVector[i]: # # we are skiping this site # continue # # nTestSites += 1 # # if testSNP.isSNP(): # counts['test-snp'] += 1 # # #print testSNP.site() # if testSNP.site() in truthSet: # truth = truthSet[testSNP.site()] # eql = equalSNPs( testSNP, truth ) # # incShared( 'shared', eql ) # if testSNP.isSNP(): # if truth.isHOM(): incShared( 'hom-snp', eql ) # else: incShared( 'het-snp', eql ) # else: # if truth.isHOM(): incShared( 'hom-ref', eql ) # else: incShared( 'het-ref', eql ) # # if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0: # #print nTestSites, nSharedSites, nEqualSites # print nTestSites, counts # # counts['tested'] = nTestedSites # # return counts class ConcordanceData: def __init__(self, name, file1count, file2count): self.name = name self.nFile1Sites = file1count # num sites in file 1 self.nFile2Sites = file2count # num sites in file 1 self.nSharedSites = 0 # num SNP pairs that map to same position on the genome self.nEqualSites = 0 # num SNPs pars with the same genotype def inc( self, truthSNP, testSNP ): self.nSharedSites += 1 if equalSNPs( testSNP, truthSNP ): # if the genotypes are equal self.nEqualSites += 1 def rate(self): return (100.0 * self.nEqualSites) / max(self.nSharedSites,1) def __str__(self): return '%d %d %.2f' % ( self.nSharedSites, self.nEqualSites, self.rate() ) def concordance( truthSet, testSet, sharedSites = None ): # calculates a bunch of useful stats about the two # data genotype call sets above # # The 2 calls in main work like this: # affy, snp1, snp1_snp2_shared # affy, snp2, snp1_snp2_shared nTestSites = 0 # Now for each of the calls to concordance, we generate 3 sets: # - allData: all SNP1 sites that are also in Affy allData = ConcordanceData('all', len(truthSet), len(testSet)) # - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy sharedData = ConcordanceData('shared', len(truthSet), len(testSet)) # - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy uniqueData = ConcordanceData('unique', len(truthSet), len(testSet)) for i, testSNP in izip( count(), testSet ): nTestSites += 1 if testSNP.site() in truthSet: truthSNP = truthSet[testSNP.site()] allData.inc( truthSNP, testSNP ) if sharedSites <> None: if testSNP.site() in sharedSites: sharedData.inc( truthSNP, testSNP ) else: uniqueData.inc( truthSNP, testSNP ) if OPTIONS.verbose and nTestSites % 100000 == 0: #print nTestSites, nSharedSites, nEqualSites print nTestSites, allData, sharedData, uniqueData return nTestSites, allData, sharedData, uniqueData # def concordance( truthSet, testSet, includeVector = None ): # # calculates a bunch of useful stats about the two # # data genotype call sets above # # states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']] # counts = dict(states) # # nTestSites = 0 # nSharedSites = 0 # nEqualSites = 0 # for i, testSNP in izip( count(), testSet ): # nTestSites += 1 # #print testSNP.site() # if testSNP.site() in truthSet: # nSharedSites += 1 # if equalSNPs( testSNP, truthSet[testSNP.site()] ): # nEqualSites += 1 # #else: # # print '~', testSNP, truthSet[testSNP.site()] # if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0: # #print nTestSites, nSharedSites, nEqualSites # print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites # # return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites] def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ): nTestSites, allData, sharedData, uniqueData = results def print1(data): print '------------------------------------------------------------' print 'Concordance results', data.name, 'sites' print ' -> Genotype file1', filename1 print ' -> Genotype file2', filename2 print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites) print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites) print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites)) print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites)) print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites) print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()) print1(allData) if hasSharedSites: print1( sharedData ) # shared between SNP1 and SNP2 print1( uniqueData ) # unique to SNP1 or to SNP2 only def dump_shared_snps( affys, snp_list1, snp_list2 ): print len(affys), len(snp_list1), len(snp_list2) snps1 = snpMAP(snp_list1) snps2 = snpMAP(snp_list2) snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites) snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites) affy_sites = set(affys.keys()); print "Affys:",len(affy_sites) snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites) print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites) fsnp = open ("snp.tab","w") print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase" for site in snp1and2_affy_sites: snp1 = snps1[site] snp2 = snps2[site] affy = affys[site] print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod), try: snp1div2 = snp1.lod / snp2.lod except ZeroDivisionError: snp1div2 = 1000 print >>fsnp, "%5.2f" % snp1div2, print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype, print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)), # Calculte meta_lod from the two lods if snp1.genotype == snp2.genotype: meta_lod = snp1.lod + snp2.lod meta_genotype = snp1.genotype else: if snp1.lod > snp2.lod: meta_lod = snp1.lod meta_genotype = snp1.genotype else: meta_lod = snp2.lod meta_genotype = snp2.genotype meta_inc = meta_genotype != affy.genotype print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc), print >>fsnp, affy.ref_het_hom(), print >>fsnp, affy.refbase() def intersection_union_snps( affy, snps1, snps2 ): map1 = snpMAP(snps1) map2 = snpMAP(snps2) shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys()) nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites] shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys()) #shared = [] #for site in shared_sites: # shared.append((map1[site], map2[site])) #print "Shared:",len( shared ) #shared_x = [s[0].lod for s in shared] #shared_y = [s[1].lod for s in shared] both_corr = [] snp1_corr = [] snp2_corr = [] neither_corr = [] # given two bools telling whether snp1 and snp2 are correct, # return the correct object # # snp2 incorrect, snp2 correct truth_list = [[neither_corr, snp2_corr], # snp1 incorrect [snp1_corr, both_corr]] # snp1 correct for site in shared_affy_sites: snp1_true = equalSNPs(map1[site], affy[site]) snp2_true = equalSNPs(map2[site], affy[site]) truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) ) print "Beginning plot..." import rpy2.robjects as robj robj.r('X11(width=15, height=15)') XY_MAX = 25 plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) robj.r.plot([0, XY_MAX], [0, XY_MAX], \ xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \ ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \ main = "Shared SNP site LODs", \ xlim = robj.FloatVector([0,XY_MAX]), \ ylim = robj.FloatVector([0,min(XY_MAX,25)]), \ pch = 19, \ cex = 0.0) for xy, color in plots: print "X" robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3) print "Color:",color print "Len:",len(xy) #print "\n".join(["%25s %25s" % pt for pt in xy]) #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared]) #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3) raw_input("Press enter to continue") return ss1 = set(snps1) ss2 = set(snps2) print "Shared:",len( ss1.intersection(ss2) ) print "Snp1 only:",len( ss1.difference(ss2) ) print "Snp2 only:",len( ss2.difference(ss1) ) print "Snp1 total:",len( ss1 ) print "Snp2 total:",len( ss2 ) def count_het_sites(snp_list): hets = 0 for snp in snp_list: if snp.isHET(): hets += 1 print hets,"hets,",len(snp_list),"total,", print "%.1f" % (float(hets)/len(snp_list)*100) def main(argv): global OPTIONS, ROOT usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2" parser = OptionParser(usage=usage) parser.add_option("-t", "--truth", dest="affy", type="string", default=None, help="Affy truth file") parser.add_option("-1", "--snp1", dest="snp1", type="string", default=None, help="Affy truth file") parser.add_option("-2", "--snp2", dest="snp2", type="string", default=None, help="Affy truth file") parser.add_option("", "--f1", dest="format1", type="string", default=None, help="File type of snpfile1") parser.add_option("", "--f2", dest="format2", type="string", default=None, help="File type of snpfile2") parser.add_option("-l", "--lod", dest="lod", type="float", default=5.0, help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ") parser.add_option("-v", "--verbose", dest="verbose", action='store_true', default=False, help="Verbose output") parser.add_option("-d", "--debug_lines", dest="debug_lines", type='float', default=sys.maxint, help="Number of input data lines to process for debugging") (OPTIONS, args) = parser.parse_args() if len(args) != 0: parser.error("incorrect number of arguments") if OPTIONS.affy == None: parser.error("No affy data specified") if OPTIONS.format1 == None: parser.error("First format cannot be none") if OPTIONS.snp2 <> None and OPTIONS.format2 == None: parser.error("snp2 file given but format was not specified") # Load reference genome #ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") #sys.exit() if 1: print "Reading Affy truth data..." #readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] ) readAffyFile2 = time_func(readAffyFile) affy_list = readAffyFile2( filename=OPTIONS.affy ) #count_het_sites(affy_list) #sys.exit() affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) ) print 'Read affy truth data:' print ' -> number of genotyped loci', len(affy) #readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] ) readSNPfile2 = time_func(readSNPfile) print "Reading SNPs 1 file..." snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 ) if OPTIONS.snp2 <> None: print "Reading SNPs 2 file..." snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 ) dump_shared_snps( affy, snps1, snps2 ) #intersection_union_snps( affy, snps1, snps2 ) #sys.exit() sharedSites = None if OPTIONS.snp2 <> None: sharedSites = overlappingSites( snps1, snps2 ) results1 = concordance( affy, snps1, sharedSites ) printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True ) results2 = concordance( affy, snps2, sharedSites ) printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True ) else: results = concordance( affy, snps1 ) printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites ) if __name__ == "__main__": main(sys.argv)