592 lines
22 KiB
Python
592 lines
22 KiB
Python
|
|
#!/usr/bin/env python
|
||
|
|
|
||
|
|
import sys, string
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
from itertools import *
|
||
|
|
from optparse import OptionParser
|
||
|
|
from memo import DiskMemoize, time_func
|
||
|
|
|
||
|
|
class ref_genome:
|
||
|
|
"""Reads reference genome in FASTA format into a dict"""
|
||
|
|
|
||
|
|
def __init__(self, ref_genome_file):
|
||
|
|
ref_genome.chr_offset = [[] for i in range(45)]
|
||
|
|
chr_id = 0
|
||
|
|
seq = ""
|
||
|
|
for line in open(ref_genome_file):
|
||
|
|
if line.startswith(">"):
|
||
|
|
print line[1:],
|
||
|
|
if line.startswith(">chrM"): # skip first > line
|
||
|
|
continue
|
||
|
|
ref_genome.chr_offset[chr_id] = seq
|
||
|
|
chr_id += 1
|
||
|
|
seq = " " # make it 1 indexed instead of 0 indexed
|
||
|
|
#if chr_id > 2:
|
||
|
|
# break
|
||
|
|
else:
|
||
|
|
seq += line.rstrip().upper()
|
||
|
|
ref_genome.chr_offset[chr_id] = seq
|
||
|
|
|
||
|
|
def __getitem__(self, key):
|
||
|
|
return ref_genome.chr_offset[key]
|
||
|
|
|
||
|
|
AffyChr2Index = dict()
|
||
|
|
for i in range(1,23):
|
||
|
|
AffyChr2Index[str(i)] = i
|
||
|
|
AffyChr2Index['MT'] = 0
|
||
|
|
AffyChr2Index['X'] = 23
|
||
|
|
AffyChr2Index['Y'] = 24
|
||
|
|
|
||
|
|
class GenotypeCall:
|
||
|
|
#ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
|
||
|
|
def __init__( self, chr, pos, genotype, snpP, lod ):
|
||
|
|
self.chr = chr
|
||
|
|
self.pos = int(pos)
|
||
|
|
self._site = chr + ':' + str(self.pos)
|
||
|
|
self._isSNP = snpP
|
||
|
|
self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position
|
||
|
|
self.lod = lod
|
||
|
|
|
||
|
|
def refbase(self):
|
||
|
|
return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]
|
||
|
|
|
||
|
|
def __hash__(self):
|
||
|
|
return hash(self._site)
|
||
|
|
|
||
|
|
def __eq__(self, other):
|
||
|
|
return self._site == other._site
|
||
|
|
|
||
|
|
def site(self): return self._site
|
||
|
|
def isSNP(self): return self._isSNP
|
||
|
|
|
||
|
|
def ref_het_hom(self):
|
||
|
|
if self.genotype[0] <> self.genotype[2]:
|
||
|
|
return 1 # het(erozygous non-ref)
|
||
|
|
else:
|
||
|
|
# homozygous something
|
||
|
|
if self.genotype[0] == self.refbase:
|
||
|
|
return 0 # ref
|
||
|
|
else:
|
||
|
|
return 2 # hom(ozygous non-ref)
|
||
|
|
|
||
|
|
def isHET(self): return self.genotype[0] <> self.genotype[2]
|
||
|
|
def isHOM(self): return self.genotype[0] == self.genotype[2]
|
||
|
|
|
||
|
|
def __str__(self):
|
||
|
|
return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
|
||
|
|
|
||
|
|
MAQGenotypeEncoding = {
|
||
|
|
'A' : ['A', 'A'],
|
||
|
|
'C' : ['C', 'C'],
|
||
|
|
'T' : ['T', 'T'],
|
||
|
|
'G' : ['G', 'G'],
|
||
|
|
"M" : ['A', 'C'],
|
||
|
|
'K' : ['G', 'T'],
|
||
|
|
'Y' : ['C', 'T'],
|
||
|
|
'R' : ['A', 'G'],
|
||
|
|
'W' : ['A', 'T'],
|
||
|
|
'S' : ['C', 'G'],
|
||
|
|
'D' : ['A', 'G', 'T'],
|
||
|
|
'B' : ['C', 'G', 'T'],
|
||
|
|
'H' : ['A', 'C', 'T'],
|
||
|
|
'V' : ['A', 'C', 'G'],
|
||
|
|
'N' : ['A', 'C', 'G', 'T'] }
|
||
|
|
|
||
|
|
MAQ2STDChr = dict()
|
||
|
|
for i in range(1,23):
|
||
|
|
MAQ2STDChr['chr'+str(i)] = str(i)
|
||
|
|
MAQ2STDChr['chrM'] = 'MT'
|
||
|
|
MAQ2STDChr['chrX'] = 'X'
|
||
|
|
MAQ2STDChr['chrY'] = 'Y'
|
||
|
|
|
||
|
|
def convertMAQChr(maqChr):
|
||
|
|
#print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr]
|
||
|
|
if maqChr in MAQ2STDChr:
|
||
|
|
return MAQ2STDChr[maqChr]
|
||
|
|
else:
|
||
|
|
return '?'
|
||
|
|
|
||
|
|
def convertMAQGenotype( oneBaseCode ):
|
||
|
|
return MAQGenotypeEncoding[oneBaseCode]
|
||
|
|
|
||
|
|
def internalReadSNPFile( parse1, filename ):
|
||
|
|
result = []
|
||
|
|
snps_extracted = 0
|
||
|
|
for snp in imap( parse1, open(filename) ):
|
||
|
|
if snp:
|
||
|
|
result.append(snp)
|
||
|
|
snps_extracted += 1
|
||
|
|
if snps_extracted > OPTIONS.debug_lines:
|
||
|
|
break
|
||
|
|
|
||
|
|
print len(result),"genotypes extracted"
|
||
|
|
return result
|
||
|
|
|
||
|
|
def snpMAP( snps ):
|
||
|
|
#d = dict( map( lambda x: [x.site(), x], snps ) )
|
||
|
|
d = dict()
|
||
|
|
for snp in snps:
|
||
|
|
d[snp.site()] = snp#d
|
||
|
|
|
||
|
|
#print 'snps', snps, d
|
||
|
|
return d
|
||
|
|
|
||
|
|
def overlappingSites( snps1, snps2 ):
|
||
|
|
map1 = snpMAP(snps1)
|
||
|
|
map2 = snpMAP(snps2)
|
||
|
|
shared = set(map1.keys()) & set(map2.keys())
|
||
|
|
print 'Number of snp1 records', len(map1)
|
||
|
|
print 'Number of snp2 records', len(map2)
|
||
|
|
print 'Number of shared sites', len(shared)
|
||
|
|
print "\n".join(map(str,snps1))
|
||
|
|
return shared
|
||
|
|
|
||
|
|
def readMAQSNPs(filename):
|
||
|
|
# Each line consists of:
|
||
|
|
# chromosome
|
||
|
|
# position
|
||
|
|
# reference base
|
||
|
|
# consensus base
|
||
|
|
# Phred-like consensus quality
|
||
|
|
# read depth
|
||
|
|
# the average number of hits of reads covering this position
|
||
|
|
# the highest mapping quality of the reads covering the position
|
||
|
|
# the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
|
||
|
|
# the second best call
|
||
|
|
# log likelihood ratio of the second best and the third best call
|
||
|
|
# and the third best call.
|
||
|
|
#
|
||
|
|
# Also, note that:
|
||
|
|
#
|
||
|
|
# What do those "S", "M" and so on mean in the cns2snp output?
|
||
|
|
# They are IUB codes for heterozygotes. Briefly:
|
||
|
|
#
|
||
|
|
# M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
|
||
|
|
def read1(line):
|
||
|
|
formats = [str, int, str, str, int, int]
|
||
|
|
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
|
||
|
|
alignQual = vals[4]
|
||
|
|
if alignQual >= (10*OPTIONS.lod):
|
||
|
|
return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )
|
||
|
|
else:
|
||
|
|
#print 'Filtering', alignQual, vals
|
||
|
|
return False
|
||
|
|
|
||
|
|
return internalReadSNPFile( read1, filename )
|
||
|
|
|
||
|
|
OPTIONS = None
|
||
|
|
|
||
|
|
def MerlinChr( index ):
|
||
|
|
if index == 0:
|
||
|
|
return 'MT'
|
||
|
|
elif index == 23:
|
||
|
|
return 'X'
|
||
|
|
elif index == 24:
|
||
|
|
return 'Y'
|
||
|
|
else:
|
||
|
|
return str(index)
|
||
|
|
|
||
|
|
def readMerlinSNPs(filename):
|
||
|
|
# 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
|
||
|
|
# 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
|
||
|
|
# chr:pos ref genotype bestVsRef bestVsNextBest class ...
|
||
|
|
def read1(line):
|
||
|
|
formats = [lambda x: x.split(':'), str, sorted, float, float, str]
|
||
|
|
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
|
||
|
|
bestVsRef, bestVsNext = vals[3:5]
|
||
|
|
isSNP = vals[5].find('-SNP') <> -1
|
||
|
|
if bestVsRef >= OPTIONS.lod and isSNP:
|
||
|
|
return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )
|
||
|
|
else:
|
||
|
|
return False
|
||
|
|
|
||
|
|
return internalReadSNPFile( read1, filename )
|
||
|
|
|
||
|
|
def readSNPfile( filename, format ):
|
||
|
|
formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs }
|
||
|
|
if format.lower() in formats:
|
||
|
|
return list(formats[format.lower()](filename))
|
||
|
|
else:
|
||
|
|
raise Exception('Unknown SNP file format ' + format)
|
||
|
|
|
||
|
|
def readAffyFile(filename):
|
||
|
|
# chrom position genotype probe_set_id dbsnp_id
|
||
|
|
# 1 84647761 TC SNP_A-1780419 rs6576700
|
||
|
|
# 5 156323558 GG SNP_A-1780418 rs17054099
|
||
|
|
def read1(line):
|
||
|
|
formats = [str, int, sorted, str, str]
|
||
|
|
vals = map( lambda f, x: f(x), formats, line.split() )
|
||
|
|
|
||
|
|
try:
|
||
|
|
chr = str(int(vals[0]))
|
||
|
|
except:
|
||
|
|
chr = convertMAQChr(vals[0])
|
||
|
|
#print 'CHR', chr, vals[0]
|
||
|
|
return GenotypeCall( chr, vals[1], vals[2], False, 100 )
|
||
|
|
|
||
|
|
file = open(filename)
|
||
|
|
file.readline() # skip header
|
||
|
|
#affyData = map( read1, file )
|
||
|
|
affyData = []
|
||
|
|
for index, line in enumerate(file):
|
||
|
|
affyData.append(read1(line))
|
||
|
|
if index > OPTIONS.debug_lines:
|
||
|
|
break
|
||
|
|
if index % 10000 == 0:
|
||
|
|
print index
|
||
|
|
# Give a chance to use list before creating dictionary
|
||
|
|
return affyData
|
||
|
|
|
||
|
|
#print "1111111"
|
||
|
|
#return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )
|
||
|
|
|
||
|
|
def equalSNPs( snp1, snp2 ):
|
||
|
|
return snp1.genotype == snp2.genotype
|
||
|
|
|
||
|
|
# def concordance( truthSet, testSet, includeVector = None ):
|
||
|
|
# # calculates a bunch of useful stats about the two
|
||
|
|
# # data genotype call sets above
|
||
|
|
#
|
||
|
|
# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
|
||
|
|
# counts = dict(states)
|
||
|
|
#
|
||
|
|
# def incShared(state, equalP ):
|
||
|
|
# counts[state] += 1
|
||
|
|
# if equalP:
|
||
|
|
# counts[state + '-equal'] += 1
|
||
|
|
#
|
||
|
|
# nTestSites = 0
|
||
|
|
# for i, testSNP in izip( count(), testSet ):
|
||
|
|
# if includeVector <> None and not includeVector[i]:
|
||
|
|
# # we are skiping this site
|
||
|
|
# continue
|
||
|
|
#
|
||
|
|
# nTestSites += 1
|
||
|
|
#
|
||
|
|
# if testSNP.isSNP():
|
||
|
|
# counts['test-snp'] += 1
|
||
|
|
#
|
||
|
|
# #print testSNP.site()
|
||
|
|
# if testSNP.site() in truthSet:
|
||
|
|
# truth = truthSet[testSNP.site()]
|
||
|
|
# eql = equalSNPs( testSNP, truth )
|
||
|
|
#
|
||
|
|
# incShared( 'shared', eql )
|
||
|
|
# if testSNP.isSNP():
|
||
|
|
# if truth.isHOM(): incShared( 'hom-snp', eql )
|
||
|
|
# else: incShared( 'het-snp', eql )
|
||
|
|
# else:
|
||
|
|
# if truth.isHOM(): incShared( 'hom-ref', eql )
|
||
|
|
# else: incShared( 'het-ref', eql )
|
||
|
|
#
|
||
|
|
# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
|
||
|
|
# #print nTestSites, nSharedSites, nEqualSites
|
||
|
|
# print nTestSites, counts
|
||
|
|
#
|
||
|
|
# counts['tested'] = nTestedSites
|
||
|
|
#
|
||
|
|
# return counts
|
||
|
|
|
||
|
|
class ConcordanceData:
|
||
|
|
def __init__(self, name, file1count, file2count):
|
||
|
|
self.name = name
|
||
|
|
self.nFile1Sites = file1count # num sites in file 1
|
||
|
|
self.nFile2Sites = file2count # num sites in file 1
|
||
|
|
self.nSharedSites = 0 # num SNP pairs that map to same position on the genome
|
||
|
|
self.nEqualSites = 0 # num SNPs pars with the same genotype
|
||
|
|
|
||
|
|
def inc( self, truthSNP, testSNP ):
|
||
|
|
self.nSharedSites += 1
|
||
|
|
if equalSNPs( testSNP, truthSNP ): # if the genotypes are equal
|
||
|
|
self.nEqualSites += 1
|
||
|
|
|
||
|
|
def rate(self):
|
||
|
|
return (100.0 * self.nEqualSites) / max(self.nSharedSites,1)
|
||
|
|
|
||
|
|
def __str__(self):
|
||
|
|
return '%d %d %.2f' % ( self.nSharedSites, self.nEqualSites, self.rate() )
|
||
|
|
|
||
|
|
def concordance( truthSet, testSet, sharedSites = None ):
|
||
|
|
# calculates a bunch of useful stats about the two
|
||
|
|
# data genotype call sets above
|
||
|
|
#
|
||
|
|
# The 2 calls in main work like this:
|
||
|
|
# affy, snp1, snp1_snp2_shared
|
||
|
|
# affy, snp2, snp1_snp2_shared
|
||
|
|
|
||
|
|
nTestSites = 0
|
||
|
|
|
||
|
|
# Now for each of the calls to concordance, we generate 3 sets:
|
||
|
|
# - allData: all SNP1 sites that are also in Affy
|
||
|
|
allData = ConcordanceData('all', len(truthSet), len(testSet))
|
||
|
|
# - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy
|
||
|
|
sharedData = ConcordanceData('shared', len(truthSet), len(testSet))
|
||
|
|
# - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy
|
||
|
|
uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))
|
||
|
|
for i, testSNP in izip( count(), testSet ):
|
||
|
|
nTestSites += 1
|
||
|
|
if testSNP.site() in truthSet:
|
||
|
|
truthSNP = truthSet[testSNP.site()]
|
||
|
|
|
||
|
|
allData.inc( truthSNP, testSNP )
|
||
|
|
if sharedSites <> None:
|
||
|
|
if testSNP.site() in sharedSites:
|
||
|
|
sharedData.inc( truthSNP, testSNP )
|
||
|
|
else:
|
||
|
|
uniqueData.inc( truthSNP, testSNP )
|
||
|
|
|
||
|
|
if OPTIONS.verbose and nTestSites % 100000 == 0:
|
||
|
|
#print nTestSites, nSharedSites, nEqualSites
|
||
|
|
print nTestSites, allData, sharedData, uniqueData
|
||
|
|
|
||
|
|
return nTestSites, allData, sharedData, uniqueData
|
||
|
|
|
||
|
|
# def concordance( truthSet, testSet, includeVector = None ):
|
||
|
|
# # calculates a bunch of useful stats about the two
|
||
|
|
# # data genotype call sets above
|
||
|
|
#
|
||
|
|
# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
|
||
|
|
# counts = dict(states)
|
||
|
|
#
|
||
|
|
# nTestSites = 0
|
||
|
|
# nSharedSites = 0
|
||
|
|
# nEqualSites = 0
|
||
|
|
# for i, testSNP in izip( count(), testSet ):
|
||
|
|
# nTestSites += 1
|
||
|
|
# #print testSNP.site()
|
||
|
|
# if testSNP.site() in truthSet:
|
||
|
|
# nSharedSites += 1
|
||
|
|
# if equalSNPs( testSNP, truthSet[testSNP.site()] ):
|
||
|
|
# nEqualSites += 1
|
||
|
|
# #else:
|
||
|
|
# # print '~', testSNP, truthSet[testSNP.site()]
|
||
|
|
# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
|
||
|
|
# #print nTestSites, nSharedSites, nEqualSites
|
||
|
|
# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
|
||
|
|
#
|
||
|
|
# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]
|
||
|
|
|
||
|
|
|
||
|
|
def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
|
||
|
|
nTestSites, allData, sharedData, uniqueData = results
|
||
|
|
|
||
|
|
def print1(data):
|
||
|
|
print '------------------------------------------------------------'
|
||
|
|
print 'Concordance results', data.name, 'sites'
|
||
|
|
print ' -> Genotype file1', filename1
|
||
|
|
print ' -> Genotype file2', filename2
|
||
|
|
print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites)
|
||
|
|
print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites)
|
||
|
|
print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites))
|
||
|
|
print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites))
|
||
|
|
print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites)
|
||
|
|
print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate())
|
||
|
|
|
||
|
|
print1(allData)
|
||
|
|
if hasSharedSites:
|
||
|
|
print1( sharedData ) # shared between SNP1 and SNP2
|
||
|
|
print1( uniqueData ) # unique to SNP1 or to SNP2 only
|
||
|
|
|
||
|
|
def dump_shared_snps( affys, snp_list1, snp_list2 ):
|
||
|
|
print len(affys), len(snp_list1), len(snp_list2)
|
||
|
|
snps1 = snpMAP(snp_list1)
|
||
|
|
snps2 = snpMAP(snp_list2)
|
||
|
|
snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)
|
||
|
|
snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)
|
||
|
|
affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)
|
||
|
|
snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
|
||
|
|
snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
|
||
|
|
print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)
|
||
|
|
print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)
|
||
|
|
|
||
|
|
fsnp = open ("snp.tab","w")
|
||
|
|
print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"
|
||
|
|
for site in snp1and2_affy_sites:
|
||
|
|
snp1 = snps1[site]
|
||
|
|
snp2 = snps2[site]
|
||
|
|
affy = affys[site]
|
||
|
|
|
||
|
|
print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
|
||
|
|
try:
|
||
|
|
snp1div2 = snp1.lod / snp2.lod
|
||
|
|
except ZeroDivisionError:
|
||
|
|
snp1div2 = 1000
|
||
|
|
print >>fsnp, "%5.2f" % snp1div2,
|
||
|
|
print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,
|
||
|
|
print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),
|
||
|
|
|
||
|
|
# Calculte meta_lod from the two lods
|
||
|
|
if snp1.genotype == snp2.genotype:
|
||
|
|
meta_lod = snp1.lod + snp2.lod
|
||
|
|
meta_genotype = snp1.genotype
|
||
|
|
else:
|
||
|
|
if snp1.lod > snp2.lod:
|
||
|
|
meta_lod = snp1.lod
|
||
|
|
meta_genotype = snp1.genotype
|
||
|
|
else:
|
||
|
|
meta_lod = snp2.lod
|
||
|
|
meta_genotype = snp2.genotype
|
||
|
|
meta_inc = meta_genotype != affy.genotype
|
||
|
|
print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
|
||
|
|
print >>fsnp, affy.ref_het_hom(),
|
||
|
|
print >>fsnp, affy.refbase()
|
||
|
|
|
||
|
|
def intersection_union_snps( affy, snps1, snps2 ):
|
||
|
|
map1 = snpMAP(snps1)
|
||
|
|
map2 = snpMAP(snps2)
|
||
|
|
shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
|
||
|
|
nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
|
||
|
|
shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())
|
||
|
|
|
||
|
|
#shared = []
|
||
|
|
#for site in shared_sites:
|
||
|
|
# shared.append((map1[site], map2[site]))
|
||
|
|
#print "Shared:",len( shared )
|
||
|
|
#shared_x = [s[0].lod for s in shared]
|
||
|
|
#shared_y = [s[1].lod for s in shared]
|
||
|
|
|
||
|
|
both_corr = []
|
||
|
|
snp1_corr = []
|
||
|
|
snp2_corr = []
|
||
|
|
neither_corr = []
|
||
|
|
# given two bools telling whether snp1 and snp2 are correct,
|
||
|
|
# return the correct object
|
||
|
|
#
|
||
|
|
# snp2 incorrect, snp2 correct
|
||
|
|
truth_list = [[neither_corr, snp2_corr], # snp1 incorrect
|
||
|
|
[snp1_corr, both_corr]] # snp1 correct
|
||
|
|
|
||
|
|
for site in shared_affy_sites:
|
||
|
|
snp1_true = equalSNPs(map1[site], affy[site])
|
||
|
|
snp2_true = equalSNPs(map2[site], affy[site])
|
||
|
|
truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )
|
||
|
|
|
||
|
|
print "Beginning plot..."
|
||
|
|
import rpy2.robjects as robj
|
||
|
|
robj.r('X11(width=15, height=15)')
|
||
|
|
XY_MAX = 25
|
||
|
|
plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
|
||
|
|
plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
|
||
|
|
robj.r.plot([0, XY_MAX], [0, XY_MAX], \
|
||
|
|
xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
|
||
|
|
ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
|
||
|
|
main = "Shared SNP site LODs", \
|
||
|
|
xlim = robj.FloatVector([0,XY_MAX]), \
|
||
|
|
ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
|
||
|
|
pch = 19, \
|
||
|
|
cex = 0.0)
|
||
|
|
for xy, color in plots:
|
||
|
|
print "X"
|
||
|
|
robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
|
||
|
|
print "Color:",color
|
||
|
|
print "Len:",len(xy)
|
||
|
|
#print "\n".join(["%25s %25s" % pt for pt in xy])
|
||
|
|
|
||
|
|
#print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
|
||
|
|
#robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
|
||
|
|
raw_input("Press enter to continue")
|
||
|
|
|
||
|
|
return
|
||
|
|
|
||
|
|
ss1 = set(snps1)
|
||
|
|
ss2 = set(snps2)
|
||
|
|
print "Shared:",len( ss1.intersection(ss2) )
|
||
|
|
print "Snp1 only:",len( ss1.difference(ss2) )
|
||
|
|
print "Snp2 only:",len( ss2.difference(ss1) )
|
||
|
|
print "Snp1 total:",len( ss1 )
|
||
|
|
print "Snp2 total:",len( ss2 )
|
||
|
|
|
||
|
|
def count_het_sites(snp_list):
|
||
|
|
hets = 0
|
||
|
|
for snp in snp_list:
|
||
|
|
if snp.isHET():
|
||
|
|
hets += 1
|
||
|
|
print hets,"hets,",len(snp_list),"total,",
|
||
|
|
print "%.1f" % (float(hets)/len(snp_list)*100)
|
||
|
|
|
||
|
|
def main(argv):
|
||
|
|
global OPTIONS, ROOT
|
||
|
|
|
||
|
|
usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
|
||
|
|
parser = OptionParser(usage=usage)
|
||
|
|
parser.add_option("-t", "--truth", dest="affy",
|
||
|
|
type="string", default=None,
|
||
|
|
help="Affy truth file")
|
||
|
|
parser.add_option("-1", "--snp1", dest="snp1",
|
||
|
|
type="string", default=None,
|
||
|
|
help="Affy truth file")
|
||
|
|
parser.add_option("-2", "--snp2", dest="snp2",
|
||
|
|
type="string", default=None,
|
||
|
|
help="Affy truth file")
|
||
|
|
parser.add_option("", "--f1", dest="format1",
|
||
|
|
type="string", default=None,
|
||
|
|
help="File type of snpfile1")
|
||
|
|
parser.add_option("", "--f2", dest="format2",
|
||
|
|
type="string", default=None,
|
||
|
|
help="File type of snpfile2")
|
||
|
|
parser.add_option("-l", "--lod", dest="lod",
|
||
|
|
type="float", default=5.0,
|
||
|
|
help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
|
||
|
|
parser.add_option("-v", "--verbose", dest="verbose",
|
||
|
|
action='store_true', default=False,
|
||
|
|
help="Verbose output")
|
||
|
|
parser.add_option("-d", "--debug_lines", dest="debug_lines",
|
||
|
|
type='float', default=sys.maxint,
|
||
|
|
help="Number of input data lines to process for debugging")
|
||
|
|
|
||
|
|
(OPTIONS, args) = parser.parse_args()
|
||
|
|
if len(args) != 0:
|
||
|
|
parser.error("incorrect number of arguments")
|
||
|
|
|
||
|
|
if OPTIONS.affy == None:
|
||
|
|
parser.error("No affy data specified")
|
||
|
|
|
||
|
|
if OPTIONS.format1 == None:
|
||
|
|
parser.error("First format cannot be none")
|
||
|
|
|
||
|
|
if OPTIONS.snp2 <> None and OPTIONS.format2 == None:
|
||
|
|
parser.error("snp2 file given but format was not specified")
|
||
|
|
|
||
|
|
# Load reference genome
|
||
|
|
#ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
|
||
|
|
#sys.exit()
|
||
|
|
|
||
|
|
if 1:
|
||
|
|
print "Reading Affy truth data..."
|
||
|
|
#readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )
|
||
|
|
readAffyFile2 = time_func(readAffyFile)
|
||
|
|
affy_list = readAffyFile2( filename=OPTIONS.affy )
|
||
|
|
#count_het_sites(affy_list)
|
||
|
|
#sys.exit()
|
||
|
|
affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )
|
||
|
|
print 'Read affy truth data:'
|
||
|
|
print ' -> number of genotyped loci', len(affy)
|
||
|
|
|
||
|
|
#readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )
|
||
|
|
readSNPfile2 = time_func(readSNPfile)
|
||
|
|
print "Reading SNPs 1 file..."
|
||
|
|
snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )
|
||
|
|
if OPTIONS.snp2 <> None:
|
||
|
|
print "Reading SNPs 2 file..."
|
||
|
|
snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )
|
||
|
|
|
||
|
|
dump_shared_snps( affy, snps1, snps2 )
|
||
|
|
#intersection_union_snps( affy, snps1, snps2 )
|
||
|
|
#sys.exit()
|
||
|
|
|
||
|
|
sharedSites = None
|
||
|
|
if OPTIONS.snp2 <> None:
|
||
|
|
sharedSites = overlappingSites( snps1, snps2 )
|
||
|
|
results1 = concordance( affy, snps1, sharedSites )
|
||
|
|
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
|
||
|
|
results2 = concordance( affy, snps2, sharedSites )
|
||
|
|
printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )
|
||
|
|
|
||
|
|
else:
|
||
|
|
results = concordance( affy, snps1 )
|
||
|
|
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
main(sys.argv)
|