gatk-3.8/playground/python/compSNPCalls.py

592 lines
22 KiB
Python

#!/usr/bin/env python
import sys, string
import os
import re
from itertools import *
from optparse import OptionParser
from memo import DiskMemoize, time_func
class ref_genome:
"""Reads reference genome in FASTA format into a dict"""
def __init__(self, ref_genome_file):
ref_genome.chr_offset = [[] for i in range(45)]
chr_id = 0
seq = ""
for line in open(ref_genome_file):
if line.startswith(">"):
print line[1:],
if line.startswith(">chrM"): # skip first > line
continue
ref_genome.chr_offset[chr_id] = seq
chr_id += 1
seq = " " # make it 1 indexed instead of 0 indexed
#if chr_id > 2:
# break
else:
seq += line.rstrip().upper()
ref_genome.chr_offset[chr_id] = seq
def __getitem__(self, key):
return ref_genome.chr_offset[key]
AffyChr2Index = dict()
for i in range(1,23):
AffyChr2Index[str(i)] = i
AffyChr2Index['MT'] = 0
AffyChr2Index['X'] = 23
AffyChr2Index['Y'] = 24
class GenotypeCall:
#ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
def __init__( self, chr, pos, genotype, snpP, lod ):
self.chr = chr
self.pos = int(pos)
self._site = chr + ':' + str(self.pos)
self._isSNP = snpP
self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position
self.lod = lod
def refbase(self):
return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]
def __hash__(self):
return hash(self._site)
def __eq__(self, other):
return self._site == other._site
def site(self): return self._site
def isSNP(self): return self._isSNP
def ref_het_hom(self):
if self.genotype[0] <> self.genotype[2]:
return 1 # het(erozygous non-ref)
else:
# homozygous something
if self.genotype[0] == self.refbase:
return 0 # ref
else:
return 2 # hom(ozygous non-ref)
def isHET(self): return self.genotype[0] <> self.genotype[2]
def isHOM(self): return self.genotype[0] == self.genotype[2]
def __str__(self):
return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
MAQGenotypeEncoding = {
'A' : ['A', 'A'],
'C' : ['C', 'C'],
'T' : ['T', 'T'],
'G' : ['G', 'G'],
"M" : ['A', 'C'],
'K' : ['G', 'T'],
'Y' : ['C', 'T'],
'R' : ['A', 'G'],
'W' : ['A', 'T'],
'S' : ['C', 'G'],
'D' : ['A', 'G', 'T'],
'B' : ['C', 'G', 'T'],
'H' : ['A', 'C', 'T'],
'V' : ['A', 'C', 'G'],
'N' : ['A', 'C', 'G', 'T'] }
MAQ2STDChr = dict()
for i in range(1,23):
MAQ2STDChr['chr'+str(i)] = str(i)
MAQ2STDChr['chrM'] = 'MT'
MAQ2STDChr['chrX'] = 'X'
MAQ2STDChr['chrY'] = 'Y'
def convertMAQChr(maqChr):
#print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr]
if maqChr in MAQ2STDChr:
return MAQ2STDChr[maqChr]
else:
return '?'
def convertMAQGenotype( oneBaseCode ):
return MAQGenotypeEncoding[oneBaseCode]
def internalReadSNPFile( parse1, filename ):
result = []
snps_extracted = 0
for snp in imap( parse1, open(filename) ):
if snp:
result.append(snp)
snps_extracted += 1
if snps_extracted > OPTIONS.debug_lines:
break
print len(result),"genotypes extracted"
return result
def snpMAP( snps ):
#d = dict( map( lambda x: [x.site(), x], snps ) )
d = dict()
for snp in snps:
d[snp.site()] = snp#d
#print 'snps', snps, d
return d
def overlappingSites( snps1, snps2 ):
map1 = snpMAP(snps1)
map2 = snpMAP(snps2)
shared = set(map1.keys()) & set(map2.keys())
print 'Number of snp1 records', len(map1)
print 'Number of snp2 records', len(map2)
print 'Number of shared sites', len(shared)
print "\n".join(map(str,snps1))
return shared
def readMAQSNPs(filename):
# Each line consists of:
# chromosome
# position
# reference base
# consensus base
# Phred-like consensus quality
# read depth
# the average number of hits of reads covering this position
# the highest mapping quality of the reads covering the position
# the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
# the second best call
# log likelihood ratio of the second best and the third best call
# and the third best call.
#
# Also, note that:
#
# What do those "S", "M" and so on mean in the cns2snp output?
# They are IUB codes for heterozygotes. Briefly:
#
# M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
def read1(line):
formats = [str, int, str, str, int, int]
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
alignQual = vals[4]
if alignQual >= (10*OPTIONS.lod):
return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )
else:
#print 'Filtering', alignQual, vals
return False
return internalReadSNPFile( read1, filename )
OPTIONS = None
def MerlinChr( index ):
if index == 0:
return 'MT'
elif index == 23:
return 'X'
elif index == 24:
return 'Y'
else:
return str(index)
def readMerlinSNPs(filename):
# 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
# 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
# chr:pos ref genotype bestVsRef bestVsNextBest class ...
def read1(line):
formats = [lambda x: x.split(':'), str, sorted, float, float, str]
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
bestVsRef, bestVsNext = vals[3:5]
isSNP = vals[5].find('-SNP') <> -1
if bestVsRef >= OPTIONS.lod and isSNP:
return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )
else:
return False
return internalReadSNPFile( read1, filename )
def readSNPfile( filename, format ):
formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs }
if format.lower() in formats:
return list(formats[format.lower()](filename))
else:
raise Exception('Unknown SNP file format ' + format)
def readAffyFile(filename):
# chrom position genotype probe_set_id dbsnp_id
# 1 84647761 TC SNP_A-1780419 rs6576700
# 5 156323558 GG SNP_A-1780418 rs17054099
def read1(line):
formats = [str, int, sorted, str, str]
vals = map( lambda f, x: f(x), formats, line.split() )
try:
chr = str(int(vals[0]))
except:
chr = convertMAQChr(vals[0])
#print 'CHR', chr, vals[0]
return GenotypeCall( chr, vals[1], vals[2], False, 100 )
file = open(filename)
file.readline() # skip header
#affyData = map( read1, file )
affyData = []
for index, line in enumerate(file):
affyData.append(read1(line))
if index > OPTIONS.debug_lines:
break
if index % 10000 == 0:
print index
# Give a chance to use list before creating dictionary
return affyData
#print "1111111"
#return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )
def equalSNPs( snp1, snp2 ):
return snp1.genotype == snp2.genotype
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
# counts = dict(states)
#
# def incShared(state, equalP ):
# counts[state] += 1
# if equalP:
# counts[state + '-equal'] += 1
#
# nTestSites = 0
# for i, testSNP in izip( count(), testSet ):
# if includeVector <> None and not includeVector[i]:
# # we are skiping this site
# continue
#
# nTestSites += 1
#
# if testSNP.isSNP():
# counts['test-snp'] += 1
#
# #print testSNP.site()
# if testSNP.site() in truthSet:
# truth = truthSet[testSNP.site()]
# eql = equalSNPs( testSNP, truth )
#
# incShared( 'shared', eql )
# if testSNP.isSNP():
# if truth.isHOM(): incShared( 'hom-snp', eql )
# else: incShared( 'het-snp', eql )
# else:
# if truth.isHOM(): incShared( 'hom-ref', eql )
# else: incShared( 'het-ref', eql )
#
# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, counts
#
# counts['tested'] = nTestedSites
#
# return counts
class ConcordanceData:
def __init__(self, name, file1count, file2count):
self.name = name
self.nFile1Sites = file1count # num sites in file 1
self.nFile2Sites = file2count # num sites in file 1
self.nSharedSites = 0 # num SNP pairs that map to same position on the genome
self.nEqualSites = 0 # num SNPs pars with the same genotype
def inc( self, truthSNP, testSNP ):
self.nSharedSites += 1
if equalSNPs( testSNP, truthSNP ): # if the genotypes are equal
self.nEqualSites += 1
def rate(self):
return (100.0 * self.nEqualSites) / max(self.nSharedSites,1)
def __str__(self):
return '%d %d %.2f' % ( self.nSharedSites, self.nEqualSites, self.rate() )
def concordance( truthSet, testSet, sharedSites = None ):
# calculates a bunch of useful stats about the two
# data genotype call sets above
#
# The 2 calls in main work like this:
# affy, snp1, snp1_snp2_shared
# affy, snp2, snp1_snp2_shared
nTestSites = 0
# Now for each of the calls to concordance, we generate 3 sets:
# - allData: all SNP1 sites that are also in Affy
allData = ConcordanceData('all', len(truthSet), len(testSet))
# - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy
sharedData = ConcordanceData('shared', len(truthSet), len(testSet))
# - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy
uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))
for i, testSNP in izip( count(), testSet ):
nTestSites += 1
if testSNP.site() in truthSet:
truthSNP = truthSet[testSNP.site()]
allData.inc( truthSNP, testSNP )
if sharedSites <> None:
if testSNP.site() in sharedSites:
sharedData.inc( truthSNP, testSNP )
else:
uniqueData.inc( truthSNP, testSNP )
if OPTIONS.verbose and nTestSites % 100000 == 0:
#print nTestSites, nSharedSites, nEqualSites
print nTestSites, allData, sharedData, uniqueData
return nTestSites, allData, sharedData, uniqueData
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
# counts = dict(states)
#
# nTestSites = 0
# nSharedSites = 0
# nEqualSites = 0
# for i, testSNP in izip( count(), testSet ):
# nTestSites += 1
# #print testSNP.site()
# if testSNP.site() in truthSet:
# nSharedSites += 1
# if equalSNPs( testSNP, truthSet[testSNP.site()] ):
# nEqualSites += 1
# #else:
# # print '~', testSNP, truthSet[testSNP.site()]
# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
#
# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]
def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
nTestSites, allData, sharedData, uniqueData = results
def print1(data):
print '------------------------------------------------------------'
print 'Concordance results', data.name, 'sites'
print ' -> Genotype file1', filename1
print ' -> Genotype file2', filename2
print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites)
print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites)
print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites))
print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites))
print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites)
print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate())
print1(allData)
if hasSharedSites:
print1( sharedData ) # shared between SNP1 and SNP2
print1( uniqueData ) # unique to SNP1 or to SNP2 only
def dump_shared_snps( affys, snp_list1, snp_list2 ):
print len(affys), len(snp_list1), len(snp_list2)
snps1 = snpMAP(snp_list1)
snps2 = snpMAP(snp_list2)
snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)
snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)
affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)
snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)
print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)
fsnp = open ("snp.tab","w")
print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"
for site in snp1and2_affy_sites:
snp1 = snps1[site]
snp2 = snps2[site]
affy = affys[site]
print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
try:
snp1div2 = snp1.lod / snp2.lod
except ZeroDivisionError:
snp1div2 = 1000
print >>fsnp, "%5.2f" % snp1div2,
print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,
print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),
# Calculte meta_lod from the two lods
if snp1.genotype == snp2.genotype:
meta_lod = snp1.lod + snp2.lod
meta_genotype = snp1.genotype
else:
if snp1.lod > snp2.lod:
meta_lod = snp1.lod
meta_genotype = snp1.genotype
else:
meta_lod = snp2.lod
meta_genotype = snp2.genotype
meta_inc = meta_genotype != affy.genotype
print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
print >>fsnp, affy.ref_het_hom(),
print >>fsnp, affy.refbase()
def intersection_union_snps( affy, snps1, snps2 ):
map1 = snpMAP(snps1)
map2 = snpMAP(snps2)
shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())
#shared = []
#for site in shared_sites:
# shared.append((map1[site], map2[site]))
#print "Shared:",len( shared )
#shared_x = [s[0].lod for s in shared]
#shared_y = [s[1].lod for s in shared]
both_corr = []
snp1_corr = []
snp2_corr = []
neither_corr = []
# given two bools telling whether snp1 and snp2 are correct,
# return the correct object
#
# snp2 incorrect, snp2 correct
truth_list = [[neither_corr, snp2_corr], # snp1 incorrect
[snp1_corr, both_corr]] # snp1 correct
for site in shared_affy_sites:
snp1_true = equalSNPs(map1[site], affy[site])
snp2_true = equalSNPs(map2[site], affy[site])
truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )
print "Beginning plot..."
import rpy2.robjects as robj
robj.r('X11(width=15, height=15)')
XY_MAX = 25
plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
robj.r.plot([0, XY_MAX], [0, XY_MAX], \
xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
main = "Shared SNP site LODs", \
xlim = robj.FloatVector([0,XY_MAX]), \
ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
pch = 19, \
cex = 0.0)
for xy, color in plots:
print "X"
robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
print "Color:",color
print "Len:",len(xy)
#print "\n".join(["%25s %25s" % pt for pt in xy])
#print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
#robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
raw_input("Press enter to continue")
return
ss1 = set(snps1)
ss2 = set(snps2)
print "Shared:",len( ss1.intersection(ss2) )
print "Snp1 only:",len( ss1.difference(ss2) )
print "Snp2 only:",len( ss2.difference(ss1) )
print "Snp1 total:",len( ss1 )
print "Snp2 total:",len( ss2 )
def count_het_sites(snp_list):
hets = 0
for snp in snp_list:
if snp.isHET():
hets += 1
print hets,"hets,",len(snp_list),"total,",
print "%.1f" % (float(hets)/len(snp_list)*100)
def main(argv):
global OPTIONS, ROOT
usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
parser = OptionParser(usage=usage)
parser.add_option("-t", "--truth", dest="affy",
type="string", default=None,
help="Affy truth file")
parser.add_option("-1", "--snp1", dest="snp1",
type="string", default=None,
help="Affy truth file")
parser.add_option("-2", "--snp2", dest="snp2",
type="string", default=None,
help="Affy truth file")
parser.add_option("", "--f1", dest="format1",
type="string", default=None,
help="File type of snpfile1")
parser.add_option("", "--f2", dest="format2",
type="string", default=None,
help="File type of snpfile2")
parser.add_option("-l", "--lod", dest="lod",
type="float", default=5.0,
help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="Verbose output")
parser.add_option("-d", "--debug_lines", dest="debug_lines",
type='float', default=sys.maxint,
help="Number of input data lines to process for debugging")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
if OPTIONS.affy == None:
parser.error("No affy data specified")
if OPTIONS.format1 == None:
parser.error("First format cannot be none")
if OPTIONS.snp2 <> None and OPTIONS.format2 == None:
parser.error("snp2 file given but format was not specified")
# Load reference genome
#ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
#sys.exit()
if 1:
print "Reading Affy truth data..."
#readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )
readAffyFile2 = time_func(readAffyFile)
affy_list = readAffyFile2( filename=OPTIONS.affy )
#count_het_sites(affy_list)
#sys.exit()
affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )
print 'Read affy truth data:'
print ' -> number of genotyped loci', len(affy)
#readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )
readSNPfile2 = time_func(readSNPfile)
print "Reading SNPs 1 file..."
snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )
if OPTIONS.snp2 <> None:
print "Reading SNPs 2 file..."
snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )
dump_shared_snps( affy, snps1, snps2 )
#intersection_union_snps( affy, snps1, snps2 )
#sys.exit()
sharedSites = None
if OPTIONS.snp2 <> None:
sharedSites = overlappingSites( snps1, snps2 )
results1 = concordance( affy, snps1, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
results2 = concordance( affy, snps2, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )
else:
results = concordance( affy, snps1 )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )
if __name__ == "__main__":
main(sys.argv)