189 lines
9.5 KiB
Python
Executable File
189 lines
9.5 KiB
Python
Executable File
import farm_commands
|
|
import os.path
|
|
import sys
|
|
from optparse import OptionParser
|
|
import string
|
|
import re
|
|
import glob
|
|
import unittest
|
|
import itertools
|
|
|
|
#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"]
|
|
#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637']
|
|
|
|
lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"]
|
|
idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"]
|
|
ids = dict(zip(lanes, idsList))
|
|
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
|
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
|
analysis = "CombineDuplicates"
|
|
|
|
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
|
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
|
|
|
|
def unique(l):
|
|
# NotÊorderÊpreservingÊÊÊÊ
|
|
return list(set(l))
|
|
|
|
def genotypes2heterozyosity(genotypes, nIndividuals = -1):
|
|
def isHET(genotype):
|
|
return genotype[0] <> genotype[1]
|
|
|
|
if nIndividuals == -1:
|
|
n = len(genotypes)
|
|
else:
|
|
n = nIndividuals
|
|
|
|
hets = filter( isHET, genotypes )
|
|
nhets = len(hets)
|
|
# print genotypes, ' => hets', hets
|
|
return [nhets / (1.0*n), nhets, n]
|
|
|
|
def genotypes2allele(genotypes, nIndividuals = -1):
|
|
def isHET(genotype):
|
|
return genotype[0] <> genotype[1]
|
|
|
|
if nIndividuals == -1:
|
|
n = len(genotypes)
|
|
else:
|
|
n = nIndividuals
|
|
|
|
hets = filter( isHET, genotypes )
|
|
nhets = len(hets)
|
|
print genotypes, ' => hets', hets
|
|
return [nhets / (1.0*n), nhets, n]
|
|
|
|
def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
|
|
#print 'geliCallsAtSite', geliCallsAtSite
|
|
loc = geliCallsAtSite[0]
|
|
refBases = map( lambda call: call[2], geliCallsAtSite[1] )
|
|
#print 'refBases', refBases
|
|
genotypes = map( lambda call: call[5], geliCallsAtSite[1] )
|
|
#print 'genotypes', genotypes
|
|
polymorphism = unique(list(refBases[0] + genotypes[0]))
|
|
if polymorphism[0] <> refBases[0]: polymorphism.reverse()
|
|
assert polymorphism[0] == refBases[0]
|
|
#print 'polymorphism', polymorphism
|
|
genotype = list(geliCallsAtSite[1][0][5])
|
|
|
|
return [loc, polymorphism, genotypes2heterozyosity(genotypes, nIndividuals), genotypes]
|
|
|
|
#return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))
|
|
|
|
def call2loc(call):
|
|
return call[0] + ':' + call[1]
|
|
|
|
def aggregateGeliCalls( sortedGeliCalls ):
|
|
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
|
|
|
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
|
|
if type(inputFiles) <> list:
|
|
inputFiles = list(inputFiles)
|
|
return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
|
|
|
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
|
flowcell, laneNo = lane.split('.')
|
|
filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*')
|
|
dirs = glob.glob(filePat)
|
|
print dirs
|
|
if len(dirs) > 1:
|
|
system.exit("Bad lane -- too many directories matching pattern " + filePat)
|
|
return dirs[0]
|
|
|
|
def getReferenceGenotypeFileFromConcordanceFile(concordFile):
|
|
# REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli
|
|
p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)')
|
|
for line in open(concordFile):
|
|
match = p.search(line)
|
|
print 'Match is', line, match
|
|
if match <> None:
|
|
return match.group(1)
|
|
return None
|
|
|
|
def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
|
|
return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)
|
|
|
|
def concord(options, geli, output, genotypeFile):
|
|
return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )
|
|
|
|
def readPicardConcordance(file):
|
|
p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE')
|
|
# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE
|
|
# HOMOZYGOUS_REFERENCE 853 853 0 1
|
|
# HETEROZYGOUS 416 413 3 0.992788
|
|
# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979
|
|
types = [str, int, int, int, float]
|
|
def parse1(line):
|
|
return [f(x) for f, x in zip(types, line.split())]
|
|
data = [parse1(line) for line in open(file) if p.match(line) <> None]
|
|
return data
|
|
|
|
# ------------------------------------------------------------------------------------------
|
|
#
|
|
# Unit testing!
|
|
#
|
|
# ------------------------------------------------------------------------------------------
|
|
class TestPicardUnils(unittest.TestCase):
|
|
def setUp(self):
|
|
import cStringIO
|
|
dataString = """chr1 1105366 T 52 99 CT 10.559975 10.559975 -117.68 -93.107178 -116.616493 -45.536842 -88.591728 -92.043671 -20.964022 -116.014435 -44.473339 -31.523996
|
|
chr1 1105411 G 22 99 AG 12.484722 12.484722 -23.995817 -27.909206 -10.875731 -27.909206 -46.579994 -29.546518 -46.579994 -23.360453 -29.546518 -46.579994
|
|
chr1 1105411 G 29 99 AG 12.033216 12.033216 -30.641142 -34.376297 -14.483982 -35.457623 -53.197636 -32.525024 -53.498665 -26.517199 -33.606354 -54.579994
|
|
chr1 1105857 G 6 99 AG 7.442399 2.096584 -7.55462 -9.3608 -5.458036 -9.3608 -20.279993 -16.37723 -20.279993 -12.900434 -16.37723 -20.279993
|
|
chr1 1105857 G 7 99 AG 10.889011 1.795554 -7.406977 -9.514187 -5.611423 -9.514187 -23.879993 -19.97723 -23.879993 -16.500435 -19.97723 -23.879993
|
|
chr1 1106094 T 20 99 CT 6.747106 6.747106 -56.979992 -43.143734 -56.806652 -23.699236 -41.036522 -42.97039 -9.862975 -56.505623 -23.525892 -16.610081
|
|
chr1 1110294 G 42 99 AG 21.076984 21.076984 -44.285015 -49.702267 -17.869579 -49.831242 -80.276649 -48.442669 -80.404335 -38.946564 -48.571644 -80.405624
|
|
chr1 1111204 C 26 99 CT 11.364679 11.364679 -55.479992 -31.040928 -55.479992 -36.424099 -23.349712 -31.040928 -11.985033 -55.479992 -36.424099 -32.811741
|
|
chr1 1111204 C 29 99 TT 34.740601 3.890135 -52.282055 -48.646442 -52.704597 -17.954794 -44.565525 -48.464859 -13.715057 -52.100475 -17.773212 -9.824923
|
|
chr1 1111204 C 31 99 CT 18.784479 18.784479 -71.079994 -39.870823 -71.079994 -44.303268 -31.878578 -39.870823 -13.094099 -71.079994 -44.303268 -39.48679
|
|
"""
|
|
dataFile = cStringIO.StringIO(dataString)
|
|
self.nIndividuals = 10
|
|
|
|
self.genotypesSets = aggregateGeliCalls(map( string.split, dataFile.readlines() ) )
|
|
self.genotypes = map(lambda x: aggregatedGeliCalls2SNP(x, self.nIndividuals), self.genotypesSets )
|
|
self.locs = ["chr1:1105366", "chr1:1105411", "chr1:1105857", "chr1:1106094", "chr1:1110294", "chr1:1111204"]
|
|
self.nhets = [1, 2, 2, 1, 1, 2]
|
|
self.altAlleles = [1, 2, 2, 1, 1, 4]
|
|
|
|
self.aaf = map( lambda x: (1.0*x) / self.nIndividuals, self.altAlleles )
|
|
self.hets = map( lambda x: (1.0*x) / self.nIndividuals, self.nhets )
|
|
|
|
def testGenotypesSize(self):
|
|
self.assertEqual(len(self.genotypesSets), 6)
|
|
|
|
def testGenotypes2Het(self):
|
|
print 'testGenotypes2Het...'
|
|
self.assertEqual(genotypes2heterozyosity(['AT']), [1, 1, 1])
|
|
self.assertEqual(genotypes2heterozyosity(['AA']), [0, 0, 1])
|
|
self.assertEqual(genotypes2heterozyosity(['TT']), [0, 0, 1])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'AT']), [1, 2, 2])
|
|
self.assertEqual(genotypes2heterozyosity(['AA', 'AA']), [0, 0, 2])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'AA']), [0.5, 1, 2])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'TT']), [0.5, 1, 2])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'TT', 'AA']), [1.0/3, 1, 3])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'AT', 'AA']), [2.0/3, 2, 3])
|
|
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'AT'], 10), [2.0/10, 2, 10])
|
|
self.assertEqual(genotypes2heterozyosity(['AT', 'AA'], 10), [1.0/10, 1, 10])
|
|
|
|
|
|
def testGenotypeSetLocs(self):
|
|
for set, loc in zip(self.genotypesSets, self.locs):
|
|
#print loc, set
|
|
self.assertEqual(set[0], loc)
|
|
|
|
def testGenotypeLocs(self):
|
|
for genotype, loc in zip(self.genotypes, self.locs):
|
|
self.assertEqual(genotype[0], loc)
|
|
|
|
def testGenotypeHets(self):
|
|
print 'testGenotypeHets:'
|
|
for genotype, het in zip(self.genotypes, self.hets):
|
|
print ' => ', genotype, het
|
|
self.assertEqual(genotype[2][0], het)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main()
|