gatk-3.8/python/picard_utils.py

52 lines
2.1 KiB
Python
Executable File

import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
import re
import glob
#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"]
#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637']
lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"]
idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"]
ids = dict(zip(lanes, idsList))
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
analysis = "CombineDuplicates"
def getPicardPath(lane, picardRoot = '/seq/picard/'):
flowcell, laneNo = lane.split('.')
filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*')
dirs = glob.glob(filePat)
print dirs
if len(dirs) > 1:
system.exit("Bad lane -- too many directories matching pattern " + filePat)
return dirs[0]
def getReferenceGenotypeFileFromConcordanceFile(concordFile):
# REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli
p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)')
for line in open(concordFile):
match = p.search(line)
print 'Match is', line, match
if match <> None:
return match.group(1)
return None
def concord(options, geli, output, genotypeFile):
return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )
def readPicardConcordance(file):
p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE')
# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE
# HOMOZYGOUS_REFERENCE 853 853 0 1
# HETEROZYGOUS 416 413 3 0.992788
# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979
types = [str, int, int, int, float]
def parse1(line):
return [f(x) for f, x in zip(types, line.split())]
data = [parse1(line) for line in open(file) if p.match(line) <> None]
return data