From d6b036cdabceca2e7a3137182f553d1e6bb75a00 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 7 May 2010 21:34:46 +0000 Subject: [PATCH] Minor improvements to simple python code git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3330 348d0f76-0448-11de-a6fe-93d51630548a --- python/indelVerboseStats.py | 45 +++++++++++++++++++++++++++++++++++++ python/madPipelineUtils.py | 28 ++++++++++++++--------- python/realignBamByChr.py | 4 ++-- python/vcf2table.py | 14 ++++++++---- 4 files changed, 74 insertions(+), 17 deletions(-) create mode 100755 python/indelVerboseStats.py diff --git a/python/indelVerboseStats.py b/python/indelVerboseStats.py new file mode 100755 index 000000000..55b9736c7 --- /dev/null +++ b/python/indelVerboseStats.py @@ -0,0 +1,45 @@ +from farm_commands2 import * +import os.path +import sys +from optparse import OptionParser +from datetime import date +import glob +import operator +import faiReader +import math +import shutil +import string +from madPipelineUtils import * + +def main(): + global OPTIONS + usage = "usage: %prog [options] indels.verbose.txt" + parser = OptionParser(usage=usage) + parser.add_option("-v", "--verbose", dest="verbose", + action='store_true', default=False, + help="") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 1: + parser.error("incorrect number of arguments") + + indelsFile = args[0] + + print 'event size nReadsWithIndel nReadsWithoutIndel nReadsTotal' + for line in open(indelsFile): + try: + parts = line.split() + # chr1 30502 30505 -TTT OBS_COUNTS[C/A/T]:5/5/13 AV_MM[C/R]:0.00/0.75 AV_MAPQ[C/R]:37.00/27.13 NQS_MM_RATE[C/R]:0.00/0.01 NQS_AV_QUAL[C/R]:30.90/28.30 STRAND_COUNTS[C/C/R/R]:2/3/6/2 + chr, start, stop = parts[0:3] + event = parts[3] + counts = parts[4] + isInsertion = event[0] == '+' + size = len(event[1:]) + nConsensusReads, nReadsWithIndel, nReads = map(int, counts.split(':')[1].split('/')) + nReadsWithoutIndel = nReads - nReadsWithIndel + print event[0], size, nReadsWithIndel, nReadsWithoutIndel, nReads + except: + continue + +if __name__ == "__main__": + main() diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py index 2b24c6cd2..83f275f30 100755 --- a/python/madPipelineUtils.py +++ b/python/madPipelineUtils.py @@ -10,9 +10,9 @@ import math import shutil import string -GATK_STABLE = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar' -GATK_DEV = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar' -GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_STABLE + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO ' +GATK_STABLE_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar' +GATK_DEV_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar' +GATK_JAR = GATK_STABLE_JAR # add to GATK to enable dbSNP aware cleaning # -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod @@ -23,7 +23,6 @@ GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_S hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY'] b36 = [str(i) for i in range(1,23)] + ['X', 'Y'] - HG18_TO_B36 = { 'hg18' : 'b36', '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta', @@ -46,17 +45,17 @@ def appendExtension(path, newExt, addExtension = True): # return os.path.join(OPTIONS.dir, s) class PipelineArgs: - def __init__( self, GATK = GATK, ref = 'hg18', name = None, memory = '4g' ): - self.GATK = GATK + def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ): + self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO ' self.ref = ref self.name = name self.memory = memory def convertToB36(self): - return this.ref == 'b36' + return self.ref == 'b36' def addGATKArg(self, arg): - if arg[0] != ' ': + if len(arg) > 0 and arg[0] != ' ': arg = ' ' + arg self.GATK += arg @@ -67,8 +66,8 @@ class PipelineArgs: return suffix def finalizedGATKCommand(self, args): - cmd = (self.GATK % self.memory) + args - if self.convertToB36: + cmd = (self.GATK % self.memory) + ' ' + args + if self.convertToB36(): cmd = hg18args_to_b36(cmd) return cmd @@ -83,11 +82,18 @@ def simpleGATKCommand( pargs, name, args, lastJobs ): # Takes a simpleGATKCommand and splits it by chromosome, merging output # def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ): + if cmd.cmd_str_from_user.find(" -L") != -1: + sys.exit("Found -L argument in command -- cannot be provided for parallelization by chromosome: " + cmd.cmd_str_from_user) def makeChrCmd(chr): chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize) - chr_cmd_str = cmd.cmd_str_from_user + chr_cmd_str = cmd.cmd_str_from_user + chr_cmd_str += ' -L ' + chr for x, y in zip(outputsToParallelize, chrOutputMap): chr_cmd_str = chr_cmd_str.replace(x, y) + + if myPipelineArgs.convertToB36(): + chr_cmd_str = hg18args_to_b36(chr_cmd_str) + chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies) return chrCmd, chrOutputMap diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py index 79aa999dd..9b70c0fc3 100755 --- a/python/realignBamByChr.py +++ b/python/realignBamByChr.py @@ -88,12 +88,12 @@ def main(): def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ): outputIntervals = outputRoot + ".intervals" - GATKArgs = '-T RealignerTargetCreator -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr) + GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr) return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ): outputBAM = outputRoot + ".bam" - GATKArgs = '-T IndelRealigner -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr) + GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr) return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): diff --git a/python/vcf2table.py b/python/vcf2table.py index a2e26603e..f770c73ce 100755 --- a/python/vcf2table.py +++ b/python/vcf2table.py @@ -4,10 +4,10 @@ from optparse import OptionParser from vcfReader import * if __name__ == "__main__": - usage = "usage: %prog files.list [options]" + usage = "usage: %prog [file.vcf | if absent stdin] [options]" parser = OptionParser(usage=usage) parser.add_option("-f", "--f", dest="fields", - type='string', default="", + type='string', default=None, help="Comma separated list of fields to exact") parser.add_option("-e", "--filter", dest="filter", action='store_true', default=False, @@ -17,13 +17,19 @@ if __name__ == "__main__": help="Only print out every 1 / skip records") (OPTIONS, args) = parser.parse_args() - if len(args) != 0: + if len(args) > 1: parser.error("incorrect number of arguments") counter = OPTIONS.skip + src = sys.stdin + if len(args) > 0: + src = open(args[0]) + + if OPTIONS.fields == None: + sys.exit("Fields argument must be provided") fields = OPTIONS.fields.split(',') - for header, vcf, count in lines2VCF(sys.stdin, extendedOutput = True): + for header, vcf, count in lines2VCF(src, extendedOutput = True): #print vcf, count if count == 1 and vcf.hasHeader(): print '\t'.join(fields)