Minor improvements to simple python code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3330 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-05-07 21:34:46 +00:00
parent 504103bd15
commit d6b036cdab
4 changed files with 74 additions and 17 deletions

View File

@ -0,0 +1,45 @@
from farm_commands2 import *
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import faiReader
import math
import shutil
import string
from madPipelineUtils import *
def main():
global OPTIONS
usage = "usage: %prog [options] indels.verbose.txt"
parser = OptionParser(usage=usage)
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="")
(OPTIONS, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
indelsFile = args[0]
print 'event size nReadsWithIndel nReadsWithoutIndel nReadsTotal'
for line in open(indelsFile):
try:
parts = line.split()
# chr1 30502 30505 -TTT OBS_COUNTS[C/A/T]:5/5/13 AV_MM[C/R]:0.00/0.75 AV_MAPQ[C/R]:37.00/27.13 NQS_MM_RATE[C/R]:0.00/0.01 NQS_AV_QUAL[C/R]:30.90/28.30 STRAND_COUNTS[C/C/R/R]:2/3/6/2
chr, start, stop = parts[0:3]
event = parts[3]
counts = parts[4]
isInsertion = event[0] == '+'
size = len(event[1:])
nConsensusReads, nReadsWithIndel, nReads = map(int, counts.split(':')[1].split('/'))
nReadsWithoutIndel = nReads - nReadsWithIndel
print event[0], size, nReadsWithIndel, nReadsWithoutIndel, nReads
except:
continue
if __name__ == "__main__":
main()

View File

@ -10,9 +10,9 @@ import math
import shutil
import string
GATK_STABLE = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
GATK_DEV = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_STABLE + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
GATK_STABLE_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
GATK_DEV_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
GATK_JAR = GATK_STABLE_JAR
# add to GATK to enable dbSNP aware cleaning
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
@ -23,7 +23,6 @@ GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_S
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
HG18_TO_B36 = {
'hg18' : 'b36',
'/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta',
@ -46,17 +45,17 @@ def appendExtension(path, newExt, addExtension = True):
# return os.path.join(OPTIONS.dir, s)
class PipelineArgs:
def __init__( self, GATK = GATK, ref = 'hg18', name = None, memory = '4g' ):
self.GATK = GATK
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ):
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
self.ref = ref
self.name = name
self.memory = memory
def convertToB36(self):
return this.ref == 'b36'
return self.ref == 'b36'
def addGATKArg(self, arg):
if arg[0] != ' ':
if len(arg) > 0 and arg[0] != ' ':
arg = ' ' + arg
self.GATK += arg
@ -67,8 +66,8 @@ class PipelineArgs:
return suffix
def finalizedGATKCommand(self, args):
cmd = (self.GATK % self.memory) + args
if self.convertToB36:
cmd = (self.GATK % self.memory) + ' ' + args
if self.convertToB36():
cmd = hg18args_to_b36(cmd)
return cmd
@ -83,11 +82,18 @@ def simpleGATKCommand( pargs, name, args, lastJobs ):
# Takes a simpleGATKCommand and splits it by chromosome, merging output
#
def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ):
if cmd.cmd_str_from_user.find(" -L") != -1:
sys.exit("Found -L argument in command -- cannot be provided for parallelization by chromosome: " + cmd.cmd_str_from_user)
def makeChrCmd(chr):
chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize)
chr_cmd_str = cmd.cmd_str_from_user
chr_cmd_str = cmd.cmd_str_from_user
chr_cmd_str += ' -L ' + chr
for x, y in zip(outputsToParallelize, chrOutputMap):
chr_cmd_str = chr_cmd_str.replace(x, y)
if myPipelineArgs.convertToB36():
chr_cmd_str = hg18args_to_b36(chr_cmd_str)
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
return chrCmd, chrOutputMap

View File

@ -88,12 +88,12 @@ def main():
def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
outputIntervals = outputRoot + ".intervals"
GATKArgs = '-T RealignerTargetCreator -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
outputBAM = outputRoot + ".bam"
GATKArgs = '-T IndelRealigner -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):

View File

@ -4,10 +4,10 @@ from optparse import OptionParser
from vcfReader import *
if __name__ == "__main__":
usage = "usage: %prog files.list [options]"
usage = "usage: %prog [file.vcf | if absent stdin] [options]"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--f", dest="fields",
type='string', default="",
type='string', default=None,
help="Comma separated list of fields to exact")
parser.add_option("-e", "--filter", dest="filter",
action='store_true', default=False,
@ -17,13 +17,19 @@ if __name__ == "__main__":
help="Only print out every 1 / skip records")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
if len(args) > 1:
parser.error("incorrect number of arguments")
counter = OPTIONS.skip
src = sys.stdin
if len(args) > 0:
src = open(args[0])
if OPTIONS.fields == None:
sys.exit("Fields argument must be provided")
fields = OPTIONS.fields.split(',')
for header, vcf, count in lines2VCF(sys.stdin, extendedOutput = True):
for header, vcf, count in lines2VCF(src, extendedOutput = True):
#print vcf, count
if count == 1 and vcf.hasHeader():
print '\t'.join(fields)