Minor improvements to simple python code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3330 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
504103bd15
commit
d6b036cdab
|
|
@ -0,0 +1,45 @@
|
||||||
|
from farm_commands2 import *
|
||||||
|
import os.path
|
||||||
|
import sys
|
||||||
|
from optparse import OptionParser
|
||||||
|
from datetime import date
|
||||||
|
import glob
|
||||||
|
import operator
|
||||||
|
import faiReader
|
||||||
|
import math
|
||||||
|
import shutil
|
||||||
|
import string
|
||||||
|
from madPipelineUtils import *
|
||||||
|
|
||||||
|
def main():
|
||||||
|
global OPTIONS
|
||||||
|
usage = "usage: %prog [options] indels.verbose.txt"
|
||||||
|
parser = OptionParser(usage=usage)
|
||||||
|
parser.add_option("-v", "--verbose", dest="verbose",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="")
|
||||||
|
|
||||||
|
(OPTIONS, args) = parser.parse_args()
|
||||||
|
if len(args) != 1:
|
||||||
|
parser.error("incorrect number of arguments")
|
||||||
|
|
||||||
|
indelsFile = args[0]
|
||||||
|
|
||||||
|
print 'event size nReadsWithIndel nReadsWithoutIndel nReadsTotal'
|
||||||
|
for line in open(indelsFile):
|
||||||
|
try:
|
||||||
|
parts = line.split()
|
||||||
|
# chr1 30502 30505 -TTT OBS_COUNTS[C/A/T]:5/5/13 AV_MM[C/R]:0.00/0.75 AV_MAPQ[C/R]:37.00/27.13 NQS_MM_RATE[C/R]:0.00/0.01 NQS_AV_QUAL[C/R]:30.90/28.30 STRAND_COUNTS[C/C/R/R]:2/3/6/2
|
||||||
|
chr, start, stop = parts[0:3]
|
||||||
|
event = parts[3]
|
||||||
|
counts = parts[4]
|
||||||
|
isInsertion = event[0] == '+'
|
||||||
|
size = len(event[1:])
|
||||||
|
nConsensusReads, nReadsWithIndel, nReads = map(int, counts.split(':')[1].split('/'))
|
||||||
|
nReadsWithoutIndel = nReads - nReadsWithIndel
|
||||||
|
print event[0], size, nReadsWithIndel, nReadsWithoutIndel, nReads
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -10,9 +10,9 @@ import math
|
||||||
import shutil
|
import shutil
|
||||||
import string
|
import string
|
||||||
|
|
||||||
GATK_STABLE = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
|
GATK_STABLE_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
|
||||||
GATK_DEV = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
GATK_DEV_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
||||||
GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_STABLE + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
|
GATK_JAR = GATK_STABLE_JAR
|
||||||
|
|
||||||
# add to GATK to enable dbSNP aware cleaning
|
# add to GATK to enable dbSNP aware cleaning
|
||||||
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
|
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
|
||||||
|
|
@ -23,7 +23,6 @@ GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_S
|
||||||
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
|
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
|
||||||
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
|
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
|
||||||
|
|
||||||
|
|
||||||
HG18_TO_B36 = {
|
HG18_TO_B36 = {
|
||||||
'hg18' : 'b36',
|
'hg18' : 'b36',
|
||||||
'/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta',
|
'/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta',
|
||||||
|
|
@ -46,17 +45,17 @@ def appendExtension(path, newExt, addExtension = True):
|
||||||
# return os.path.join(OPTIONS.dir, s)
|
# return os.path.join(OPTIONS.dir, s)
|
||||||
|
|
||||||
class PipelineArgs:
|
class PipelineArgs:
|
||||||
def __init__( self, GATK = GATK, ref = 'hg18', name = None, memory = '4g' ):
|
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ):
|
||||||
self.GATK = GATK
|
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
|
||||||
self.ref = ref
|
self.ref = ref
|
||||||
self.name = name
|
self.name = name
|
||||||
self.memory = memory
|
self.memory = memory
|
||||||
|
|
||||||
def convertToB36(self):
|
def convertToB36(self):
|
||||||
return this.ref == 'b36'
|
return self.ref == 'b36'
|
||||||
|
|
||||||
def addGATKArg(self, arg):
|
def addGATKArg(self, arg):
|
||||||
if arg[0] != ' ':
|
if len(arg) > 0 and arg[0] != ' ':
|
||||||
arg = ' ' + arg
|
arg = ' ' + arg
|
||||||
self.GATK += arg
|
self.GATK += arg
|
||||||
|
|
||||||
|
|
@ -67,8 +66,8 @@ class PipelineArgs:
|
||||||
return suffix
|
return suffix
|
||||||
|
|
||||||
def finalizedGATKCommand(self, args):
|
def finalizedGATKCommand(self, args):
|
||||||
cmd = (self.GATK % self.memory) + args
|
cmd = (self.GATK % self.memory) + ' ' + args
|
||||||
if self.convertToB36:
|
if self.convertToB36():
|
||||||
cmd = hg18args_to_b36(cmd)
|
cmd = hg18args_to_b36(cmd)
|
||||||
return cmd
|
return cmd
|
||||||
|
|
||||||
|
|
@ -83,11 +82,18 @@ def simpleGATKCommand( pargs, name, args, lastJobs ):
|
||||||
# Takes a simpleGATKCommand and splits it by chromosome, merging output
|
# Takes a simpleGATKCommand and splits it by chromosome, merging output
|
||||||
#
|
#
|
||||||
def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ):
|
def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ):
|
||||||
|
if cmd.cmd_str_from_user.find(" -L") != -1:
|
||||||
|
sys.exit("Found -L argument in command -- cannot be provided for parallelization by chromosome: " + cmd.cmd_str_from_user)
|
||||||
def makeChrCmd(chr):
|
def makeChrCmd(chr):
|
||||||
chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize)
|
chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize)
|
||||||
chr_cmd_str = cmd.cmd_str_from_user
|
chr_cmd_str = cmd.cmd_str_from_user
|
||||||
|
chr_cmd_str += ' -L ' + chr
|
||||||
for x, y in zip(outputsToParallelize, chrOutputMap):
|
for x, y in zip(outputsToParallelize, chrOutputMap):
|
||||||
chr_cmd_str = chr_cmd_str.replace(x, y)
|
chr_cmd_str = chr_cmd_str.replace(x, y)
|
||||||
|
|
||||||
|
if myPipelineArgs.convertToB36():
|
||||||
|
chr_cmd_str = hg18args_to_b36(chr_cmd_str)
|
||||||
|
|
||||||
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
|
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
|
||||||
return chrCmd, chrOutputMap
|
return chrCmd, chrOutputMap
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -88,12 +88,12 @@ def main():
|
||||||
|
|
||||||
def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
||||||
outputIntervals = outputRoot + ".intervals"
|
outputIntervals = outputRoot + ".intervals"
|
||||||
GATKArgs = '-T RealignerTargetCreator -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
|
GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
|
||||||
return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals
|
return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals
|
||||||
|
|
||||||
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
||||||
outputBAM = outputRoot + ".bam"
|
outputBAM = outputRoot + ".bam"
|
||||||
GATKArgs = '-T IndelRealigner -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
|
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
|
||||||
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
||||||
|
|
||||||
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,10 @@ from optparse import OptionParser
|
||||||
from vcfReader import *
|
from vcfReader import *
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
usage = "usage: %prog files.list [options]"
|
usage = "usage: %prog [file.vcf | if absent stdin] [options]"
|
||||||
parser = OptionParser(usage=usage)
|
parser = OptionParser(usage=usage)
|
||||||
parser.add_option("-f", "--f", dest="fields",
|
parser.add_option("-f", "--f", dest="fields",
|
||||||
type='string', default="",
|
type='string', default=None,
|
||||||
help="Comma separated list of fields to exact")
|
help="Comma separated list of fields to exact")
|
||||||
parser.add_option("-e", "--filter", dest="filter",
|
parser.add_option("-e", "--filter", dest="filter",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
|
|
@ -17,13 +17,19 @@ if __name__ == "__main__":
|
||||||
help="Only print out every 1 / skip records")
|
help="Only print out every 1 / skip records")
|
||||||
|
|
||||||
(OPTIONS, args) = parser.parse_args()
|
(OPTIONS, args) = parser.parse_args()
|
||||||
if len(args) != 0:
|
if len(args) > 1:
|
||||||
parser.error("incorrect number of arguments")
|
parser.error("incorrect number of arguments")
|
||||||
|
|
||||||
counter = OPTIONS.skip
|
counter = OPTIONS.skip
|
||||||
|
src = sys.stdin
|
||||||
|
if len(args) > 0:
|
||||||
|
src = open(args[0])
|
||||||
|
|
||||||
|
if OPTIONS.fields == None:
|
||||||
|
sys.exit("Fields argument must be provided")
|
||||||
|
|
||||||
fields = OPTIONS.fields.split(',')
|
fields = OPTIONS.fields.split(',')
|
||||||
for header, vcf, count in lines2VCF(sys.stdin, extendedOutput = True):
|
for header, vcf, count in lines2VCF(src, extendedOutput = True):
|
||||||
#print vcf, count
|
#print vcf, count
|
||||||
if count == 1 and vcf.hasHeader():
|
if count == 1 and vcf.hasHeader():
|
||||||
print '\t'.join(fields)
|
print '\t'.join(fields)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue