Minor improvements to simple python code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3330 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
504103bd15
commit
d6b036cdab
|
|
@ -0,0 +1,45 @@
|
|||
from farm_commands2 import *
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from datetime import date
|
||||
import glob
|
||||
import operator
|
||||
import faiReader
|
||||
import math
|
||||
import shutil
|
||||
import string
|
||||
from madPipelineUtils import *
|
||||
|
||||
def main():
|
||||
global OPTIONS
|
||||
usage = "usage: %prog [options] indels.verbose.txt"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-v", "--verbose", dest="verbose",
|
||||
action='store_true', default=False,
|
||||
help="")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 1:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
indelsFile = args[0]
|
||||
|
||||
print 'event size nReadsWithIndel nReadsWithoutIndel nReadsTotal'
|
||||
for line in open(indelsFile):
|
||||
try:
|
||||
parts = line.split()
|
||||
# chr1 30502 30505 -TTT OBS_COUNTS[C/A/T]:5/5/13 AV_MM[C/R]:0.00/0.75 AV_MAPQ[C/R]:37.00/27.13 NQS_MM_RATE[C/R]:0.00/0.01 NQS_AV_QUAL[C/R]:30.90/28.30 STRAND_COUNTS[C/C/R/R]:2/3/6/2
|
||||
chr, start, stop = parts[0:3]
|
||||
event = parts[3]
|
||||
counts = parts[4]
|
||||
isInsertion = event[0] == '+'
|
||||
size = len(event[1:])
|
||||
nConsensusReads, nReadsWithIndel, nReads = map(int, counts.split(':')[1].split('/'))
|
||||
nReadsWithoutIndel = nReads - nReadsWithIndel
|
||||
print event[0], size, nReadsWithIndel, nReadsWithoutIndel, nReads
|
||||
except:
|
||||
continue
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -10,9 +10,9 @@ import math
|
|||
import shutil
|
||||
import string
|
||||
|
||||
GATK_STABLE = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
|
||||
GATK_DEV = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
||||
GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_STABLE + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
|
||||
GATK_STABLE_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar'
|
||||
GATK_DEV_JAR = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
||||
GATK_JAR = GATK_STABLE_JAR
|
||||
|
||||
# add to GATK to enable dbSNP aware cleaning
|
||||
# -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod
|
||||
|
|
@ -23,7 +23,6 @@ GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_S
|
|||
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX', 'chrY']
|
||||
b36 = [str(i) for i in range(1,23)] + ['X', 'Y']
|
||||
|
||||
|
||||
HG18_TO_B36 = {
|
||||
'hg18' : 'b36',
|
||||
'/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' : '/broad/1KG/reference/human_b36_both.fasta',
|
||||
|
|
@ -46,17 +45,17 @@ def appendExtension(path, newExt, addExtension = True):
|
|||
# return os.path.join(OPTIONS.dir, s)
|
||||
|
||||
class PipelineArgs:
|
||||
def __init__( self, GATK = GATK, ref = 'hg18', name = None, memory = '4g' ):
|
||||
self.GATK = GATK
|
||||
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g' ):
|
||||
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
|
||||
self.ref = ref
|
||||
self.name = name
|
||||
self.memory = memory
|
||||
|
||||
def convertToB36(self):
|
||||
return this.ref == 'b36'
|
||||
return self.ref == 'b36'
|
||||
|
||||
def addGATKArg(self, arg):
|
||||
if arg[0] != ' ':
|
||||
if len(arg) > 0 and arg[0] != ' ':
|
||||
arg = ' ' + arg
|
||||
self.GATK += arg
|
||||
|
||||
|
|
@ -67,8 +66,8 @@ class PipelineArgs:
|
|||
return suffix
|
||||
|
||||
def finalizedGATKCommand(self, args):
|
||||
cmd = (self.GATK % self.memory) + args
|
||||
if self.convertToB36:
|
||||
cmd = (self.GATK % self.memory) + ' ' + args
|
||||
if self.convertToB36():
|
||||
cmd = hg18args_to_b36(cmd)
|
||||
return cmd
|
||||
|
||||
|
|
@ -83,11 +82,18 @@ def simpleGATKCommand( pargs, name, args, lastJobs ):
|
|||
# Takes a simpleGATKCommand and splits it by chromosome, merging output
|
||||
#
|
||||
def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeCommands ):
|
||||
if cmd.cmd_str_from_user.find(" -L") != -1:
|
||||
sys.exit("Found -L argument in command -- cannot be provided for parallelization by chromosome: " + cmd.cmd_str_from_user)
|
||||
def makeChrCmd(chr):
|
||||
chrOutputMap = map(lambda x: appendExtension(x, chr), outputsToParallelize)
|
||||
chr_cmd_str = cmd.cmd_str_from_user
|
||||
chr_cmd_str = cmd.cmd_str_from_user
|
||||
chr_cmd_str += ' -L ' + chr
|
||||
for x, y in zip(outputsToParallelize, chrOutputMap):
|
||||
chr_cmd_str = chr_cmd_str.replace(x, y)
|
||||
|
||||
if myPipelineArgs.convertToB36():
|
||||
chr_cmd_str = hg18args_to_b36(chr_cmd_str)
|
||||
|
||||
chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
|
||||
return chrCmd, chrOutputMap
|
||||
|
||||
|
|
|
|||
|
|
@ -88,12 +88,12 @@ def main():
|
|||
|
||||
def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
|
||||
outputIntervals = outputRoot + ".intervals"
|
||||
GATKArgs = '-T RealignerTargetCreator -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
|
||||
GATKArgs = '-T RealignerTargetCreator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -o %s -mrl 10000 -L %s' % (inputBam, outputIntervals, chr)
|
||||
return simpleGATKCommand( myPipelineArgs, 'CreateInterval' + chr, GATKArgs, lastJobs ), outputIntervals
|
||||
|
||||
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
|
||||
outputBAM = outputRoot + ".bam"
|
||||
GATKArgs = '-T IndelRealigner -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
|
||||
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -sort ON_DISK -mrl 100000 -L %s' % (inputBam, intervals, outputBAM, chr)
|
||||
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
|
||||
|
||||
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ from optparse import OptionParser
|
|||
from vcfReader import *
|
||||
|
||||
if __name__ == "__main__":
|
||||
usage = "usage: %prog files.list [options]"
|
||||
usage = "usage: %prog [file.vcf | if absent stdin] [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-f", "--f", dest="fields",
|
||||
type='string', default="",
|
||||
type='string', default=None,
|
||||
help="Comma separated list of fields to exact")
|
||||
parser.add_option("-e", "--filter", dest="filter",
|
||||
action='store_true', default=False,
|
||||
|
|
@ -17,13 +17,19 @@ if __name__ == "__main__":
|
|||
help="Only print out every 1 / skip records")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 0:
|
||||
if len(args) > 1:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
counter = OPTIONS.skip
|
||||
src = sys.stdin
|
||||
if len(args) > 0:
|
||||
src = open(args[0])
|
||||
|
||||
if OPTIONS.fields == None:
|
||||
sys.exit("Fields argument must be provided")
|
||||
|
||||
fields = OPTIONS.fields.split(',')
|
||||
for header, vcf, count in lines2VCF(sys.stdin, extendedOutput = True):
|
||||
for header, vcf, count in lines2VCF(src, extendedOutput = True):
|
||||
#print vcf, count
|
||||
if count == 1 and vcf.hasHeader():
|
||||
print '\t'.join(fields)
|
||||
|
|
|
|||
Loading…
Reference in New Issue