From dd978dd525a8abba6787058a029d98d2c1e5c1bc Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 9 Jul 2010 00:13:35 +0000 Subject: [PATCH] misc. changes to python scripts git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a --- python/1kgStatsForCalls.py | 32 +++++++++++++++++++++----------- python/farm_commands2.py | 2 +- python/madPipelineUtils.py | 2 +- python/picard_utils.py | 7 +++---- python/realignBamByChr.py | 2 +- python/vcfReader.py | 3 ++- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py index ff0c9e9cd..38c368a48 100755 --- a/python/1kgStatsForCalls.py +++ b/python/1kgStatsForCalls.py @@ -11,7 +11,7 @@ import string def average(l): sum = reduce(operator.add, l, 0) - return sum / len(l) + return sum / (1.0 * len(l)) def printHeaderSep(): print @@ -104,12 +104,17 @@ def findVariantEvalResults(key, file, type=str): def getDBSNPRate(file): if file != None: key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]" - return findVariantEvalResults(key, file, float)[0] + r = findVariantEvalResults(key, file, float) + if len(r) > 0: + return r[0] + else: + return -1 else: return -1 def useProject(project): - return OPTIONS.project == None or project == OPTIONS.project + #print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project) + return OPTIONS.project == None or re.match(OPTIONS.project, project) != None def countMappedBases(samples, alignmentIndex): if ( OPTIONS.coverageFile != None ): @@ -148,16 +153,21 @@ def readMappedBasesFromBAS(samples, alignmentIndex): def countSNPs(samples, snpsVCF, useIndels = False): total = 0 + novel = 0 header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF)) sampleIDs = columnNames[9:] + print 'Counting SNPs...' for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False): if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ): break - if vcf.passesFilters(): + + if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']: if ( vcf.isVariant() ): total += 1 + + if ( vcf.isNovel() ): novel += 1 if ( OPTIONS.verbose and total % 10000 == 0 ): print ' progress', vcf.getChrom(), vcf.getPos() @@ -176,7 +186,7 @@ def countSNPs(samples, snpsVCF, useIndels = False): samples[sampleID].nSNPs += 1 printStatus(samples) - return total + return total, novel def countIndels(samples, indelsVCF): total = 0 @@ -184,7 +194,7 @@ def countIndels(samples, indelsVCF): if ( indelsVCF != None ): return countSNPs(samples, indelsVCF, True) - return total + return total, 0 def readSamples(vcf): print 'Reading samples for', OPTIONS.population @@ -229,11 +239,11 @@ if __name__ == "__main__": ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex) totalBases = countBases(samples, OPTIONS.sequenceIndex) meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples - totalSNPs = countSNPs(samples, OPTIONS.snps) - totalIndels = countIndels(samples, OPTIONS.indels) + totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps) + totalIndels, novelIndels = countIndels(samples, OPTIONS.indels) snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE) - indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE) + indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE) out = sys.stdout if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w') @@ -247,8 +257,8 @@ if __name__ == "__main__": print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome) print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate) - print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues())) + print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues())) print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate) - print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues())) + print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues())) out.close() diff --git a/python/farm_commands2.py b/python/farm_commands2.py index 1e8488b54..239828f5b 100755 --- a/python/farm_commands2.py +++ b/python/farm_commands2.py @@ -133,7 +133,7 @@ def buildExecutionString(job, farm_queue = None, debug = True): #farm_stdout = None farm_stdout = "%J.lsf.output" - cmd_str = "bsub -q " + farm_queue + cmd_str = "bsub -r -q " + farm_queue if farm_stdout != None: cmd_str += " -o " + farm_stdout diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py index 560a78d03..23c333b3d 100755 --- a/python/madPipelineUtils.py +++ b/python/madPipelineUtils.py @@ -45,7 +45,7 @@ def appendExtension(path, newExt, addExtension = True): # return os.path.join(OPTIONS.dir, s) class PipelineArgs: - def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ): + def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '3g', excludeChrs = [] ): self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO ' self.ref = ref self.name = name diff --git a/python/picard_utils.py b/python/picard_utils.py index 2a44b31f5..441c06936 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -154,7 +154,7 @@ def aggregateGeliCalls( sortedGeliCalls ): #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] -def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ): +def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ): if useSamtools: return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles) else: @@ -166,13 +166,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, if MSD: MSDStr = 'MSD=true' return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) - #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) -def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ): +def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ): if type(inputFiles) <> list: inputFiles = list(inputFiles) - return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) ) + return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) ) def getPicardPath(lane, picardRoot = '/seq/picard/'): flowcell, laneNo = lane.split('.') diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py index bf443e069..31cdfa67e 100755 --- a/python/realignBamByChr.py +++ b/python/realignBamByChr.py @@ -113,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ): def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ): outputBAM = outputRoot + ".bam" - GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr) + GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr) return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ): diff --git a/python/vcfReader.py b/python/vcfReader.py index 14c4d84e6..74e7862f2 100755 --- a/python/vcfReader.py +++ b/python/vcfReader.py @@ -74,7 +74,7 @@ class VCFRecord: def getFilter(self): return self.get("FILTER") def failsFilters(self): return not self.passesFilters() def passesFilters(self): - v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES' + v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASS' #print self.getFilter(), ">>>", v, self return v @@ -180,6 +180,7 @@ def readVCFHeader(lines): return header, columnNames, itertools.chain([line], lines) # we reach this point for empty files + #print 'header is', header return header, columnNames, [] def quickCountRecords(lines):