From dd978dd525a8abba6787058a029d98d2c1e5c1bc Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 9 Jul 2010 00:13:35 +0000
Subject: [PATCH] misc. changes to python scripts

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a
---
 python/1kgStatsForCalls.py | 32 +++++++++++++++++++++-----------
 python/farm_commands2.py   |  2 +-
 python/madPipelineUtils.py |  2 +-
 python/picard_utils.py     |  7 +++----
 python/realignBamByChr.py  |  2 +-
 python/vcfReader.py        |  3 ++-
 6 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py
index ff0c9e9cd..38c368a48 100755
--- a/python/1kgStatsForCalls.py
+++ b/python/1kgStatsForCalls.py
@@ -11,7 +11,7 @@ import string
 
 def average(l):
     sum = reduce(operator.add, l, 0)
-    return sum / len(l)
+    return sum / (1.0 * len(l))
 
 def printHeaderSep():
     print
@@ -104,12 +104,17 @@ def findVariantEvalResults(key, file, type=str):
 def getDBSNPRate(file):
     if file != None:
         key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"
-        return findVariantEvalResults(key, file, float)[0]
+        r = findVariantEvalResults(key, file, float)
+        if len(r) > 0:
+            return r[0]
+        else:
+            return -1
     else:
         return -1
 
 def useProject(project):
-    return OPTIONS.project == None or project == OPTIONS.project
+    #print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
+    return OPTIONS.project == None or re.match(OPTIONS.project, project) != None
 
 def countMappedBases(samples, alignmentIndex):
     if ( OPTIONS.coverageFile != None ): 
@@ -148,16 +153,21 @@ def readMappedBasesFromBAS(samples, alignmentIndex):
 
 def countSNPs(samples, snpsVCF, useIndels = False):
     total = 0
+    novel = 0
     
     header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))
     sampleIDs = columnNames[9:]
 
+    print 'Counting SNPs...'
     for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
         if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
             break
-        if vcf.passesFilters():
+            
+        if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
             if ( vcf.isVariant() ): 
                 total += 1
+
+                if ( vcf.isNovel() ): novel += 1
                 
                 if ( OPTIONS.verbose and total % 10000 == 0 ):
                     print '  progress', vcf.getChrom(), vcf.getPos()
@@ -176,7 +186,7 @@ def countSNPs(samples, snpsVCF, useIndels = False):
                                 samples[sampleID].nSNPs += 1
 
     printStatus(samples)
-    return total
+    return total, novel
 
 def countIndels(samples, indelsVCF):
     total = 0
@@ -184,7 +194,7 @@ def countIndels(samples, indelsVCF):
     if ( indelsVCF != None ):
         return countSNPs(samples, indelsVCF, True)
     
-    return total
+    return total, 0
 
 def readSamples(vcf):
     print 'Reading samples for', OPTIONS.population
@@ -229,11 +239,11 @@ if __name__ == "__main__":
     ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)    
     totalBases = countBases(samples, OPTIONS.sequenceIndex)
     meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
-    totalSNPs = countSNPs(samples, OPTIONS.snps)
-    totalIndels = countIndels(samples, OPTIONS.indels)
+    totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
+    totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
     
     snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
-    indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE)
+    indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
 
     out = sys.stdout
     if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
@@ -247,8 +257,8 @@ if __name__ == "__main__":
     
     print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)
     print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)
-    print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues()))
+    print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))
     print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)
-    print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues()))
+    print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))
     out.close()
     
diff --git a/python/farm_commands2.py b/python/farm_commands2.py
index 1e8488b54..239828f5b 100755
--- a/python/farm_commands2.py
+++ b/python/farm_commands2.py
@@ -133,7 +133,7 @@ def buildExecutionString(job, farm_queue = None, debug = True):
             #farm_stdout = None
             farm_stdout = "%J.lsf.output"
             
-        cmd_str = "bsub -q " + farm_queue 
+        cmd_str = "bsub -r -q " + farm_queue 
         if farm_stdout != None:
             cmd_str += " -o " + farm_stdout
         
diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py
index 560a78d03..23c333b3d 100755
--- a/python/madPipelineUtils.py
+++ b/python/madPipelineUtils.py
@@ -45,7 +45,7 @@ def appendExtension(path, newExt, addExtension = True):
 #    return os.path.join(OPTIONS.dir, s)
 
 class PipelineArgs:
-    def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
+    def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '3g', excludeChrs = [] ):
         self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
         self.ref = ref
         self.name = name
diff --git a/python/picard_utils.py b/python/picard_utils.py
index 2a44b31f5..441c06936 100755
--- a/python/picard_utils.py
+++ b/python/picard_utils.py
@@ -154,7 +154,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
     #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
     return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
 
-def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
+def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ):
     if useSamtools:
         return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
     else:
@@ -166,13 +166,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
         if MSD: MSDStr = 'MSD=true'
     
         return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
-        #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
 
-def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ):
+def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ):
     if type(inputFiles) <> list:
         inputFiles = list(inputFiles)
 
-    return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) )
+    return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) )
 
 def getPicardPath(lane, picardRoot = '/seq/picard/'):
     flowcell, laneNo = lane.split('.')
diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py
index bf443e069..31cdfa67e 100755
--- a/python/realignBamByChr.py
+++ b/python/realignBamByChr.py
@@ -113,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
 
 def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
     outputBAM = outputRoot + ".bam"
-    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr)
+    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
     return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
 
 def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
diff --git a/python/vcfReader.py b/python/vcfReader.py
index 14c4d84e6..74e7862f2 100755
--- a/python/vcfReader.py
+++ b/python/vcfReader.py
@@ -74,7 +74,7 @@ class VCFRecord:
     def getFilter(self): return self.get("FILTER")
     def failsFilters(self): return not self.passesFilters()
     def passesFilters(self):
-        v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES'
+        v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASS'
         #print self.getFilter(), ">>>", v, self
         return v
         
@@ -180,6 +180,7 @@ def readVCFHeader(lines):
             return header, columnNames, itertools.chain([line], lines)
             
     # we reach this point for empty files    
+    #print 'header is', header
     return header, columnNames, []
 
 def quickCountRecords(lines):