Misc. changes to Python scripts

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3751 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-07-09 00:13:35 +00:00
parent 6ffcaa0afe
commit dd978dd525
6 changed files with 29 additions and 19 deletions

View File

@ -11,7 +11,7 @@ import string
def average(l):
sum = reduce(operator.add, l, 0)
return sum / len(l)
return sum / (1.0 * len(l))
def printHeaderSep():
print
@ -104,12 +104,17 @@ def findVariantEvalResults(key, file, type=str):
def getDBSNPRate(file):
if file != None:
key = "[evaluation_name=eval].[comparison_name=dbsnp].[jexl_expression=none].[filter_name=called].[novelty_name=all].[analysis=Comp Overlap].[data_point=% evals at comp]"
return findVariantEvalResults(key, file, float)[0]
r = findVariantEvalResults(key, file, float)
if len(r) > 0:
return r[0]
else:
return -1
else:
return -1
def useProject(project):
return OPTIONS.project == None or project == OPTIONS.project
#print 'match', project, OPTIONS.project, re.match(OPTIONS.project, project)
return OPTIONS.project == None or re.match(OPTIONS.project, project) != None
def countMappedBases(samples, alignmentIndex):
if ( OPTIONS.coverageFile != None ):
@ -148,16 +153,21 @@ def readMappedBasesFromBAS(samples, alignmentIndex):
def countSNPs(samples, snpsVCF, useIndels = False):
total = 0
novel = 0
header, columnNames, remainingLines = vcfReader.readVCFHeader(open(snpsVCF))
sampleIDs = columnNames[9:]
print 'Counting SNPs...'
for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
break
if vcf.passesFilters():
if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
if ( vcf.isVariant() ):
total += 1
if ( vcf.isNovel() ): novel += 1
if ( OPTIONS.verbose and total % 10000 == 0 ):
print ' progress', vcf.getChrom(), vcf.getPos()
@ -176,7 +186,7 @@ def countSNPs(samples, snpsVCF, useIndels = False):
samples[sampleID].nSNPs += 1
printStatus(samples)
return total
return total, novel
def countIndels(samples, indelsVCF):
total = 0
@ -184,7 +194,7 @@ def countIndels(samples, indelsVCF):
if ( indelsVCF != None ):
return countSNPs(samples, indelsVCF, True)
return total
return total, 0
def readSamples(vcf):
print 'Reading samples for', OPTIONS.population
@ -229,11 +239,11 @@ if __name__ == "__main__":
ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)
totalBases = countBases(samples, OPTIONS.sequenceIndex)
meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
totalSNPs = countSNPs(samples, OPTIONS.snps)
totalIndels = countIndels(samples, OPTIONS.indels)
totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
indelNoveltyRate = 100 - getDBSNPRate(OPTIONS.indelsVE)
indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
out = sys.stdout
if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
@ -247,8 +257,8 @@ if __name__ == "__main__":
print >> out, 'bases called (fraction ref genome) %f (%.2f%%)' % (OPTIONS.calledGenome, 100.0 * OPTIONS.calledGenome / OPTIONS.totalGenome)
print >> out, 'number of SNP sites (%% novel) %d (%.2f%%)' % (totalSNPs, snpNoveltyRate)
print >> out, 'average # SNP sites per individual', average(map(Sample.getNSNPs, samples.itervalues()))
print >> out, 'average # SNP sites per individual %.0f' % average(map(Sample.getNSNPs, samples.itervalues()))
print >> out, 'number of indel sites (%% novel) %d (%.2f%%)' % (totalIndels, indelNoveltyRate)
print >> out, 'average # indel sites per individual', average(map(Sample.getNIndels, samples.itervalues()))
print >> out, 'average # indel sites per individual %.0f' % average(map(Sample.getNIndels, samples.itervalues()))
out.close()

View File

@ -133,7 +133,7 @@ def buildExecutionString(job, farm_queue = None, debug = True):
#farm_stdout = None
farm_stdout = "%J.lsf.output"
cmd_str = "bsub -q " + farm_queue
cmd_str = "bsub -r -q " + farm_queue
if farm_stdout != None:
cmd_str += " -o " + farm_stdout

View File

@ -45,7 +45,7 @@ def appendExtension(path, newExt, addExtension = True):
# return os.path.join(OPTIONS.dir, s)
class PipelineArgs:
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '4g', excludeChrs = [] ):
def __init__( self, GATK_JAR = GATK_JAR, ref = 'hg18', name = None, memory = '3g', excludeChrs = [] ):
self.GATK = 'java -Xmx%s -Djava.io.tmpdir=/broad/shptmp/depristo/tmp/ -jar ' + GATK_JAR + ' -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -l INFO '
self.ref = ref
self.name = name

View File

@ -154,7 +154,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx2g', compression_level = 1 ):
if useSamtools:
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
else:
@ -166,13 +166,12 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
if MSD: MSDStr = 'MSD=true'
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx4096m', compression_level = 1, tmpdir = '/broad/shptmp' ):
def mergeFixingMatesBAMCmd( output_filename, inputFiles, memLimit = '-Xmx2g', compression_level = 1, tmpdir = '/broad/shptmp' ):
if type(inputFiles) <> list:
inputFiles = list(inputFiles)
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, 'I='.join(inputFiles) )
return 'java %s -Djava.io.tmpdir=%s -jar %s COMPRESSION_LEVEL=%d SO=coordinate O=%s VALIDATION_STRINGENCY=SILENT I=%s' % ( memLimit, tmpdir, FIXMATES_BIN, compression_level, output_filename, ' I='.join(inputFiles) )
def getPicardPath(lane, picardRoot = '/seq/picard/'):
flowcell, laneNo = lane.split('.')

View File

@ -113,7 +113,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
outputBAM = outputRoot + ".bam"
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -L %s' % (inputBam, intervals, outputBAM, chr)
GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):

View File

@ -74,7 +74,7 @@ class VCFRecord:
def getFilter(self): return self.get("FILTER")
def failsFilters(self): return not self.passesFilters()
def passesFilters(self):
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASSES'
v = self.getFilter() == "." or self.getFilter() == "0" or self.getFilter() == 'PASS'
#print self.getFilter(), ">>>", v, self
return v
@ -180,6 +180,7 @@ def readVCFHeader(lines):
return header, columnNames, itertools.chain([line], lines)
# we reach this point for empty files
#print 'header is', header
return header, columnNames, []
def quickCountRecords(lines):