From 2bdb0118650b398a4654e4bf7ea8e33ced65e4c4 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 16 Jul 2010 15:33:47 +0000
Subject: [PATCH] Improvements to 1KG processing pipeline

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3807 348d0f76-0448-11de-a6fe-93d51630548a
---
 python/1kgStatsForCalls.py | 18 ++++++++++++------
 python/madPipelineUtils.py |  2 ++
 python/realignBamByChr.py  | 17 +++++++++++------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/python/1kgStatsForCalls.py b/python/1kgStatsForCalls.py
index 38c368a48..3a4e90a7f 100755
--- a/python/1kgStatsForCalls.py
+++ b/python/1kgStatsForCalls.py
@@ -11,7 +11,7 @@ import string
 
 def average(l):
     sum = reduce(operator.add, l, 0)
-    return sum / (1.0 * len(l))
+    return sum / (1.0 * (max(len(l), 1)))
 
 def printHeaderSep():
     print
@@ -87,8 +87,9 @@ def countBases(samples, seqIndex):
     return total
 
 def printStatus(samples):
-    for sample in samples.itervalues():
-        print sample
+    if OPTIONS.verbose:
+        for sample in samples.itervalues():
+            print sample
 
 def findVariantEvalResults(key, file, type=str):
     def capture1(line):
@@ -159,11 +160,14 @@ def countSNPs(samples, snpsVCF, useIndels = False):
     sampleIDs = columnNames[9:]
 
     print 'Counting SNPs...'
+    #lines = 0
     for header, vcf, counter in vcfReader.lines2VCF(remainingLines, extendedOutput = True, decodeAll = False):
+        #lines += 1
+        #print 'lines', lines
         if ( counter > OPTIONS.maxRecords and OPTIONS.maxRecords != -1 ):
             break
             
-        if vcf.passesFilters() and vcf.getChrom not in ['MT', 'Y']:
+        if vcf.passesFilters() and vcf.getChrom() not in ['MT', 'Y']:
             if ( vcf.isVariant() ): 
                 total += 1
 
@@ -184,6 +188,8 @@ def countSNPs(samples, snpsVCF, useIndels = False):
                                 samples[sampleID].nIndels += 1
                             else:
                                 samples[sampleID].nSNPs += 1
+        else:
+            print 'rejecting line', vcf
 
     printStatus(samples)
     return total, novel
@@ -238,12 +244,12 @@ if __name__ == "__main__":
 
     ignore, totalMappedBases = countMappedBases(samples, OPTIONS.alignmentIndex)    
     totalBases = countBases(samples, OPTIONS.sequenceIndex)
-    meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / nSamples
+    meanMappedDepth = totalMappedBases / OPTIONS.totalGenome / (max(nSamples, 1))
     totalSNPs, novelSNPs = countSNPs(samples, OPTIONS.snps)
     totalIndels, novelIndels = countIndels(samples, OPTIONS.indels)
     
     snpNoveltyRate = 100 - getDBSNPRate(OPTIONS.snpsVE)
-    indelNoveltyRate = novelIndels / (1.0*totalIndels) # 100 - getDBSNPRate(OPTIONS.indelsVE)
+    indelNoveltyRate = novelIndels / (1.0*max(totalIndels,1)) # 100 - getDBSNPRate(OPTIONS.indelsVE)
 
     out = sys.stdout
     if ( OPTIONS.output != None ): out = open(OPTIONS.output, 'w')
diff --git a/python/madPipelineUtils.py b/python/madPipelineUtils.py
index 23c333b3d..3db856bd2 100755
--- a/python/madPipelineUtils.py
+++ b/python/madPipelineUtils.py
@@ -73,6 +73,7 @@ class PipelineArgs:
         return cmd
         
     def chrsToSplitBy(self, chrs):
+        #print 'XXX', self.excludeChrs
         return filter(lambda x: x not in self.excludeChrs, chrs)
         
 # 
@@ -101,6 +102,7 @@ def splitGATKCommandByChr( myPipelineArgs, cmd, outputsToParallelize, mergeComma
         chrCmd = FarmJob(chr_cmd_str, jobName = cmd.jobName + '.byChr' + chr, dependencies = cmd.dependencies)
         return chrCmd, chrOutputMap
 
+    #print '######################################### chrsToSplitBy', myPipelineArgs.chrsToSplitBy(hg18)
     splits = map( makeChrCmd, myPipelineArgs.chrsToSplitBy(hg18) )
     splitCommands = map(lambda x: x[0], splits)
 
diff --git a/python/realignBamByChr.py b/python/realignBamByChr.py
index 31cdfa67e..42ae65f48 100755
--- a/python/realignBamByChr.py
+++ b/python/realignBamByChr.py
@@ -68,7 +68,7 @@ def main():
     def includeStage(name):
         return name in stages
 
-    out = open(outputBamList, 'w')
+    #out = open(outputBamList, 'w')
     realignInfo = []
     for chr in getContigs(OPTIONS.contigs, hg18):
         lastJobs = None
@@ -92,16 +92,19 @@ def main():
         realignInfo.append([realignJobs, realignedBam])
         # need to merge and then index
         indexJobs, ignore = execStage('index', index, realignedBam, realignJobs)
-        print >> out, os.path.abspath(realignedBam)
+        #print >> out, os.path.abspath(realignedBam)
      
-    out.close() 
+    #out.close() 
 
     if 'merge' in stages:
         realignerJobs = []
         if realignInfo[0][0] != []:
             realignerJobs = map(lambda x: x[0][0], realignInfo)
-        mergerJob = mergeBams(myPipelineArgs, outputRoot + ".bam", map(lambda x: x[1], realignInfo), realignerJobs)
+        mergedBam = outputRoot + ".bam"
+        mergerJob = mergeBams(myPipelineArgs, mergedBam, map(lambda x: x[1], realignInfo), realignerJobs)
+        indexJob, ignore = index(myPipelineArgs, 'ignore', 'ignore', 'ignore', mergedBam, mergerJob)
         allJobs.append(mergerJob)
+        allJobs.append(indexJob)
 
     print 'EXECUTING JOBS'
     executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) 
@@ -113,7 +116,7 @@ def createTargets( myPipelineArgs, chr, inputBam, outputRoot, args, lastJobs ):
 
 def realign( myPipelineArgs, chr, inputBam, outputRoot, intervals, lastJobs ):
     outputBAM = outputRoot + ".bam"
-    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", chr)
+    GATKArgs = '-T IndelRealigner -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s -targetIntervals %s --output %s -stats %s -snps %s -L %s' % (inputBam, intervals, outputBAM, outputBAM + ".stats", outputBAM + ".snps", chr)
     return simpleGATKCommand( myPipelineArgs, 'Realign' + chr, GATKArgs, lastJobs ), outputBAM
 
 def index( myPipelineArgs, chr, inputBam, outputRoot, realignedBam, lastJobs ):
@@ -123,7 +126,9 @@ def mergeBams( myPipelineArgs, outputFilename, bamsToMerge, lastJobs ):
     print lastJobs
     #cmd = picard_utils.mergeBAMCmd( outputFilename, bamsToMerge, compression_level = 5 )
     cmd = picard_utils.mergeFixingMatesBAMCmd(outputFilename, bamsToMerge, compression_level = 5)
-    return FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
+    jobs1 = FarmJob(cmd, jobName = 'merge.' + myPipelineArgs.name, dependencies = lastJobs)
+    #jobs2 = indexBAMFile( myPipelineArgs.name, outputFilename, jobs1 )
+    return jobs1
 
 if __name__ == "__main__":
     main()