more and better python scripts for dealing with calls

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@881 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-02 20:37:19 +00:00 · 2009-06-02 20:37:19 +00:00 · 3998085e4b
parent a1218ef508
commit 3998085e4b
5 changed files with 63 additions and 14 deletions
--- a/python/Gelis2PopSNPs.py
+++ b/python/Gelis2PopSNPs.py
@ -10,7 +10,22 @@ import itertools

 gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
 ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
-analysis = "CombineDuplicates"
+   
+def geli2dbsnpFile(geli):
+    root, flowcellDotlane, ext = picard_utils.splitPath(geli)
+    return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
+
+   
+def bams2geli(bams):
+    def call1(bam):
+        geli = os.path.splitext(bam)[0] + '.geli'
+        jobid = 0
+        if not os.path.exists(geli):
+            cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
+            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue )
+        return geli, jobid
+    calls = map(call1, bams)
+    return map(lambda x: x[0], calls), map(lambda x: x[1], calls)        
   
 def main():    
    global OPTIONS, ROOT
@ -25,7 +40,7 @@ def main():
                        help="minimum lod for calling a variant")
    parser.add_option("-k", "--column", dest="column",
                        type="int", default=1,
-                        help="Column in the file with the geli file path")
+                        help="Column in the file with the bam or geli file path")
    parser.add_option("-o", "--output", dest="output",
                        type="string", default='/dev/stdout',
                        help="x")
@ -35,17 +50,36 @@ def main():
        parser.error("incorrect number of arguments")
    lines = [line.split() for line in open(args[0])]
    nIndividuals = int(args[1])
-    gelis = map( lambda x: x[OPTIONS.column-1], lines )
-    variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
-
+    
+    data = map( lambda x: x[OPTIONS.column-1], lines )
+    if os.path.splitext(data[0])[1] == '.bam':
+        gelis, jobids = bams2geli(data)
+        if filter(lambda x: x <> 0, jobids) <> []:
+            # there's still work to do
+            sys.exit('Stopping.  Please rerun this program when the farm jobs are complete: ' + str(jobids))
+        print 'gelis', gelis
+        print 'jobids', jobids
+    else:
+        gelis = map( lambda x: x[OPTIONS.column-1], lines )
+        jobids = [None] * len(gelis)
+    
+    print 'Geli files'
    print gelis
-    print variantsOut

+    for geli, jobid in zip(gelis, jobids):
+        dbsnpFile = geli2dbsnpFile(geli)
+        if not os.path.exists(dbsnpFile):
+            dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
+            farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, waitID = jobid)
+
+    # read in the dbSNP tracks
    nTotalSnps = 0
    nNovelSnps = 0
    for geli in gelis:
        root, flowcellDotlane, ext = picard_utils.splitPath(geli)
-        dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
+        #dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
+        dbsnp_matches = geli2dbsnpFile(geli)
+        print dbsnp_matches
        if os.path.exists(dbsnp_matches):
            TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
            nTotalSnps += int(TOTAL_SNPS)
@ -55,7 +89,9 @@ def main():
    print 'DATA:    NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
    print 'DATA:    AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps

+    # convert the geli's to text
    jobid = None
+    variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
    for geli, variantOut in zip(gelis, variantsOut):
        if not os.path.exists(variantOut):
            cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
--- a/python/MergeBAMsUtils.py
+++ b/python/MergeBAMsUtils.py
@ -90,11 +90,11 @@ class MergeFilesSpec:
            sizes = map(greek, sizes)
        return sizes
                
-    def mergeCmd(self, mergeBin = None):
+    def mergeCmd(self, mergeBin = None, MSD = False):
        if mergeBin == None:
            mergeBin = MERGE_BIN
            
-        return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin)
+        return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD)
        
    def getIndexCmd(self):
        return "samtools index " + self.getMergedBAM()
--- a/python/MergeBamsByKey.py
+++ b/python/MergeBamsByKey.py
@ -34,6 +34,9 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
    parser.add_option("-m", "--mergeBin", dest="mergeBin",
                        type="string", default=None,
                        help="Path to merge binary")
+    parser.add_option("", "--MSD", dest="MSD",
+                        action='store_true', default=False,
+                        help="Merge sequence dictionaries?")
    parser.add_option("", "--keyCol", dest="keyCol",
                        type=int, default=1,
                        help="Column in the list file holding the key")
@ -69,7 +72,7 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
            if len(spec.sources()) == 1 and OPTIONS.link:
                cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
            else:
-                cmd = spec.mergeCmd(OPTIONS.mergeBin)
+                cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
            print cmd
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)

--- a/python/farm_commands.py
+++ b/python/farm_commands.py
@ -46,5 +46,5 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
        status = os.system(cmd_str)
        if not farm_queue:
            print "<<< Exit code:", status,"\n"
-        return status
+        return int(status)
        
--- a/python/picard_utils.py
+++ b/python/picard_utils.py
@ -21,6 +21,9 @@ analysis = "CombineDuplicates"
 MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
 CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'

+def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod): 
+    return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp MINIMUM_LOD=%f' % (inputGeli, outputFile, lod)
+
 def unique(l):
    return list(set(l))

@ -140,10 +143,14 @@ def aggregateGeliCalls( sortedGeliCalls ):
    #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
    return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]

-def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
+def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True ):
    if type(inputFiles) <> list:
        inputFiles = list(inputFiles)
-    return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
+
+    MSDStr = ''
+    if MSD: MSDStr = 'MSD=true'
+
+    return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
    #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))

 def getPicardPath(lane, picardRoot = '/seq/picard/'):
@ -165,8 +172,11 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile):
            return match.group(1)
    return None

+def hybridSelectionExtraArgsForCalling():
+    return "TARGET_INTERVALS=/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list CALL_ZERO_COVERAGE_LOCI=true"
+
 def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
-    return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)
+    return "java -jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, ref, options)

 def concord(options, geli, output, genotypeFile):
    return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )