more and better python scripts for dealing with calls

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@881 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-02 20:37:19 +00:00
parent a1218ef508
commit 3998085e4b
5 changed files with 63 additions and 14 deletions

View File

@ -10,7 +10,22 @@ import itertools
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
analysis = "CombineDuplicates"
def geli2dbsnpFile(geli):
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
def bams2geli(bams):
def call1(bam):
geli = os.path.splitext(bam)[0] + '.geli'
jobid = 0
if not os.path.exists(geli):
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue )
return geli, jobid
calls = map(call1, bams)
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
def main():
global OPTIONS, ROOT
@ -25,7 +40,7 @@ def main():
help="minimum lod for calling a variant")
parser.add_option("-k", "--column", dest="column",
type="int", default=1,
help="Column in the file with the geli file path")
help="Column in the file with the bam or geli file path")
parser.add_option("-o", "--output", dest="output",
type="string", default='/dev/stdout',
help="x")
@ -35,17 +50,36 @@ def main():
parser.error("incorrect number of arguments")
lines = [line.split() for line in open(args[0])]
nIndividuals = int(args[1])
gelis = map( lambda x: x[OPTIONS.column-1], lines )
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
data = map( lambda x: x[OPTIONS.column-1], lines )
if os.path.splitext(data[0])[1] == '.bam':
gelis, jobids = bams2geli(data)
if filter(lambda x: x <> 0, jobids) <> []:
# there's still work to do
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
print 'gelis', gelis
print 'jobids', jobids
else:
gelis = map( lambda x: x[OPTIONS.column-1], lines )
jobids = [None] * len(gelis)
print 'Geli files'
print gelis
print variantsOut
for geli, jobid in zip(gelis, jobids):
dbsnpFile = geli2dbsnpFile(geli)
if not os.path.exists(dbsnpFile):
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, waitID = jobid)
# read in the dbSNP tracks
nTotalSnps = 0
nNovelSnps = 0
for geli in gelis:
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
#dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
dbsnp_matches = geli2dbsnpFile(geli)
print dbsnp_matches
if os.path.exists(dbsnp_matches):
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
nTotalSnps += int(TOTAL_SNPS)
@ -55,7 +89,9 @@ def main():
print 'DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
print 'DATA: AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps
# convert the geli's to text
jobid = None
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
for geli, variantOut in zip(gelis, variantsOut):
if not os.path.exists(variantOut):
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )

View File

@ -90,11 +90,11 @@ class MergeFilesSpec:
sizes = map(greek, sizes)
return sizes
def mergeCmd(self, mergeBin = None):
def mergeCmd(self, mergeBin = None, MSD = False):
if mergeBin == None:
mergeBin = MERGE_BIN
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin)
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD)
def getIndexCmd(self):
return "samtools index " + self.getMergedBAM()

View File

@ -34,6 +34,9 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
parser.add_option("-m", "--mergeBin", dest="mergeBin",
type="string", default=None,
help="Path to merge binary")
parser.add_option("", "--MSD", dest="MSD",
action='store_true', default=False,
help="Merge sequence dictionaries?")
parser.add_option("", "--keyCol", dest="keyCol",
type=int, default=1,
help="Column in the list file holding the key")
@ -69,7 +72,7 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
if len(spec.sources()) == 1 and OPTIONS.link:
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
else:
cmd = spec.mergeCmd(OPTIONS.mergeBin)
cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
print cmd
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)

View File

@ -46,5 +46,5 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
status = os.system(cmd_str)
if not farm_queue:
print "<<< Exit code:", status,"\n"
return status
return int(status)

View File

@ -21,6 +21,9 @@ analysis = "CombineDuplicates"
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod):
return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp MINIMUM_LOD=%f' % (inputGeli, outputFile, lod)
def unique(l):
return list(set(l))
@ -140,10 +143,14 @@ def aggregateGeliCalls( sortedGeliCalls ):
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True ):
if type(inputFiles) <> list:
inputFiles = list(inputFiles)
return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
MSDStr = ''
if MSD: MSDStr = 'MSD=true'
return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
def getPicardPath(lane, picardRoot = '/seq/picard/'):
@ -165,8 +172,11 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile):
return match.group(1)
return None
def hybridSelectionExtraArgsForCalling():
return "TARGET_INTERVALS=/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list CALL_ZERO_COVERAGE_LOCI=true"
def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)
return "java -jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, ref, options)
def concord(options, geli, output, genotypeFile):
return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )