more and better python scripts for dealing with calls
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@881 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a1218ef508
commit
3998085e4b
|
|
@ -10,7 +10,22 @@ import itertools
|
|||
|
||||
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
||||
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
analysis = "CombineDuplicates"
|
||||
|
||||
def geli2dbsnpFile(geli):
|
||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
|
||||
|
||||
def bams2geli(bams):
|
||||
def call1(bam):
|
||||
geli = os.path.splitext(bam)[0] + '.geli'
|
||||
jobid = 0
|
||||
if not os.path.exists(geli):
|
||||
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue )
|
||||
return geli, jobid
|
||||
calls = map(call1, bams)
|
||||
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
||||
|
||||
def main():
|
||||
global OPTIONS, ROOT
|
||||
|
|
@ -25,7 +40,7 @@ def main():
|
|||
help="minimum lod for calling a variant")
|
||||
parser.add_option("-k", "--column", dest="column",
|
||||
type="int", default=1,
|
||||
help="Column in the file with the geli file path")
|
||||
help="Column in the file with the bam or geli file path")
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
type="string", default='/dev/stdout',
|
||||
help="x")
|
||||
|
|
@ -35,17 +50,36 @@ def main():
|
|||
parser.error("incorrect number of arguments")
|
||||
lines = [line.split() for line in open(args[0])]
|
||||
nIndividuals = int(args[1])
|
||||
gelis = map( lambda x: x[OPTIONS.column-1], lines )
|
||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||
|
||||
|
||||
data = map( lambda x: x[OPTIONS.column-1], lines )
|
||||
if os.path.splitext(data[0])[1] == '.bam':
|
||||
gelis, jobids = bams2geli(data)
|
||||
if filter(lambda x: x <> 0, jobids) <> []:
|
||||
# there's still work to do
|
||||
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
||||
print 'gelis', gelis
|
||||
print 'jobids', jobids
|
||||
else:
|
||||
gelis = map( lambda x: x[OPTIONS.column-1], lines )
|
||||
jobids = [None] * len(gelis)
|
||||
|
||||
print 'Geli files'
|
||||
print gelis
|
||||
print variantsOut
|
||||
|
||||
for geli, jobid in zip(gelis, jobids):
|
||||
dbsnpFile = geli2dbsnpFile(geli)
|
||||
if not os.path.exists(dbsnpFile):
|
||||
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, waitID = jobid)
|
||||
|
||||
# read in the dbSNP tracks
|
||||
nTotalSnps = 0
|
||||
nNovelSnps = 0
|
||||
for geli in gelis:
|
||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||
dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
#dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
dbsnp_matches = geli2dbsnpFile(geli)
|
||||
print dbsnp_matches
|
||||
if os.path.exists(dbsnp_matches):
|
||||
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
||||
nTotalSnps += int(TOTAL_SNPS)
|
||||
|
|
@ -55,7 +89,9 @@ def main():
|
|||
print 'DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
|
||||
print 'DATA: AVERAGE DBSNP RATE ACROSS LANES ', float(nTotalSnps - nNovelSnps) / nTotalSnps
|
||||
|
||||
# convert the geli's to text
|
||||
jobid = None
|
||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||
for geli, variantOut in zip(gelis, variantsOut):
|
||||
if not os.path.exists(variantOut):
|
||||
cmd = ("GeliToText.jar I=%s | awk '$7 > %f' > %s" % ( geli, OPTIONS.lod, variantOut) )
|
||||
|
|
|
|||
|
|
@ -90,11 +90,11 @@ class MergeFilesSpec:
|
|||
sizes = map(greek, sizes)
|
||||
return sizes
|
||||
|
||||
def mergeCmd(self, mergeBin = None):
|
||||
def mergeCmd(self, mergeBin = None, MSD = False):
|
||||
if mergeBin == None:
|
||||
mergeBin = MERGE_BIN
|
||||
|
||||
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin)
|
||||
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD)
|
||||
|
||||
def getIndexCmd(self):
|
||||
return "samtools index " + self.getMergedBAM()
|
||||
|
|
|
|||
|
|
@ -34,6 +34,9 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
|
|||
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
||||
type="string", default=None,
|
||||
help="Path to merge binary")
|
||||
parser.add_option("", "--MSD", dest="MSD",
|
||||
action='store_true', default=False,
|
||||
help="Merge sequence dictionaries?")
|
||||
parser.add_option("", "--keyCol", dest="keyCol",
|
||||
type=int, default=1,
|
||||
help="Column in the list file holding the key")
|
||||
|
|
@ -69,7 +72,7 @@ by key and spawn merge and index jobs to merge all of the files sharing the same
|
|||
if len(spec.sources()) == 1 and OPTIONS.link:
|
||||
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
|
||||
else:
|
||||
cmd = spec.mergeCmd(OPTIONS.mergeBin)
|
||||
cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
|
||||
print cmd
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
|
||||
|
||||
|
|
|
|||
|
|
@ -46,5 +46,5 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
|||
status = os.system(cmd_str)
|
||||
if not farm_queue:
|
||||
print "<<< Exit code:", status,"\n"
|
||||
return status
|
||||
return int(status)
|
||||
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ analysis = "CombineDuplicates"
|
|||
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
|
||||
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
|
||||
|
||||
def CollectDbSnpMatchesCmd(inputGeli, outputFile, lod):
|
||||
return 'CollectDbSnpMatches.jar INPUT=%s OUTPUT=%s DBSNP=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dbsnp MINIMUM_LOD=%f' % (inputGeli, outputFile, lod)
|
||||
|
||||
def unique(l):
|
||||
return list(set(l))
|
||||
|
||||
|
|
@ -140,10 +143,14 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
|||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True ):
|
||||
if type(inputFiles) <> list:
|
||||
inputFiles = list(inputFiles)
|
||||
return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
MSDStr = ''
|
||||
if MSD: MSDStr = 'MSD=true'
|
||||
|
||||
return 'java -Xmx4096m -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
|
|
@ -165,8 +172,11 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile):
|
|||
return match.group(1)
|
||||
return None
|
||||
|
||||
def hybridSelectionExtraArgsForCalling():
|
||||
return "TARGET_INTERVALS=/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list CALL_ZERO_COVERAGE_LOCI=true"
|
||||
|
||||
def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
|
||||
return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)
|
||||
return "java -jar %s INPUT=%s OUTPUT=%s REFERENCE_SEQUENCE=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, ref, options)
|
||||
|
||||
def concord(options, geli, output, genotypeFile):
|
||||
return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )
|
||||
|
|
|
|||
Loading…
Reference in New Issue