Deleting old python code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4961 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-01-07 21:33:45 +00:00
parent 3362f0c280
commit e64a300642
44 changed files with 0 additions and 6901 deletions

View File

@ -1,3 +0,0 @@
# Invocations of MergeBAMBatch.py used for the 1KG freeze-5 merges.
# -d: output directory, -q: LSF queue ('gsa'); presumably -n maps sample
# IDs to populations and -s is a sample list -- confirm against MergeBAMBatch.py.
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation/ -q gsa /broad/1KG/DCC_merged/lists/low_coverage_freeze5.list -n /broad/1KG/DCC_merged/lists/naids_and_pop.txt
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s /broad/1KG/DCC_merged/lists/trios_freeze5.list

View File

@ -1,47 +0,0 @@
#!/usr/bin/env python
import os,sys
from farm_commands import cmd
def run_paired_BWA(bam):
    """Align a paired-end BAM: BamToBfq -> maq bfq2fastq -> bwa aln/sampe,
    then samtools-import the SAM to '<root>.aligned.bam'."""
    lane = 0
    base = os.path.splitext(bam)[0]
    # BAM -> one BFQ file per read end.
    cmd("/seq/software/picard/1.21/bin/BamToBfq.jar I=%s LANE=%d FLOWCELL_BARCODE=%s PAIRED_RUN=True RUN_BARCODE=%s ANALYSIS_DIR=." % (bam, lane, base, base))
    # BamToBfq names its outputs '<base>.<lane>.0.<end>'.
    stems = ["%s.%d.0.%s" % (base, lane, end) for end in ("1", "2")]
    # BFQ -> FASTQ for each end.
    for stem in stems:
        cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq %s.bfq %s.fq" % (stem, stem))
    # Independent bwa aln pass per end.
    for stem in stems:
        cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta %s.fq >%s.sai" % (stem, stem))
    # Pair the two ends into a single SAM.
    alignedsam = base + ".aligned.sam"
    cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta %s.sai %s.sai %s.fq %s.fq > %s" % (stems[0], stems[1], stems[0], stems[1], alignedsam))
    # SAM -> BAM using the assembly18 reference list.
    bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
    cmd("/seq/dirseq/samtools/current/samtools import %s %s %s.aligned.bam" % (bam_ref_list, alignedsam, base))
if __name__ == "__main__":
    # Require exactly one positional argument: the input BAM path.
    if len(sys.argv) < 2:
        print "Usage: AlignBam.py BAM_FILE"
        sys.exit()
    bam = sys.argv[1]
    run_paired_BWA(bam)

View File

@ -1,41 +0,0 @@
import os,sys
from farm_commands import cmd
def run_BWA(bam, paired=True):
"Takes a paired end BAM file and outputs an aligned '.aligned.bam' file "
lane = 0
root = os.path.splitext(bam)[0]
print root
# BAM to BFQ
cmd("/seq/software/picard/current/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.")
# BFQ to FQ
root1 = root+"."+str(lane)+".0.1"
root2 = root+"."+str(lane)+".0.2"
bfq1 = root1+".bfq"
bfq2 = root2+".bfq"
fq1 = root1+".fq"
fq2 = root2+".fq"
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1)
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2)
# Run BWA
sai1 = root1+".sai"
sai2 = root2+".sai"
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1)
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2)
# Pairing
alignedsam = root+".aligned.sam"
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam)
# SAM to BAM
bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
alignedbam = root+".aligned.bam"
cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam)
if __name__ == "__main__":
    # NOTE(review): the BAM paths are collected but nothing in view invokes
    # run_BWA on them -- the diff rendering may have dropped the driver loop;
    # confirm against the original file before reuse.
    bamfiles = sys.argv[1:]

View File

@ -1,64 +0,0 @@
#!/usr/bin/python
import os,sys
from farm_commands import cmd
def get_line_number_from_file(filename, line_num):
    """Return the 0-based `line_num`-th line of `filename` (newline included),
    or None when the file has fewer lines.

    Fixed: the original leaked the file handle; a `with` block closes it.
    """
    with open(filename) as fh:
        for index, line in enumerate(fh):
            if index == line_num:
                return line
    return None
if __name__ == "__main__":
    from optparse import OptionParser
    class OptionParser (OptionParser):
        def check_required(self, opts):
            """Exit with an error if any option in `opts` is still None."""
            for opt in opts:
                option = self.get_option(opt)
                # Assumes the option's 'default' is set to None!
                if getattr(self.values, option.dest) is None:
                    self.error("%s option not supplied" % option)
    parser = OptionParser()
    parser.add_option("-q", "--queue", help="Queue to submit jobs to (default: long)", dest="queue", default="long")
    # BUG FIX: -l had default="" which defeated check_required's is-None test,
    # so a missing lane slipped through; None makes the check effective.
    parser.add_option("-l", "--lane", help="Lane number to process", dest="lane", default=None)
    parser.add_option("-t", "--tmpdir", help="Temp directory to use", dest="tmpdir")
    parser.add_option("-b", "--barcodes", help="File with barcodes as \"Id[TABa]barcode_sequence\" with one header line", dest="barcodes")
    # BUG FIX: -a and -r both stored into dest="ti", clobbering each other,
    # while the CalculateHsMetrics command below reads options.bait_list and
    # options.target_list.  Give each option its real destination.
    parser.add_option("-a", "--baits", help="Bait interval list (default: 1KG bait list)", dest="bait_list", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.baits.interval_list")
    parser.add_option("-r", "--targets", help="Target interval list (default: 1KG target list)", dest="target_list", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list")
    (options, args) = parser.parse_args()
    parser.check_required(("-l","-t","-b"))
    solexa_dir = "."
    # Read the barcode sequences (column 2), skipping the single header line.
    barcodes = []
    barcode_file = open(options.barcodes)
    barcode_file.readline()
    for line in barcode_file:
        fields = line.split()
        barcodes.append(fields[1])
    farm = options.queue
    lane = options.lane
    alias = "lane"+str(lane)
    tmp_dir = options.tmpdir
    # Demultiplex, then convert the whole lane to one BAM.
    cmd("/seq/software/picard/current/bin/ExtractIlluminaBarcodes.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 METRICS_FILE=metrics.out BARCODE=" + " BARCODE=".join(barcodes))
    cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" O="+alias+".bam SAMPLE_ALIAS="+alias+" RUN_BARCODE="+alias+" TMP_DIR="+tmp_dir)
    # Per-barcode pipeline: BustardToSam -> align -> merge -> mark dups -> HS metrics.
    # Each step is chained on the farm via waitID so it runs after its input exists.
    for barcode in barcodes:
        # NOTE(review): SAMPLE_ALIAS is hard-coded to "lane5" here -- looks
        # like a leftover; confirm intended alias.
        id1 = cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 BARCODE="+barcode+" O="+barcode+".bam RUN_BARCODE="+barcode+" SAMPLE_ALIAS=lane5 TMP_DIR="+tmp_dir, farm_queue=farm) #PAIRED_RUN=True
        id2 = cmd( "/home/radon01/andrewk/dev/Sting/trunk/python/AlignBam.py "+barcode+".bam", farm_queue=farm, waitID = id1)
        id3 = cmd("/seq/software/picard/1.21/bin/MergeSamFiles.jar I="+barcode+".aligned.bam O="+barcode+".aligned.merged.bam VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id2)
        id4 = cmd("java -Xmx4096m -jar /seq/software/picard/1.21/bin/MarkDuplicates.jar VALIDATION_STRINGENCY=SILENT I= "+barcode+".aligned.merged.bam O="+barcode+".aligned.duplicates_marked.bam M="+barcode+".aligned.bam.duplicates_marked.bam.duplicate_metrics TMP_DIR="+tmp_dir, farm_queue=farm, waitID=id3)
        cmd("java -jar /seq/software/picard/current/bin/CalculateHsMetrics.jar BI="+options.bait_list+" TI="+options.target_list+" I="+barcode+".aligned.duplicates_marked.bam M="+barcode+".hs_metrics VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id4)
    # Collate the per-barcode hs_metrics into one table: line 6 of the first
    # file is the header, line 7 of each file is its values row.
    fout = open("hybrid_selection.metrics", "w")
    print >> fout, "BARCODE\t"+get_line_number_from_file(barcodes[0]+".hs_metrics", 6).rstrip()
    for barcode in barcodes:
        print >> fout, barcode+"\t"+get_line_number_from_file(barcode+".hs_metrics", 7).rstrip()
    fout.close()

View File

@ -1,357 +0,0 @@
#!/usr/bin/env python
import sys, itertools, FlatFileTable
from enum import Enum
def subset_list_by_indices(indices, list):
    """Return the elements of `list` at each position in `indices`, in order.

    (Note: the parameter is named `list`, shadowing the builtin; the name is
    kept for call compatibility.)
    """
    return [list[index] for index in indices]
def chunk_generator(record_gen, key_fields):
    """Input:
    record_gen: generator that produces dictionaries (records in database speak)
    key_fields: keys in each dictionary used to determine chunk membership
    Output:
    locus_chunk: list of consecutive lines that have the same key_fields

    Fixed: the original unconditionally yielded the trailing chunk, so an
    empty input produced one spurious empty list; the final yield is now
    guarded the same way as the in-loop yield.
    """
    locus_chunk = []
    last_key = ""
    first_record = True
    for record in record_gen:
        key = [record[f] for f in key_fields]
        if key == last_key or first_record:
            locus_chunk.append(record)
            first_record = False
        else:
            if locus_chunk != []:
                yield locus_chunk
            locus_chunk = [record]
        last_key = key
    if locus_chunk != []:
        yield locus_chunk
class call_stats:
    """Accumulator of genotyping/discovery statistics for one bin keyed by
    (true call type, downsampled coverage), plus static scorers for chunks
    of call records."""
    def __init__(self, call_type, coverage):
        # Bin identity.  NOTE: the `call_type` parameter shadows the
        # module-level call_type class inside this method.
        self.call_type = call_type
        self.coverage = coverage
        # Counters accumulated record-by-record via add_stat().
        self.calls = 0
        self.conf_ref_calls = 0
        self.conf_het_calls = 0
        self.conf_hom_calls = 0
        self.conf_var_calls = 0
        self.conf_genotype_calls = 0
        self.conf_refvar_calls = 0
        self.correct_genotype = 0
        self.correct_refvar = 0
    def add_stat(self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar):
        """Fold one record's (0/1) contributions into this bin's counters."""
        self.calls += calls
        self.conf_ref_calls += conf_ref_calls
        self.conf_het_calls += conf_het_calls
        self.conf_hom_calls += conf_hom_calls
        self.conf_var_calls += conf_var_calls
        self.conf_genotype_calls += conf_genotype_calls
        self.conf_refvar_calls += conf_refvar_calls
        self.correct_genotype += correct_genotype
        self.correct_refvar += correct_refvar
    @staticmethod
    def calc_discovery_stats(chunk):
        """Score variant-vs-reference (discovery) calls in `chunk`.
        Returns (correct_genotype, conf_calls, calls)."""
        conf_calls = 0
        correct_genotype = 0
        calls = 0
        for record in chunk:
            calls += 1
            # Confident when BtrLod >= 5, or the best call is Reference and
            # BtnbLod >= 5.  NOTE(review): `or` binds looser than `and`, so
            # this parses as A or (B and C) -- presumed intended.
            if float(record["BtrLod"]) >= 5 or call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"]) == call_type.call_types_2_state.Reference and float(record["BtnbLod"]) >= 5:
                conf_calls += 1
                if call_type.discovery_call_correct(record):
                    correct_genotype += 1
        return correct_genotype, conf_calls, calls
    @staticmethod
    def calc_genotyping_stats(chunk):
        """Score exact-genotype calls in `chunk`; confident when BtnbLod >= 5.
        Returns (correct_genotype, conf_calls, calls)."""
        conf_calls = 0
        correct_genotype = 0
        calls = 0
        for record in chunk:
            calls += 1
            if float(record["BtnbLod"]) >= 5:
                conf_calls += 1
                if call_type.genotyping_call_correct(record):
                    correct_genotype += 1
        return correct_genotype, conf_calls, calls
    @staticmethod
    def stats_header():
        # Column order must match stats_str(): prefix fields then __str__().
        return "TrueGenotype,Coverage,AccuracyConfidentGenotypingCalls,ConfidentGenotypingCallRate,AccuracyConfidentDiscoveryCalls,ConfidentDiscoveryCallRate,Calls,ConfRefCalls,ConfHetCalls,ConfHomCalls,ConfGenotypeCalls,CorrectGenotypes,ConfVarCalls,ConfDiscoveryCalls,CorrectDiscovery"
    def __str__(self):
        # Raw counters, in the order declared by stats_header()'s tail.
        return ",".join(map(str, (self.calls, self.conf_ref_calls, self.conf_het_calls, self.conf_hom_calls, self.conf_genotype_calls, self.correct_genotype, self.conf_var_calls, self.conf_refvar_calls, self.correct_refvar)))
    def stats_str(self):
        """One CSV row: bin identity, derived accuracy/call-rate ratios
        (max(...,1) guards divide-by-zero), then the raw counters."""
        return "%s,%d,%.5f,%.5f,%.5f,%.5f,%s" % (self.call_type, self.coverage, float(self.correct_genotype) / max(self.conf_genotype_calls,1), float(self.conf_genotype_calls) / max(self.calls,1), float(self.correct_refvar) / max(self.conf_refvar_calls,1), float(self.conf_refvar_calls) / max(self.calls,1), self.__str__())
class call_type:
    """Class that returns an Enum with the type of call provided by a record"""
    # NOTE(review): `Enum` here behaves like the legacy PyPI "enum" package
    # (positional member names, integer indexing, `.index` attribute) -- not
    # the stdlib `enum` module.  Confirm which package this file imports.
    call_types_3_state = Enum("HomozygousSNP","HeterozygousSNP","HomozygousReference")
    call_types_3_state_short = Enum("Hom","Het","Ref")
    call_types_2_state = Enum("Variant","Reference")
    @staticmethod
    def from_record_3_state(ref, genotype):
        """Given reference base as string, determine whether called genotype is homref, het, homvar"""
        # genotype.count(ref) counts reference alleles in the genotype string:
        # 0 -> HomozygousSNP, 1 -> HeterozygousSNP, 2 -> HomozygousReference.
        return call_type.call_types_3_state[genotype.count(ref)]
    @staticmethod
    def from_record_2_state(ref, genotype):
        """Given reference base as string, determine whether called genotype is ref or var"""
        # Fewer than two reference alleles implies at least one variant allele.
        return call_type.call_types_2_state[0] if genotype.count(ref) < 2 else call_type.call_types_2_state[1]
    @staticmethod
    def genotyping_call_correct(record):
        # Exact genotype match against the HapMap chip truth.
        return record["HapmapChipGenotype"] == record["BestGenotype"]
    @staticmethod
    def discovery_call_correct(record):
        # Variant-vs-reference agreement only; the genotype itself may differ.
        return call_type.from_record_2_state(record["ReferenceBase"][0], record["HapmapChipGenotype"]) == call_type.from_record_2_state(record["ReferenceBase"][0], record["BestGenotype"])
def print_record_debug(rec):
    # Dump one record's genotyping-vs-discovery correctness plus all of its
    # fields, for eyeballing disagreements between the two scoring modes.
    print "TEST Genotyping:", call_type.genotyping_call_correct(rec)
    print "TEST Discovery:", call_type.discovery_call_correct(rec)
    print "DIFF:", call_type.genotyping_call_correct(rec) != call_type.discovery_call_correct(rec)
    # Sorted field dump, one "key => value" pair per line.
    print "\n".join([" %20s => '%s'" % (k,v) for k,v in sorted(rec.items())])
    print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["HapmapChipGenotype"])
    print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["BestGenotype"])
    print
def aggregate_stats(filename, max_loci, table_filename, debug):
    """Read call records from `filename`, score each record's genotyping and
    discovery correctness, aggregate per (true call type, coverage) bin, and
    write one call_stats CSV row per bin to `table_filename`."""
    aggregate = dict()  # (HapmapChipCallType, DownsampledCoverage) -> call_stats
    fout = open(table_filename,"w")
    fout.write(call_stats.stats_header()+"\n")
    # Group consecutive records sharing the same locus.
    locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
    for index, locus_chunk in enumerate(locus_gen):
        if index >= max_loci:
            break
        if (index % 1000) == 0:
            # Progress heartbeat on stderr.
            sys.stderr.write( str(index)+" loci processed, at: "+locus_chunk[0]["Sequence"]+":"+locus_chunk[0]["Position"]+"\n")
        covs = dict()  # NOTE(review): never used below.
        # Sub-group the locus by downsampled coverage.
        coverage_chunk_gen = chunk_generator(locus_chunk, ("DownsampledCoverage", "Sequence", "Position"))
        for cov_chunk in coverage_chunk_gen:
            first_record = cov_chunk[0]
            for record in cov_chunk:
                # Truth call type from the HapMap chip genotype.
                hapmap_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["HapmapChipGenotype"])
                key = hapmap_genotyping_call_type, int(record["DownsampledCoverage"])
                record["DownsampledCoverage"] = int(record["DownsampledCoverage"])
                record["HapmapChipCallType"] = hapmap_genotyping_call_type
                value = record  # NOTE(review): never used below.
                # Score this single record under both modes.
                correct_genotype, conf_genotype_calls, genotype_calls = call_stats.calc_genotyping_stats([record])
                correct_refvar, conf_refvar_calls, refvar_calls = call_stats.calc_discovery_stats([record])
                assert(genotype_calls == refvar_calls)
                # Attribute a confident genotype call to hom/het/ref per the
                # called (best) genotype's 3-state type.
                conf_ref_calls = 0
                conf_het_calls = 0
                conf_hom_calls = 0
                best_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["BestGenotype"])
                if conf_genotype_calls:
                    if best_genotyping_call_type.index == 0: conf_hom_calls = 1
                    if best_genotyping_call_type.index == 1: conf_het_calls = 1
                    if best_genotyping_call_type.index == 2: conf_ref_calls = 1
                # Attribute a confident discovery call to Variant if so called.
                conf_var_calls = 0
                if conf_refvar_calls:
                    this_variant_call_type = call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"])
                    conf_var_calls = 1 if this_variant_call_type.index == 0 else 0
                # Create the bin on first sight, then fold the counts in.
                aggregate.setdefault(key, call_stats(*key))
                aggregate[key].add_stat(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar)
                if debug:
                    print "KEYS:",key
                    print_record_debug(record)
    # Emit one CSV row per bin, in sorted bin order.
    for key, records in sorted(aggregate.items()):
        fout.write(records.stats_str()+"\n")
    fout.close()
class weighted_avg:
    """Running weighted average: fold in (value, weight) pairs with add(),
    read the mean with return_avg()."""

    def __init__(self):
        # Weighted sum of values and total weight seen so far.
        self.sum = 0.0
        self.count = 0

    def add(self, value, counts):
        """Accumulate `value` observed with weight `counts`."""
        self.sum = self.sum + value * counts
        self.count = self.count + counts

    def return_avg(self):
        """Weighted mean so far; 0.0 when nothing has been added."""
        denominator = max(self.count, 1)
        return float(self.sum) / denominator
def stats_from_hist(options, depth_hist_filename, stats_filename, variant_eval_dir, depth_multiplier=1.0):
    """Predict genotype-calling performance for a coverage histogram by
    combining per-depth empirical stats (from `stats_filename`) with a
    genotype prior, printing expected confident-call counts.

    NOTE(review): `variant_eval_dir` is unused; the VariantEval comparison
    below reads options.variant_eval_file instead.
    """
    # Genotype prior: [P(hom-var), P(het), P(hom-ref)].  Earlier candidates:
    #prob_genotype = [1e-5, 1e-3, 1-1e-3]
    #prob_genotype = [0.37, 0.62, .0]
    #prob_genotype = [0.203, 0.304, .491]
    #prob_genotype = [0.216, 0.302, .481]
    prob_genotype = [0.205, 0.306, 0.491] # Based on CEU NA12878 actual hapmap chip calls
    #prob_genotype = [0.213, 0.313, 0.474] # Based on YRB NA19240 actual hapmap chip calls
    theta = 1.0/1850 # expected heterozygosity
    # NOTE(review): this overrides the hapmap-based prior above with a
    # theta-derived one, making the previous assignment dead.
    prob_genotype = [0.5 * theta, 1.0 * theta, 1 - 1.5*theta]
    # hist[d] = number of sites at depth d; indices are asserted to equal depth.
    hist = []
    hist_gen = FlatFileTable.record_generator(depth_hist_filename, sep=" ", skip_until_regex_line = "^depth count freq")
    for index, record in enumerate(hist_gen):
        assert int(record["depth"]) == index
        hist.append(int(record["count"]))
    # If upsampling is not done in the CoverageEval GATK module, the number of
    # observations of reads with high depth of coverage can be very low giving
    # poorly sampled stats (hence max_usable_depth below).
    # stats_dict[coverage][true_genotype] -> row from the empirical stats CSV.
    stats_dict = dict()
    stats_gen = FlatFileTable.record_generator(stats_filename, sep=",")
    for record in stats_gen:
        key1 = int(record["Coverage"])
        key2 = record["TrueGenotype"]
        stats_dict.setdefault(key1, dict()) # create a nested dict if it doesn't exist
        stats_dict[key1][key2] = record # create an entry for these keys
    acc = dict()        # true genotype -> weighted_avg of accuracy
    call_rate = dict()  # true genotype -> weighted_avg of call rate
    conf_calls = dict() # called-genotype enum -> expected confident call count
    start = 1
    end = 1000
    max_usable_depth = 40 # Depth of coverage beyond which stats are not sampled enough and we take the stat at this depth instead
    for depth, depth_count in enumerate(hist[start:end+1],start):
        try:
            # Apply the "downsampling" multiplier, clamping to at least 1.
            depth = max(int(float(depth*depth_multiplier)),1)
            if depth > max_usable_depth: # substitute a depth we know is well sampled
                depth = max_usable_depth
            depth_entries = stats_dict[depth]
        except KeyError:
            print "Stopped on depth",depth
            break
        if True:  # leftover guard; kept to preserve original structure
            for true_genotype, stat in depth_entries.items():
                for genotype in call_type.call_types_3_state:
                    conf_calls.setdefault(genotype, 0.0)
                    # P(confident call of this genotype at this depth/truth).
                    prob_conf_x_call = float(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"])/float(stat["Calls"])
                    # Expected confident calls = sites * P(call) * prior.
                    conf_calls[genotype] += depth_count * prob_conf_x_call * prob_genotype[genotype.index]
                acc.setdefault(true_genotype,weighted_avg())
                call_rate.setdefault(true_genotype,weighted_avg())
                acc[true_genotype].add(float(stat["AccuracyConfidentGenotypingCalls"]), depth_count)
                call_rate[true_genotype].add(float(stat["ConfidentGenotypingCallRate"]), depth_count)
    import numpy
    for genotype in call_type.call_types_3_state:
        # NOTE(review): acc/call_rate are keyed by the TrueGenotype strings
        # from the stats file; this assumes str(genotype) matches them exactly.
        print "%19s accuracy : %.3f" % (str(genotype), acc[str(genotype)].return_avg())
        print "%19s call rate: %.3f" % (str(genotype), call_rate[str(genotype)].return_avg())
    print "\nExpected performance given perfect accuracy and call rate:"
    print "%19s %7s %7s %7s" % ("", "Actual", "Perfect", "Diff.")
    total_hist_sites = numpy.sum(hist)
    total_predicted = 0
    for genotype in call_type.call_types_3_state:
        predicted = conf_calls[genotype]
        total_predicted += predicted
        # "Perfect" = prior probability times the total number of sites.
        perfect = prob_genotype[genotype.index]*total_hist_sites
        diff = perfect - predicted
        percent_of_possible = predicted / perfect * 100
        print "%19s calls: %8.0f %8.0f %8.0f %8.1f" % (genotype, predicted, perfect, diff, percent_of_possible)
    print " Total calls: %7d" % total_predicted
    print "\nCoverage histogram mean: %.2f" % numpy.average(range(len(hist)), weights=hist)
    # If a VariantEval output file was given, pull its het/hom site counts
    # for comparison against the predictions.
    if options.variant_eval_file != None:
        vareval_file = open(options.variant_eval_file)
        num_hets = None; num_homs = None
        for line in vareval_file:
            if "UNKNOWN_CALLED_VAR_HET_NO_SITES" in line: num_hets = line.rstrip().split()[2]
            if "UNKNOWN_CALLED_VAR_HOM_NO_SITES" in line: num_homs = line.rstrip().split()[2]
    if options.oneline_stats != None:
        # NOTE(review): num_hets/num_homs are only bound when
        # options.variant_eval_file was supplied -- NameError otherwise.
        oneline_stats = open(options.oneline_stats, "w")
        pred_hom = conf_calls[call_type.call_types_3_state[0]]
        pred_het = conf_calls[call_type.call_types_3_state[1]]
        print >>oneline_stats, "%s %.0f %s %.0f %s" % (options.oneline_stats, pred_hom, num_homs, pred_het, num_hets)
def usage(parser):
    """Print the parser's usage line to stdout, then terminate the process."""
    parser.print_usage()
    sys.exit()
if __name__ == "__main__":
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-c", "--genotype_call_file", help="GELI file to use in generating coverage stat table", dest="genotype_filename", )
parser.add_option("-s", "--stats_file", help="file to containing empirical genotyping stats", dest="stats_filename", default=None)
parser.add_option("-g", "--histogram_file", help="file containing counts of each depth of coverage", dest="hist_filename")
parser.add_option("-m", "--max_loci", help="maximum number of loci to parse (for debugging)", default=sys.maxint, dest="max_loci", type="int")
parser.add_option("-e", "--evaluate", help="evaluate genotypes; requires a stats file and a histogram file", default=False, dest="evaluate_genotypes", action="store_true")
parser.add_option("-d", "--debug", help="provide debugging output", default=False, dest="debug", action="store_true")
parser.add_option("-p", "--depth_multiplier", help="multiply all depths in histogram by this value; for \"downsampling\" depth", default=1.0, dest="depth_multiplier", type=float)
parser.add_option("-v", "--variant_eval_file", help="file with output of VariantEval genotype concordance to compare this prediction to", default=None, dest="variant_eval_file")
parser.add_option("-o", "--oneline_stats_file", help="output single, tabular line of stats to this file", default=None, dest="oneline_stats")
(options, args) = parser.parse_args()
if not options.evaluate_genotypes:
print "Creating performance tables from genotypes file"
#if options.stats_filename == None:
# sys.exit("Must provide -s stats fliname option")
if options.genotype_filename == None:
sys.exit("Must provide -c genotype call filename option")
stats_filename = options.stats_filename if options.stats_filename != None else options.genotype_filename+".stats"
aggregate_stats(options.genotype_filename, options.max_loci, options.stats_filename, options.debug)
else:
print "Evaluating genotypes"
print "Depth multiplier:",options.depth_multiplier
if options.hist_filename == None:
sys.exit("Must provide -g histogram filename option")
if options.stats_filename == None:
sys.exit("Must provide -s stats fliname option")
stats_from_hist(options, options.hist_filename, options.stats_filename, options.variant_eval_file, options.depth_multiplier)

View File

@ -1,141 +0,0 @@
#!/usr/bin/env python
import sys, os
from farm_commands import cmd
#if __name__ == "__main__":
from optparse import OptionParser
class OptionParser(OptionParser):
    """optparse.OptionParser extended with a post-parse required-option check."""

    def check_required(self, opts):
        """Exit via self.error() if any option string in `opts` is unset.
        Relies on each required option declaring default=None."""
        for flag in opts:
            opt = self.get_option(flag)
            current = getattr(self.values, opt.dest)
            if current is None:
                self.error("%s option not supplied" % opt)
def exists_and_non_zero(file):
    """True when `file` is present on disk and has a size greater than zero."""
    if not os.path.exists(file):
        return False
    return os.path.getsize(file) > 0
def process_bam(bam_file):
gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar "
output_basepath = os.path.splitext(bam_file)[0]
output_basefile = os.path.basename(output_basepath)
print "Input BAM file:",bam_file
using_hapmapchip = None
if False:
# Config for chr1 of pilot 2 people
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
#interval_string = "1"
# Assure existance of intervals file
intervals_file = output_basepath+".hapmap.intervals.chr1"
if not exists_and_non_zero(intervals_file):
Cmd = "awk -F' ' '{ if ($1 == \""+interval_string+"\") {print $1 \":\" $4} }' "+hapmapchip_file+" > "+intervals_file
cmd(Cmd, die_on_fail = True)
# Setup hapmapchip GFFs; assure that hapmap chip file is here
using_hapmapchip = True
hapmapchip_file = output_basepath+".hapmapchip.gff"
if not exists_and_non_zero(hapmapchip_file):
sys.exit("Expected to find "+hapmapchip_file+" but it's not around")
else:
# Config for pilot 3 Broad data
#ref = " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
#intervals_file = "/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list"
# Config for pilot 3 DCC data (12/16/09 r. 2375)
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
intervals_file = "/humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list"
# Assure that BAM index exists
bam_index_file = output_basepath+".bam.bai"
if not exists_and_non_zero(bam_index_file):
Cmd = "samtools index "+bam_file
cmd(Cmd, die_on_fail = True)
# Run UnifiedGenotyper calls
vcf_outfile = output_basepath+".calls.vcf"
if not exists_and_non_zero(vcf_outfile):
print "Running UnifiedGenotyper"
Cmd = gatk_exe + ref + " -T UnifiedGenotyper"+\
" -I "+bam_file+\
" -varout "+vcf_outfile+\
" -L "+intervals_file+\
" -fmq0 -l INFO "
#" --genotype -lod 5"
#+interval_string+\
cmd(Cmd, die_on_fail=True)
# Run VariantEval on UnifiedGenotyper calls
varianteval_outfile = output_basepath+".varianteval"
if not exists_and_non_zero(varianteval_outfile):
print "Running VariantEval with output to "+varianteval_outfile
Cmd = gatk_exe + ref + " -T VariantEval"+\
" -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\
" -B eval,VCF,"+ssg_outfile+\
" -o "+varianteval_outfile+\
" -L "+intervals_file+\
" -G -V"+\
" --evalContainsGenotypes"+\
" -minPhredConfidenceScore 150"+\
" -fmq0 -l INFO"
if using_hapmapchip:
Cmd = Cmd + " -hc "+hapmapchip_file
cmd(Cmd, die_on_fail=True)
# Run CoverageHist on BAM file
hist_outfile = output_basepath+".hapmap.coverage_hist"
if not exists_and_non_zero(hist_outfile):
print "Running CoverageHist on "+bam_file
Cmd = gatk_exe + ref + " -T DepthOfCoverage"+\
" -I "+bam_file+\
" -o "+hist_outfile+\
" -L "+intervals_file+\
" -histogram --suppressLocusPrinting"+\
" -fmq0 -l INFO"
cmd(Cmd, die_on_fail=True)#,"gsa", output_basepath)
# Now do CoverageEval predictions if they haven't been done
oneline_stats_file = output_basepath+".oneline_stats"
if not exists_and_non_zero(oneline_stats_file):
Cmd = "CoverageEval.py -e"+\
" -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/current/NA12878.chr1.full.stats"+\
" -g "+hist_outfile+\
" -v "+varianteval_outfile+\
" -o "+oneline_stats_file
cmd(Cmd, die_on_fail=True)
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None)
    parser.add_option("-a", "--all_bams", help="Process all BAMs in current directory", default=False, dest="all_bams", action="store_true")
    (options, args) = parser.parse_args()
    if not options.all_bams:
        # Single-BAM mode: -b becomes mandatory.
        parser.check_required(("-b",))
        process_bam(options.bam_file)
    else:
        # Batch mode: process every *.bam in the working directory.
        bams = [f for f in os.listdir(".") if f.endswith(".bam")]
        print bams
        for bam in bams:
            process_bam(bam)

View File

@ -1,47 +0,0 @@
#!/util/bin/python
import math, string, sys
for input_file in sys.argv[1:]:
FILENAME = input_file
# Read the depth of coverage distribution from file
depth = []; count = []
for line in open(FILENAME):
try:
int(line[0])
split = string.split(line,' ')
if int(split[1]) != 0:
depth.append(int(split[0]))
count.append(int(split[1]))
except ValueError: pass
# Calculate the mode
mode = depth[count.index(max(count))]
# Find the index of maximum extent of 'good' data
idx = depth.index(mode); dist = 10**9
while abs(count[idx] - count[0]) < dist:
dist = abs(count[idx] - count[0])
idx += 1
model_count = count[:idx + 1]
# Calculate mean & variance of 'good' data
model_tot = sum(model_count); adder = 0
for i in range(len(model_count)): adder += depth[i]*model_count[i]
mean = adder/float(model_tot)
adder = 0
for i in range(len(model_count)): adder += model_count[i]*(mean - depth[i])**2
var = adder/float(model_tot)
stdev = var**0.5
# Output Thresholds
print FILENAME
print 'Mean Depth of Coverage: ' + str(mean)
print 'Plus One Std Dev: ' + str(mean + stdev)
print 'Plus Two Std Dev: ' + str(mean + 2*stdev)
print 'Plus Three Std Dev: ' + str(mean + 3*stdev)
print 'Plus Four Std Dev: ' + str(mean + 4*stdev)
print 'Plus Five Std Dev: ' + str(mean + 5*stdev)

View File

@ -1,309 +0,0 @@
#!/usr/bin/env python
# Evaluates mapping results produced by RunMapper.py by comparing
# the result to the original simulated reads in a SAM file
import getopt, sys, os, re
from operator import attrgetter, itemgetter
from subprocess import Popen, STDOUT, PIPE
from SAM import SAMIO
from pushback_file import pushback_file
from qltout import qltout_file
from aln_file import aln_file
#!/usr/bin/env python
import string, sys
class SAMRecordWrapper:
    """Adapts a SAM record to the mapping-record interface (id / contig /
    pos / offset / map_qual) shared with the other alignment readers."""

    def __init__(self, rec):
        # The underlying SAM record; all accessors delegate to it.
        self.rec = rec

    def id(self):
        return self.rec.getQName()

    def contig(self):
        return self.rec.getRname()

    def pos(self):
        return self.rec.getPos()

    def offset(self):
        # 0-based equivalent of the 1-based pos().
        return self.pos() - 1

    def map_qual(self):
        return self.rec.getMapq()

    def __str__(self):
        return str(self.rec)
def SAMIOWrapper( file ):
    # Factory: open `file` as a SAMIO reader whose records are wrapped in
    # SAMRecordWrapper so they expose the common mapping-record interface.
    print 'SAMIOWrapper', file
    return SAMIO( file, debugging=True, func=SAMRecordWrapper)
class mapper_info:
    """Describes one supported mapper: its (lower-cased) name, the extension
    of its output files, and the reader class used to parse that output."""

    def __init__(self, name, ext, file_class):
        # Normalise the name so lookups can be case-insensitive.
        self.name = name.lower()
        self.ext = ext
        self.file_class = file_class
# Registry of supported mappers, keyed by lower-case name: each entry records
# the output-file extension and the reader class that parses that output.
mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file),
                mapper_info("merlin", "qltout", qltout_file),
                mapper_info("swmerlin", "qltout", qltout_file),
                mapper_info("bwa", "sam", SAMIOWrapper),
                mapper_info("bwa32", "sam", SAMIOWrapper),
                mapper_info("maq", "aln.txt", aln_file) ]
# Rebind the list as a name -> mapper_info lookup table.
mapper_defs = dict( [(m.name, m) for m in mapper_defs] )
def Comp_sam_map(comp_head, mapper, file_output, sam_indel):
# sam_indel is a +/- number corresponding to the size of insertions / deletions
# in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always
# translate to the SAM's cigar, but the full value should be used
file_output_head = comp_head+"."+mapper.name
if file_output:
# if non-false, output to filename in file_output_head
# if "" or False, leave output going to stdout
#saveout = sys.stdout
feval = open(file_output_head+".eval", "w")
#sys.stdout = feval
# create a file with a one-line tabular output of stats
# (for merging into Excel / R input)
fevaltab = open(file_output_head+".tab", "w")
# if maq file, convert aln.map to aln.txt
aln_txt_filename = file_output_head+"."+mapper.ext
if mapper.name == "maq" and not os.path.exists(aln_txt_filename):
maq_exe = "/seq/dirseq/maq-0.7.1/maq"
# SHOULD BE THIS
#cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
print >> sys.stderr, "Executing "+cmd_str
fout = open(aln_txt_filename, "w")
p = Popen( cmd_str , shell=True, stdout=fout)
map_filename = comp_head+"."+mapper.name+"."+mapper.ext
print 'map_filename', map_filename
map_file = mapper.file_class(map_filename)
SAM_filename = comp_head+".sam"
sam_file = SAMIO(SAM_filename, debugging=True)
#sams = [s for s in sam_file]
print "Reading map file..."
maps = [q for q in map_file]
print "Reading SAM file..."
sams = {}
if mapper.name in ["maq", 'bwa', 'bwa32']:
for s in sam_file:
sams[s.getQName()] = s
else:
for i, s in enumerate(sam_file):
sams[i] = s
#maps = {}
#for m in map_file:
# maps[m.id()] = m
#sams = [s for s in sam_file]
class smq_t:
def __init__(self, sam, map, qual):
self.sam = sam # SAM record (exists for all reads)
self.map = map # Mapping record, may be None
self.qual = qual # Quality of mapping
print "Making SMQ records..."
SMQ = []
for maprec in maps:
samrec = sams.get(maprec.id()) # get mapping otherwise None
SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) )
print "Sorting..."
SMQ.sort(key=attrgetter('qual'), reverse=True)
print "Evaluating placements..."
placed = 0
incorrectly_placed = 0
correctly_placed = 0
fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" )
if len(SMQ):
last_qual = SMQ[0].qual # grab 1st qual
for smq in SMQ:
if smq.sam == None:
continue
if smq.qual != last_qual:
total_reads = len(sams)#placed + not_placed
not_placed = len(sams) - placed
fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
#print "Wrote all reads with qual >= "+str(last_qual)
last_qual = smq.qual
placed += 1
# Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases
# inserted / deleted - e.g. +1, -2
#cigar = smq.sam.getCigar()
#indelled_bases = 0
#match = re.search(r'(\d+)I', cigar)
#if match:
# indelled_bases += int(match.groups()[0])
#match = re.search(r'(\d+)D', cigar)
#if match:
# indelled_bases -= int(match.groups()[0])
# We consider a read properly aligned if
# - both start positions are the same OR
# - the end position of the mapping = end position of the samq
#if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos():
try:
map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq
# are generally at the beginning of an alignment spanning the anchor base
except AttributeError:
map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care
# if its sam.py interface doesn't have access to indel data since there are
# no indels in its alignmnets
if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos():
#print >> feval, "Indelled SAM bases: "+str(indelled_bases)
try:
print >> feval, "Indelled map bases: "+str(map_indel_bases)
except AttributeError:
pass
print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos())
print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos())
print >> feval, "SAM record:\n"+str(smq.sam)
print >> feval, "Mapping record:\n"+str(smq.map)+"\n"
incorrectly_placed += 1
else:
correctly_placed +=1
#indexed_maps = [None] * len(sams)
#for q in maps:
# indexed_maps[q.id()] = q
#def f(s,q):
# if q == None:
# return s
# else:
# return False
#missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) )
#for miss in missing:
# print miss
#not_placed = len(missing)
#not_placed = len(sams) - len(maps)
total_reads = len(sams) #placed + not_placed
not_placed = len(sams) - placed
print >> feval, "Total reads : "+str(total_reads)
print >> feval, "+-Placed : "+str(placed)
print >> feval, "| +-Correct : "+str(correctly_placed)
print >> feval, "| +-Incorrect: "+str(incorrectly_placed)
print >> feval, "+-Not placed : "+str(not_placed)
fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
print "Wrote all reads with all quals"
#if file_output_head:
# sys.stdout = saveout
def remove_escape_codes(s):
    """Strip ANSI terminal escape sequences (e.g. color codes from ls) from *s*."""
    import re
    ansi_escape = re.compile(r'\x1b\[(\d\d)?\w')
    return ansi_escape.sub("", s)
def usage():
    """Print command-line help for EvalMapping.py and exit with status 2."""
    print "EvalMapping.py\n"
    print "Required arguments:\n"
    print " -h HEAD All input and output files start with HEAD"
    print " --OR--"
    print " <list of *.sam files on STDIN to use as filename templates>"
    print " i.e. ls *.sam | EvalMapping.py -m MAPPER"
    print
    print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)"
    print
    print "Optional arguments:\n"
    print
    print " -o Output to screen instead of HEAD.MAPPER.eval"
    print
    print "Input files:"
    print " HEAD.sam"
    print
    print "Result files expected for Merlin and ILT:"
    print " HEAD.MAPPER.qltout"
    print
    print "Result files expected for MAQ:"
    print
    # Exit status 2: conventional "command-line usage error".
    sys.exit(2)
if __name__ == "__main__":
    # Parse arguments: -h/--head (file head), -m/--mapper (one mapper, a
    # comma-separated list, or "all"), -o/--file-output (write to screen).
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"])
    except getopt.GetoptError:
        usage()
    comp_heads = []
    mapper_str = ""
    file_output = True
    for opt, arg in opts:
        if opt in ("-h", "--head"):
            comp_heads = [arg]
        if opt in ("-m", "--mapper"):
            mapper_str = arg
            possible_mappers = mapper_defs.keys()
            # Check it later now...
            #if mapper_str not in possible_mappers:
            # print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers)
            # sys.exit(2)
        if opt in ("-o", "--file-output"):
            file_output = False
    # Check for required args
    if (mapper_str == ""):
        usage()
    elif (mapper_str == "all"):
        mappers = mapper_defs.values()
        print mappers
    else:
        try:
            mapper_str_items = mapper_str.split(",")
            # NOTE(review): dict.get returns None for unknown mappers, so this
            # except branch likely never fires and an unknown mapper yields a
            # None entry in mappers instead of the intended error message.
            mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items]
        except KeyError:
            print "Don't know this mapper given in -m option: "+mapper
            print "Try: ilt, merlin, maq, bwa"
            sys.exit(1)
    # if we didn't already get a single file from -h
    if len(comp_heads) == 0:
        # and there's a list on STDIN,
        if not sys.stdin.isatty(): # redirected from file or pipe
            # One head per STDIN line: strip terminal escape codes (e.g. from a
            # colorized 'ls'), drop the extension, and discard empty lines.
            comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ]
            comp_heads = [ f for f in comp_heads if len(f) > 0 ]
        else:
            # no single file AND no STDIN file list
            usage()
    print "\nFile heads to evaluate:"
    print "\n".join(comp_heads)+"\n"
    comp_heads.reverse()
    for comp_head in comp_heads: #[:1]
        for mapper in mappers:
            # Guess the original size of simulated indel from filename
            # (e.g. '..._den.3_...' plus '.DELETION_'/'.INSERTION_' markers).
            m = re.search(r'_den\.(\d+)_', comp_head)
            if m:
                if ".DELETION_" in comp_head:
                    indel_size = -1 * int(m.groups()[0])
                elif ".INSERTION_" in comp_head:
                    indel_size = int(m.groups()[0])
                else:
                    indel_size = 0
            else:
                sys.exit("Couldn't guess simulated read indel size from filename")
            print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head
            Comp_sam_map(comp_head, mapper, file_output, indel_size)

View File

@ -1,65 +0,0 @@
#!/usr/bin/env python
# Utility script to convert FASTA + QUALS file into FASTQ required for MAQ
# MAQ needs to convert fastq into bfq file using fastq2bfq in order to use
# this data
import sys, fasta, os
from itertools import *
def usage():
    """Print usage for the FASTA + quals -> FASTQ converter and exit."""
    print "Usage: fastq.py fasta_in quals_in fastq_out"
    print " fasta_in: *.fasta file"
    print " quals_in: *.quals.txt OR *.quala"
    sys.exit()
def output_fastq_record(file_handle, fasta, quals):
    """Write one FASTQ record to *file_handle*.

    fasta -- record with .id and .seq (the bases)
    quals -- record whose .seq is a whitespace-separated list of integer
             qualities, one per base; encoded as Phred+33 characters.

    BUG FIX: previously wrote to the module-global 'fqout' instead of the
    file_handle parameter, so the parameter was silently ignored.
    NOTE: parameter 'fasta' shadows the module-level 'fasta' import inside
    this function (kept for interface compatibility).
    """
    quals_vector = [int(q) for q in quals.seq.split()]
    file_handle.write("@" + fasta.id + "\n")
    file_handle.write(fasta.seq + "\n")
    file_handle.write("+\n")
    file_handle.write("".join(chr(q + 33) for q in quals_vector) + "\n")
def qualStr2fasta( line ):
    """Parse one '<id> <q1> <q2> ...' quality line into a fasta.fasta_record
    whose sequence field holds the space-joined quality numbers."""
    fields = line.split()
    record_id = fields[0]
    qual_string = ' '.join(fields[1:])
    return fasta.fasta_record(record_id, qual_string)
if __name__ == "__main__":
    if len(sys.argv[1:]) != 3:
        usage()
    else:
        fasta_in_file, quals_in_file, fastq_out_file = sys.argv[1:]
        print "Quals input file: "+quals_in_file
        print "Fasta input file: "+fasta_in_file
        # Build a generator of quality records from either a .txt (one record
        # per line) or a .quala (fasta-formatted) input.
        if os.path.splitext(quals_in_file)[1] == ".txt":
            qual_factory = imap( qualStr2fasta, file(quals_in_file) )
        else:
            if os.path.splitext(quals_in_file)[1] == ".quala":
                # Create fasta factory from the quala file and treat it
                # as if it was a fasta file and pull the numbers
                qual_factory = fasta.fasta_file(quals_in_file, cleanup=False)
            else:
                # NOTE(review): this branch only prints; qual_factory is left
                # unbound, so the loop below raises NameError. Probably
                # intended to exit here.
                print "Qual input file should be a *.quals.txt or *.quala file"
        print "Fastq ouptut file: "+fastq_out_file
        fqout = file(fastq_out_file, "w")
        # izip_longest pads the shorter stream with None so a count mismatch
        # between fasta and quals can be detected below.
        for qual_record, fasta_record in izip_longest(qual_factory, fasta.fasta_file(fasta_in_file)):
            if qual_record == None or fasta_record == None:
                # We ran out of one record type, so warn the user!
                print "WARNING: Different number of quals and fastas in input files!"
                sys.exit(1)
            #print 'qual_record', qual_record
            #print 'fasta_record', fasta_record
            output_fastq_record(fqout, fasta_record, qual_record)

View File

@ -1,40 +0,0 @@
#!/usr/bin/env python
import sys, itertools
def record_generator(filename, sep="\t", skip_n_lines=0, skip_until_regex_line=""):
    """Given a file with field headers on the first line and records on subsequent lines,
    generates a dictionary for each line keyed by the header fields.

    skip_n_lines          -- number of leading lines to discard before the header
    skip_until_regex_line -- if non-empty, scan forward until a line matching
                             this regex is found; that MATCHING line itself is
                             used as the header line.
    """
    fin = open(filename)
    if skip_n_lines > 0:
        for i in range(skip_n_lines): # Skip a number of lines
            fin.readline()
    found_regex = False
    if skip_until_regex_line != "":
        import re
        regex_line = re.compile(skip_until_regex_line)
        for line in fin:
            match = regex_line.search(line)
            if match:
                # Keep the whole matching line; it becomes the header below.
                found_regex = line
                break
        if not found_regex:
            # NOTE(review): if the regex never matches, fin is now exhausted,
            # so the fallback readline() below returns "" and header == [""].
            print "Warning: Regex "+skip_until_regex_line+" not found in FlatFileTable:record_generator"
    if found_regex:
        header = found_regex.rstrip().split(sep) # Parse header
    else:
        header = fin.readline().rstrip().split(sep) # Pull off header
    for line in fin: #
        fields = line.rstrip().split(sep)
        # izip truncates: rows with fewer fields than the header produce dicts
        # missing the trailing keys.
        record = dict(itertools.izip(header, fields))
        yield record
def record_matches_values(record, match_field_values):
    """Return True iff, for every (field, allowed_values) pair, the record's
    value for that field is one of the allowed values."""
    return all(record[field] in allowed
               for field, allowed in match_field_values)

View File

@ -1,49 +0,0 @@
#!/usr/bin/env python
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
import sys, os, subprocess
# Numeric chromosome index -> UCSC-style name: 0 is chrM, 1-22 are autosomes,
# 23/24 are chrX/chrY (25 entries total).
Index2AffyChr = ["chrM"] + ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY"]
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
        sys.exit()
    else:
        infile = sys.argv[1]
        outdir = sys.argv[2]
        # Output GFF takes the input's basename with a .gff extension.
        output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
        outfile = open(output_file, "w")
        # Stream GeliToText output; the first line is discarded below.
        cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
        pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
        pipe.readline()
        for line in pipe:
            # Skip header/comment lines emitted by GeliToText.
            if line[0] not in ('@', '#'):
                fields = line.split("\t")
                #print fields
                try:
                    contig = fields[0] #Index2AffyChr[int(fields[0])]
                except KeyError, ValueError:
                    # NOTE(review): py2 syntax — this catches only KeyError and
                    # binds it to the name ValueError; and since the try body is
                    # a plain list index kept from the commented-out lookup, it
                    # can only raise IndexError, so this handler is dead code.
                    print "skipping "+fields[0]
                    continue # Skip entries not in chr 0 through 24
                # GFF columns: contig, source, feature(genotype), start, end(+1), score, strand, frame.
                print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"

View File

@ -1,170 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
import re
import glob
import picard_utils
import itertools
# Default GATK jar (note: SingleSampleGenotyperCmd shadows this with a pinned build).
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
# hg18 reference FASTA used by all GATK invocations below.
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
def geli2dbsnpFile(geli):
    """Return the '.dbsnp_matches' path paired with *geli*, or None when the
    input is already a '.calls' text file (no dbSNP collection for those)."""
    directory, flowcell_dot_lane, extension = picard_utils.splitPath(geli)
    if extension == ".calls":
        return None
    return os.path.join(directory, flowcell_dot_lane) + '.dbsnp_matches'
def SingleSampleGenotyperCmd(bam, geli, use2bp):
    """Build the GATK SingleSampleGenotyper command line for one BAM.

    bam    -- input BAM; the NA id is taken from the filename prefix
    geli   -- output variants file (-varout)
    use2bp -- NOTE(review): currently unused in the command construction

    Returns the shell command string (does not run it).
    """
    naid = os.path.split(bam)[1].split(".")[0]
    metrics = geli + '.metrics'
    # Shadows the module-level gatkPath with a specific pinned build.
    gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
    hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
    hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip])
    if not os.path.exists(hapmapChip):
        # No chip data for this individual: drop the --hapmap_chip argument.
        print '*** warning, no hapmap chip resuls for', naid
        hapmapChipStr = ''
    targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
    cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr
    return cmd
def bams2geli(bams):
    """Submit genotyping jobs (GATK SSG or Picard, per OPTIONS.useSSG) for each
    BAM, returning (geli_filenames, farm_job_ids).

    A job id of 0 means nothing was submitted for that BAM because its output
    file already existed.
    """
    def call1(bam):
        # Submit one genotyping job for a single BAM, skipping it when the
        # expected output already exists.
        geli = os.path.splitext(bam)[0] + '.geli'
        jobid = 0
        # BUG FIX: cmd was previously left unbound when the output file already
        # existed, so the unconditional farm submission below raised
        # UnboundLocalError instead of skipping the BAM.
        cmd = None
        if OPTIONS.useSSG:
            if not os.path.exists(geli + '.calls'):
                cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
        else:
            if not os.path.exists(geli):
                cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
        if cmd is not None:
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
        return geli, jobid
    calls = map(call1, bams)
    return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
def gelis2gelisText( gelis ):
    """Map each geli path to its text '.calls' filename: paths already ending
    in '.calls' pass through; otherwise the basename gets '.calls' appended."""
    def as_text_name(path):
        if os.path.splitext(path)[1] == ".calls":
            return path
        return os.path.split(path)[1] + '.calls'
    return map(as_text_name, gelis)
def main():
    """Driver: genotype a list of lanes (BAMs or geli files), collect dbSNP
    match statistics, convert calls to text, sort them in reference order and
    emit aggregated per-site SNP calls to OPTIONS.output."""
    global OPTIONS, ROOT
    usage = "usage: %prog lanes.list nIndividuals [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farmQueue", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to submit jobs to. Leave blank for local processing")
    parser.add_option("-l", "--lod", dest="lod",
                      type="float", default=5,
                      help="minimum lod for calling a variant")
    parser.add_option("-k", "--column", dest="column",
                      type="int", default=1,
                      help="Column in the file with the bam or geli file path")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("", "--ssg", dest="useSSG",
                      action='store_true', default=False,
                      help="If provided, we'll use the GATK SSG for genotyping")
    parser.add_option("", "--ssg2b", dest="useSSG2b",
                      action='store_true', default=False,
                      help="If provided, we'll use 2bp enabled GATK SSG ")
    parser.add_option("-o", "--output", dest="output",
                      type="string", default='/dev/stdout',
                      help="x")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments: " + str(args))
    lines = [line.strip().split() for line in open(args[0])]
    nIndividuals = int(args[1])
    outputFile = open(OPTIONS.output, 'w')
    print >> outputFile, '#', ' '.join(sys.argv)
    # Pull the file-path column (1-based) out of each input line.
    data = map( lambda x: x[OPTIONS.column-1], lines )
    if os.path.splitext(data[0])[1] == '.bam':
        # BAM inputs must first be genotyped into geli files on the farm.
        gelis, jobids = bams2geli(data)
        if filter(lambda x: x <> 0, jobids) <> []:
            # there's still work to do
            sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
        # TODO: Should add a wait here for all farm jobs to finish...
        print 'gelis', gelis
        print 'jobids', jobids
    else:
        gelis = map( lambda x: x[OPTIONS.column-1], lines )
        jobids = [None] * len(gelis)
    print 'Geli files'
    print gelis
    for geli, jobid in zip(gelis, jobids):
        dbsnpFile = geli2dbsnpFile(geli)
        # NOTE(review): condition looks inverted — when dbsnpFile is None,
        # os.path.exists(None) raises; likely intended 'dbsnpFile != None'.
        if dbsnpFile == None and not os.path.exists(dbsnpFile):
            dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
            if jobid == 0: jobid = None
            farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    # TODO: Should add a wait here for all farm jobs to finish...
    # read in the dbSNP tracks
    nTotalSnps = 0
    nNovelSnps = 0
    for geli in gelis:
        root, flowcellDotlane, ext = picard_utils.splitPath(geli)
        #dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
        dbsnp_matches = geli2dbsnpFile(geli)
        print dbsnp_matches
        if os.path.exists(dbsnp_matches):
            TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
            nTotalSnps += int(TOTAL_SNPS)
            nNovelSnps += int(NOVEL_SNPS)
            print >> outputFile, '# DATA: ', flowcellDotlane, TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP, dbsnp_matches
    print >> outputFile, '# DATA: TOTAL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY', nTotalSnps
    print >> outputFile, '# DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
    print >> outputFile, '# DATA: AVERAGE DBSNP RATE ACROSS LANES %.2f' % (100.0 * float(nTotalSnps - nNovelSnps) / (max(nTotalSnps, 1)))
    # convert the geli's to text
    jobid = None
    variantsOut = gelis2gelisText( gelis )
    for geli, variantOut in zip(gelis, variantsOut):
        name = os.path.split(geli)[1]
        if not os.path.exists(variantOut):
            # GeliToText -> strip headers -> keep calls above the LOD cutoff,
            # prefixing each line with the source file name.
            cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
    # Concatenate + numerically sort all per-lane calls, then order by
    # reference contig order using sortByRef.pl.
    cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    sortedCallFile = 'all.sorted.calls'
    cmd = ("~/dev/GenomeAnalysisTK/trunk/perl/sortByRef.pl -k 1 tmp.calls ~/work/humanref/Homo_sapiens_assembly18.fasta.fai > %s" % ( sortedCallFile ) )
    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    if not OPTIONS.dry:
        # Aggregate per-lane calls into per-site SNP calls and emit the table.
        sortedCalls = [line.split() for line in open(sortedCallFile)]
        aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
        print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
        for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
            if snp == None: continue # ignore bad calls
            #print snp
            #sharedCalls = list(sharedCallsGroup)
            #genotype = list(sharedCalls[0][5])
            print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
    outputFile.close()
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,48 +0,0 @@
#!/usr/bin/env python
import subprocess, time, sys
# All 16 dinucleotide contexts; one R regression job is launched per context.
dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT")
#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work"
#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables"
# All input/output filenames are derived from the single positional argument.
fileroot = sys.argv[1]
covar_counts_root = fileroot+".covariate_counts"
parameter_root = fileroot+".log_reg"
recal_table = fileroot+".recalibration_table"
# Run all R process to do regression and wait for completion
kids = []
for dinuc in dinucs:
    kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc]))
    #kids.append(subprocess.Popen(["sleep", "8"]))
# Poll until every child has exited (poll() returns None while running).
while any(kid.poll() is None for kid in kids):
    time.sleep(0.25)
# Write the recalibration table header: 25 "p,q" coefficient columns.
fout = file(recal_table, "w")
fout.write("dinuc_v.1\t")
for p in range(5):
    for q in range(5):
        fout.write("%d,%d\t" % (p,q))
fout.write("\n")
# One row per dinuc: all parameters from its R output file, tab-joined.
for dinuc in dinucs:
    dinin = open(parameter_root+"."+dinuc+".parameters")
    #dinin.readline()
    params = []
    for line in dinin:
        line.rstrip("\n")  # NOTE(review): result discarded; harmless since split() handles the newline
        params.extend(map(float, line.split()))
    fout.write(dinuc+"\t")
    fout.write("\t".join(map(str, params)))
    fout.write("\n")

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
import os,sys
def exit(msg, errorlevel):
    """Print *msg* on stdout and terminate with exit status *errorlevel*.

    NOTE: intentionally shadows the builtin exit(); kept for compatibility
    with existing callers in this module. The py2-only 'print msg' statement
    is replaced by an equivalent explicit write (same bytes on stdout).
    """
    sys.stdout.write(msg + "\n")
    sys.exit(errorlevel)
def open_source_file(source_filename):
    """Open the named covariate-counts CSV for reading.

    Exits (status 1) when the file is unreadable or does not end in '.csv'.
    """
    if not os.access(source_filename, os.R_OK):
        # BUG FIX: report the actual argument, not sys.argv[1], so the message
        # is correct when this is called with a different filename.
        exit("Unable to read covariate counts file '" + source_filename + "'", 1)
    if not source_filename.endswith('.csv'):
        # BUG FIX: exit() requires (msg, errorlevel); the errorlevel was
        # missing here, raising TypeError instead of exiting cleanly.
        exit("Source file is in incorrect format. Must be csv.", 1)
    return open(source_filename, 'r')
def read_header(source_file):
    """Read the header from the given source file. Do basic validation.

    The first two comma-separated columns must be 'rg' and 'dn'; otherwise the
    program exits with status 1.
    """
    columns = source_file.readline().split(',')
    if columns[0] != 'rg' or columns[1] != 'dn':
        exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>",1)
    return columns
def create_intermediate_file(source_file,header,read_group,dinuc):
    """Open '<base>.<read_group>.<dinuc>.csv' (base = source name minus
    '.csv') for writing, seed it with the covariate columns of *header*
    (everything after rg/dn), and return the open file."""
    stem = source_file.name[:source_file.name.rfind('.csv')]
    out_path = "%s.%s.%s.csv" % (stem, read_group, dinuc)
    out = open(out_path, "w")
    out.write(','.join(header[2:]))
    return out
def open_target_file(target_filename):
    """Create the output parameters file, write its tab-separated header row
    ('rg', 'dinuc', then the 25 'p,q' coefficient columns), and return it."""
    target_file = open(target_filename, 'w')
    cells = ["rg\tdinuc"]
    for p in range(5):
        for q in range(5):
            cells.append("%d,%d" % (p, q))
    target_file.write("\t".join(cells) + "\t\n")
    return target_file
def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):
    """Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc.

    Runs the R logistic-regression script for one (read_group, dinuc) pair and
    appends the fitted parameters as one tab-separated row of *target_file*.
    Exits (status 1) when R fails or its output file is unreadable.
    """
    base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
    regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))
    result = os.system(regression_command)
    if result != 0:
        exit('Unable to run linear regression; command was %s, error code was %d' % (regression_command, result),1)
    parameters_filename = '.'.join((base,dinuc,'parameters'))
    if not os.access(parameters_filename,os.R_OK):
        # BUG FIX: exit() takes (msg, errorlevel); the errorlevel was missing
        # here, which raised a TypeError instead of exiting cleanly.
        exit("Unable to read output of R from file " + parameters_filename, 1)
    parameters_file = open(parameters_filename,'r')
    # Flatten all lines of R output into a single space-separated token list.
    parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
    target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
    # optionally remove the input and output files. For now, keep them around for debugging purposes.
    # os.remove('.'.join((base,dinuc,'csv')))
    # os.remove('.'.join((base,dinuc,'parameters')))
def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
    """Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results.

    Assumes the source CSV is already sorted/grouped by (read_group, dinuc):
    a new intermediate file is started whenever either key changes, and the
    previous group is regressed at that point (and once more after the loop
    for the final group).
    """
    source_file = open_source_file(source_filename)
    target_file = open_target_file(target_filename)
    header = read_header(source_file)
    intermediate_file = None
    read_group = None
    dinuc = None
    for data_line in source_file:
        data_line = data_line.strip()
        if len(data_line) == 0:
            continue
        data = data_line.split(',')
        if read_group != data[0] or dinuc != data[1]:
            # Key changed: finish and regress the previous group, then start
            # a fresh intermediate file for the new (read_group, dinuc).
            if intermediate_file:
                intermediate_file.close()
                process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
            read_group,dinuc = data[0:2]
            intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
        # Only the covariate columns (after rg/dn) go into the intermediate file.
        intermediate_file.write(','.join(data[2:])+'\n')
    # Flush and regress the final group.
    if intermediate_file:
        intermediate_file.close()
        process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
    source_file.close()
    target_file.close()
# Command-line entry point: logistic_regression <counts csv> <parameters csv>
if __name__ == "__main__":
    # set up sensible defaults for the locations of R and the logistic regression script
    R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
    logistic_regression_script="resources/logistic_regression.R"
    if len(sys.argv) < 3:
        exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
    compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)

View File

@ -1,78 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import ValidateGATK
import picard_utils
from MergeBAMsUtils import *
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
    """Group source BAMs into merge specs by population: each source path acts
    as its own superkey and is matched against the NAID -> population map."""
    pairs = [[source_path, source_path] for source_path in allSources]
    return groupSources(pairs, NAID2Pop, merged_filename_base)
if __name__ == "__main__":
    usage = "usage: %prog files.list [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="output_dir",
                      type="string", default="./",
                      help="Output directory")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    parser.add_option("-s", "--useSamtools", dest="useSamtools",
                      action='store_true', default=False,
                      help="If present, uses samtools to perform the merge")
    parser.add_option("-m", "--mergeBin", dest="mergeBin",
                      type="string", default=None,
                      help="Path to merge binary")
    parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
                      type="string", default=None,
                      help="Path to file contains NAID POP names. If provided, input files will be merged by population")
    parser.add_option("-p", "--pop", dest="onlyPop",
                      type="string", default=None,
                      help="If provided, only this population will be processed")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    directory = OPTIONS.output_dir
    if not os.path.exists(directory):
        os.mkdir(directory)
    NAID2Pop = None
    if OPTIONS.NAIDS2POP <> None:
        NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)
    # Each non-comment input line: <merged base name> <glob> [<glob> ...]
    for line in open(args[0]):
        s = line.split()
        # NOTE(review): s[0] <> '#' only skips lines whose first token is
        # exactly "#", not every line starting with '#'.
        if ( s <> [] and s[0] <> '#' ):
            merged_filename_base = s[0]
            # Expand all globs on the line into one flat list of source BAMs.
            allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] )
            print 'Merging info:'
            for spec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
                if OPTIONS.onlyPop <> None and spec.group() <> OPTIONS.onlyPop:
                    continue
                spec.setPath(directory)
                spec.pprint()
                jobid = None
                # Submit the merge, then the index job chained behind it.
                if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
                    output = spec.getMergedBase()
                    cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
                    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
                if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
                    jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -1,210 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import ValidateGATK
import picard_utils
import operator
# Default Picard merge tool; overridable via MergeFilesSpec.mergeCmd(mergeBin=...).
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
# Extensions for merged BAM outputs and their samtools indices.
bam_ext = 'bam'
bam_index_ext = 'bai'
def readNAIdMap(NAIdFile):
    """Read a whitespace-separated '<NAID> <population> ...' file and return a
    dict mapping NAID -> population. Asserts each NAID appears only once."""
    m = dict()
    for data in [line.split() for line in open(NAIdFile)]:
        naid, pop = data[0:2]
        print naid, ' => ', pop
        assert naid not in m
        m[naid] = pop
    print 'Read NAID->population map'
    print 'Contains', len(m), 'id -> population mappings'
    print 'Distinct populations:', picard_utils.unique(m.values())
    return m
_abbrevs = [
(1<<50L, 'P'),
(1<<40L, 'T'),
(1<<30L, 'G'),
(1<<20L, 'M'),
(1<<10L, 'k'),
(1, '')
]
def greek(size):
    """Return a string representing the greek/metric suffix of a size,
    e.g. 2048 -> '2.0k'. Sizes of 1 or less fall through to no suffix."""
    for cutoff, unit in _abbrevs:
        if size > cutoff:
            break
    return '%.1f%s' % (float(size) / cutoff, unit)
class MergeFilesSpec:
def __init__(self, sources, group, merged_filename_base, path = '' ):
self.sourceFiles = sources
self.groupName = group
self.merged_filename_base = merged_filename_base
self.path = ''
def __str__(self):
return 'MergeFilesSpec: ' + self.group()
def group(self):
return self.groupName
def sources(self):
return self.sourceFiles
def filename(self):
if self.merged_filename_base <> None:
return self.merged_filename_base + '.'.join([self.group()])
else:
return self.group()
def pprint(self):
print '--------------------------------------------------------------------------------'
print ' Population: ', self.group()
print ' Merged filename: ', self.getMergedBAM()
print ' N sources: ', len(self.sources())
for source in sorted(self.sources()):
print ' Source: ', source
print ' Sizes: ', self.sourceSizes(humanReadable=True)
print ' Est. merged size: ', greek(reduce(operator.__add__, self.sourceSizes(), 0))
def setPath(self, path):
self.path = path
def getMergedBase(self):
return os.path.join(self.path, self.filename())
def getMergedBAM(self):
return self.getMergedBase() + '.' + bam_ext
def getMergedBAMIndex(self):
return self.getMergedBase() + '.' + bam_ext + '.' + bam_index_ext
def sourceSizes(self, humanReadable=False):
sizes = map( os.path.getsize, self.sources() )
if humanReadable:
sizes = map(greek, sizes)
return sizes
def mergeCmd(self, mergeBin = None, MSD = True, useSamtools = False):
if mergeBin == None:
mergeBin = MERGE_BIN
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD, useSamtools = useSamtools)
def getIndexCmd(self):
return "samtools index " + self.getMergedBAM()
# Very general-purpose operation that returns merge specs given two lists of pairs:
# The first list, sources, contains superkey / sourceFile pairs.
# The second list, groups, contains key / group pairs.
#
# This function walks over all source pairs, and for each superkey, it
# looks for any key within groups contained within superkey. If it finds it,
# it associates sourceFile with the merge group in groups.
#
# The system requires that superkey match one (and only one) key in groups. It also
# requires that each group string be unique. The system can handle groups provided
# as a list of pairs of a dictionary.
#
# The function returns a list of MergeFileSpecs, one for each group with
# at least one associated sourceFile.
#
def groupSources(sources, groups, merged_filename_base):
    """Partition (superkey, sourceFile) pairs into MergeFilesSpecs keyed by
    group; see the contract described in the comment block above."""
    if groups == None:
        # No grouping requested: a single spec holding every source file.
        return [MergeFilesSpec(map( lambda x: x[1], sources), '', merged_filename_base)]
    else:
        specs = dict()
        if type(groups) == list: groups = dict(groups)
        for superkey, sourceFile in sources:
            spec = None
            # Substring match: any key contained in the superkey claims the file.
            for key, group in groups.iteritems():
                #print 'Examining', superkey, key, group
                if superkey.find(key) <> -1:
                    if group in specs:
                        spec = specs[group]
                    else:
                        # First file for this group: create its spec lazily.
                        spec = MergeFilesSpec([], group, merged_filename_base)
                        specs[group] = spec
                    print 'Mapping', group, key, superkey, sourceFile
                    spec.sourceFiles.append(sourceFile)
            if spec == None:
                # Superkey matched no key at all: hard error.
                sys.exit('File contains an unknown superkey: ' + superkey)
        # Deterministic output order: sort specs by group name.
        v = specs.values()
        v.sort(key = MergeFilesSpec.group)
        return v
import unittest
class TestMergeBAMsUtils(unittest.TestCase):
    """Unit tests for groupSources(): grouping geli files by population and by
    individual id, and the derived merged BAM / index filenames."""
    def setUp(self):
        import cStringIO
        # '<NAID> <population> <family>' rows (the family column is unused).
        groupsString = """NA10846 ceu CEPH1
NA10847 ceu CEPH1
NA12144 ceu CEPH1
NA12145 ceu CEPH1
NA12146 yri CEPH1
NA12239 yri CEPH1
NA07029 ceu CEPH1
NA07019 ceu CEPH1
NA06994 ceu CEPH1
NA07000 ceu CEPH1
NA07022 ceu CEPH1
NA07056 ceu CEPH1
NA07048 ceu CEPH1
NA06991 ceu CEPH1
NA07034 ceu CEPH1
"""
        # '<NAID> <flowcell> <lane> <type> <project> <geli file>' rows.
        lanesString = """NA10846 30GA9AAXX 1 Paired CEPH 30GA9AAXX.1.observed_genotypes.geli
NA10846 30GA9AAXX 6 Paired CEPH 30GA9AAXX.6.observed_genotypes.geli
NA10847 30GA9AAXX 7 Paired CEPH 30GA9AAXX.7.observed_genotypes.geli
NA12146 30JLTAAXX 2 Paired CEPH 30JLTAAXX.2.observed_genotypes.geli
NA12239 30PNVAAXX 1 Paired CEPH 30PNVAAXX.1.observed_genotypes.geli
NA12144 30PYMAAXX 1 Paired CEPH 30PYMAAXX.1.observed_genotypes.geli
NA12146 30PYMAAXX 7 Paired CEPH 30PYMAAXX.7.observed_genotypes.geli
"""
        # Expected number of lanes per individual for testIDGroups.
        self.laneIDCounts = dict([["NA10846", 2], ["NA10847", 1], ["NA12146", 2], ["NA12239", 1], ["NA12144", 1]])
        pops = [line.strip() for line in cStringIO.StringIO(groupsString)]
        lanes = [line.strip() for line in cStringIO.StringIO(lanesString)]
        print pops
        print lanes
        # [NAID, population] pairs and [NAID, geli] pairs for groupSources.
        self.ids2pop = [line.split()[0:2] for line in pops]
        self.ids2gelis = [[line.split()[0], line.split()[5]] for line in lanes]
        # Identity map NAID -> NAID: groups each individual with itself.
        self.ids2ids = dict([[line.split()[0]] * 2 for line in lanes])
    def testPopGroups(self):
        # Grouping by population should yield exactly ceu and yri specs.
        specs = groupSources(self.ids2gelis, self.ids2pop, 'foo')
        print 'Specs', specs
        self.assertEqual(len(specs), 2)
        self.assertEqual(specs[0].group(), 'ceu')
        self.assertEqual(specs[1].group(), 'yri')
        ceu = specs[0]
        yri = specs[1]
        #print ceu.sources()
        self.assertEqual(len(ceu.sources()), 4)
        self.assertEqual(len(yri.sources()), 3)
        self.assertEqual(ceu.getMergedBAM(), 'foo.ceu.bam')
        self.assertEqual(ceu.getMergedBAMIndex(), 'foo.ceu.bam.bai')
        self.assertEqual(yri.getMergedBAM(), 'foo.yri.bam')
        self.assertEqual(yri.getMergedBAMIndex(), 'foo.yri.bam.bai')
    def testIDGroups(self):
        # Grouping by individual id should yield one spec per distinct NAID.
        specs = groupSources(self.ids2gelis, self.ids2ids, 'foo')
        self.assertEqual(len(specs), 5)
        for spec in specs:
            print 'Spec', spec
            self.assertEqual(len(spec.sources()), self.laneIDCounts[spec.group()])
            self.assertEqual(spec.getMergedBAM(), 'foo.' + spec.group() + '.bam')
# Run the unit tests when executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -1,82 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from MergeBAMsUtils import *
def splitSourcesByKeys( bams, keys ):
    """Group the given bam files by their keys.

    Pairs each bam with its key and delegates to groupSources, using the
    identity mapping (key -> key) as the grouping table and no path root.
    """
    keyedBams = zip(keys, bams)
    identityGroups = [[k, k] for k in keys]
    return groupSources(keyedBams, identityGroups, None)
if __name__ == "__main__":
usage = """usage: %prog bams.list [options]
Merges BAM files by keys from a file of a list of bams.
bams.list is a whitespace separated file. One column (--keyCol arg) is the key, and another
column (--bamCol) is a path to a bam file. This program will group the bam files
by key and spawn merge and index jobs to merge all of the files sharing the same key together"""
parser = OptionParser(usage=usage)
parser.add_option("-q", "--farm", dest="farmQueue",
type="string", default=None,
help="Farm queue to send processing jobs to")
parser.add_option("-d", "--dir", dest="output_dir",
type="string", default="./",
help="Output directory")
parser.add_option("", "--dry", dest="dry",
action='store_true', default=False,
help="If provided, nothing actually gets run, just a dry run")
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
action='store_true', default=False,
help="Ignores already written files, if present")
parser.add_option("-m", "--mergeBin", dest="mergeBin",
type="string", default=None,
help="Path to merge binary")
parser.add_option("", "--MSD", dest="MSD",
action='store_true', default=False,
help="Merge sequence dictionaries?")
parser.add_option("", "--keyCol", dest="keyCol",
type=int, default=1,
help="Column in the list file holding the key")
parser.add_option("", "--bamCol", dest="bamCol",
type=int, default=2,
help="Column in the list file holding the bam file path")
parser.add_option("-l", "--link", dest="link",
action='store_true', default=False,
help="If true, program will soft link single bam files that don't need merging")
(OPTIONS, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
directory = OPTIONS.output_dir
if not os.path.exists(directory):
os.mkdir(directory)
bamsList = [line.strip().split() for line in open(args[0])]
keys = map( lambda x: x[OPTIONS.keyCol-1], bamsList )
bams = map( lambda x: x[OPTIONS.bamCol-1], bamsList )
print 'Merging info:'
for info in bamsList: print info
for spec in splitSourcesByKeys(bams, keys):
spec.setPath(directory)
spec.pprint()
jobid = None
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
output = spec.getMergedBase() + '.stdout'
if len(spec.sources()) == 1 and OPTIONS.link:
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
else:
cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
print cmd
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
#pass
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -1,154 +0,0 @@
#!/usr/bin/env python
# merges .tab files produced by EvalMapping.py into a tabular format
#from __future__ import print_function # python 2.6 / 3.0 print as a function
import os,sys,re
from rpy2 import *
def ordered_eval_files(aligner, mutation_type, file_ext):
    """Return (density, filename) pairs for matching files in the cwd.

    Matches files named '*.<aligner>.<file_ext>' whose name mentions
    *mutation_type* (or the 'None_den.' zero-mutation baseline).  The
    density is parsed from the 'den.<NN>_' part of the name; baseline
    files count as density 0.  Pairs are returned sorted by density.
    """
    suffix = "." + aligner + "." + file_ext

    def density(name):
        # Parse the density first (mirrors original failure on bad names),
        # then override for the zero-mutation baseline files.
        parsed = int(re.search(r'den\.(\d\d?)_', name).groups()[0])
        return 0 if "None_den" in name else parsed

    matches = [f for f in os.listdir(".")
               if f.endswith(suffix) and (mutation_type in f or "None_den." in f)]
    return sorted((density(f), f) for f in matches)
def anchor_hist(filename):
    """Build an R histogram of read anchor positions parsed from an eval file.

    Scans *filename* for whitespace-leading tokens containing '.read.'; the
    read position is the final '.'-separated field of that token and the
    offset is the piece between ':' and the following '.'.  Returns an rpy2
    histogram object (77 breaks, not plotted) or an empty IntVector when no
    matching reads are found.  Prints the number of reads seen.
    """
    fin = open(filename)
    offsets = []
    # NOTE(review): 'offsets' is collected but never used by any caller here.
    read_poses = []
    for line in fin:
        id = None
        fields = line.split()
        if len(fields) > 0 and ".read." in fields[0]:
            id = line.split()[0]
            read_poses.append(id.split(".")[-1])
            offsets.append(id.split(":")[1].split(".")[0])
        #if "read name :" in line:
        #    id = line.split()[3]
    import rpy2.robjects as robj
    if len(read_poses):
        print len(read_poses)
        return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False)
    else:
        print 0
        return robj.IntVector([])
def estimateCPUTime(filename):
    """Return the largest 'CPU time : X sec' value found in *filename*.

    Scans an LSF-style stdout log for lines of the form
    'CPU time : 12.34 sec' and returns the maximum as a float.
    Returns 0.0 when the file is missing or contains no CPU-time lines
    (previously max() was called on an empty list and raised ValueError).
    """
    times = []
    if os.path.exists(filename):
        regex = re.compile(r'CPU time\s*:\s+([\d\.]+) sec')

        def extractTime1(line):
            # Pull the numeric CPU time out of one log line, or None.
            sobj = regex.search(line)
            if sobj is not None:
                return float(sobj.group(1))
            return None

        # 'with' closes the handle (the original leaked it).
        with open(filename) as fh:
            times = [t for t in map(extractTime1, fh) if t is not None]
    return max(times) if times else 0.0
def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0):
    """Print and return one summary row for an eval/tab file.

    Selects the last table line whose leading quality cutoff stays above
    *min_qual_cutoff*, then prints aligner, mutation type, density, the
    selected line and the CPU time estimated from the matching '.stdout'
    log.  Returns the row as a list, or None when the file has at most a
    header line.
    """
    filebase = os.path.splitext(filename)[0]
    cpuTime = estimateCPUTime(filebase + '.stdout')
    lines = open(filename).readlines()
    data = None
    if len(lines) > 1:
        da_line = lines[0]
        for line in lines:
            line = line.rstrip()
            fields = line.split("\t")
            try:
                qual_cutoff = int(fields[0])
            except:
                # Non-numeric first column (e.g. a header): treat as +inf
                # so the line is kept as a candidate.
                qual_cutoff = 1000
            if qual_cutoff <= min_qual_cutoff:
                break
            else:
                da_line = line
        #print ("%2d\t" % num_var, file=sys.stderr),
        data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime]
        print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime)
    return data
def plot_anchor_hists():
    """Render a grid of anchor-position histograms to lotto_histo.png via rpy2.

    NOTE(review): reads the module-global 'mut_type' (set in __main__) and
    hard-codes four aligner/cutoff pairs -- confirm before reuse.
    """
    import rpy2.robjects as robj
    #robj.r.par( mfrow = robj.RVector([2,2]) )
    #robj.r('X11(width=24, height=15)')
    robj.r('png("lotto_histo.png", width=1500, height=900)')
    robj.r('par(mfrow=c(4,6 ), cin=c(3,3))')
    for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)):
        print aligner, ">"+str(cutoff)
        num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
        #print (*num_file, sep="\n")
        for num, file in num_file:
            # One histogram panel per (aligner, insert-size) eval file.
            h = anchor_hist( file )
            title = "{aligner} {num} bp insert".format(aligner=aligner, num=num)
            robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90]))
    robj.r("dev.off()")
    #raw_input("Press any key to exit...")
def analyzeData( data, mut_type ):
    """Analyze merged eval rows for one mutation type (mostly scaffolding).

    *data* is a list of rows produced by print_eval_tables.  Only the
    helper definitions exist so far; neither selectData nor placeRatePlot
    is invoked, and the function returns None.
    """
    import rpy2.robjects as robj
    # print name / data name
    aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')]

    def selectData( key ):
        # Keep only rows for this aligner key and the current mutation type.
        def keepDataP( row ):
            return row[0] == key and row[1] == mut_type
        # Fixed: the original filtered with the misspelled name 'keyDataP',
        # which raised NameError whenever selectData was called.
        return filter( keepDataP, data )

    def placeRatePlot():
        # NOTE(review): 'mutDensity' and 'placeRate' are undefined here --
        # this plotting stub would raise NameError if ever invoked.
        robj.r('png("x.png", width=1500, height=900)')
        x = robj.FloatVector(mutDensity)
        y = robj.FloatVector(placeRate)
        robj.r.plot(x, y, xlab="Mutation density / size", col="blue")
        robj.r("dev.off()")
        return 0
if __name__ == "__main__":
if len(sys.argv) < 2:
print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)"
sys.exit(1)
else:
mut_type = sys.argv[1]
if False:
plot_anchor_hists()
else:
data = []
for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)):
name = aligner + ">" + str(cutoff)
#print (aligner, ">"+str(cutoff))
num_file = ordered_eval_files(aligner, mut_type, file_ext="tab")
#num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
for num, file in num_file:
datum = print_eval_tables( name, mut_type, num, file, cutoff )
if datum <> None:
data.append(datum)
#print
analyzeData( data, mut_type )

View File

@ -1,221 +0,0 @@
#!/usr/bin/env python
# Executable files. Please update these to match the installed locations of your tools.
samtools_exe='/seq/dirseq/samtools/current/samtools'
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
# Any special site-specific arguments to pass the JVM.
jvm_args='-ea'
# Which platforms should the calibration tool be run over?
platforms=['illumina']
# Where to put the output created as part of recalibration.
# If editing, please end this variable with a trailing slash.
output_root = './'
# Location of the resource files distributed with the recalibration tool.
# If editing, please end this variable with a trailing slash.
resources='recalibratorResources/'
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'

import getopt,glob,os,sys
import LogisticRegressionByReadGroup

# if the provided resources path is relative, convert it to a path
# relative to the calling script rather than the cwd.
if not os.path.isabs(resources):
    application_path = os.path.dirname(sys.argv[0])
    resources = os.path.abspath(application_path) + '/' + resources
# Where does the reference live?
reference_base = resources + 'Homo_sapiens_assembly18'
reference = reference_base + '.fasta'
reference_dict = reference_base + '.dict'
reference_fai = reference_base + '.fasta.fai'
# Where does DBSNP live?
dbsnp = resources + 'dbsnp_129_hg18.rod'
# Where are the application files required to run the recalibration?
#gatk = resources + 'gatk/GenomeAnalysisTK.jar'
# NOTE(review): the bundled GATK path is commented out in favor of a
# hard-coded developer build -- confirm before distribution.
gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar'
logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
# Assemble the platform list into command-line arguments.
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
def output_file_needs_update(output_file, input_files = []):
    """Decide whether *output_file* must be (re)generated.

    True when the module-global reprocessFiles flag forces a rebuild, or
    when the output file does not exist yet; False otherwise.
    NOTE(review): *input_files* is accepted but never consulted -- no
    timestamp comparison is performed.
    """
    if reprocessFiles:
        # Forced reprocessing mode rebuilds everything.
        return True
    return not os.path.exists(output_file)
def executeCommand(cmd):
    """Run *cmd* through os.system, or just echo it when dryRun is set.

    Returns the os.system exit status, or 0 in dry-run mode.
    """
    if not dryRun:
        return os.system(cmd)
    else:
        print ' => Would execute', cmd
        return 0
def exit(msg,errorcode):
    # Print *msg* and terminate the process with *errorcode*.
    # NOTE(review): shadows the exit() builtin throughout this script.
    print msg
    sys.exit(errorcode)
def check_input_file_available(filename,description):
    """Abort with an error message unless *filename* is readable.

    *description* is a human-readable label used in the failure message.
    """
    readable = os.access(filename, os.R_OK)
    if not readable:
        exit('Unable to access %s %s' % (description, filename), 1)
def graph_file(graph_script,graph_data):
    'Graph the given data using the given script. Leave the data in the output directory.'
    # Verify both inputs up front so Rscript failures are not silent.
    check_input_file_available(graph_script,'%s R graphing script' % graph_script)
    check_input_file_available(graph_data,'%s graphing data' % graph_data)
    # Invoke Rscript on the data file; abort on a non-zero exit status.
    result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
    if result != 0:
        exit('Unable to graph data: %s' % graph_data,1)
def recalibrate():
    """Run the three recalibration stages on the global 'bam':
    (1) count covariates with GATK, (2) fit the logistic regression in R,
    (3) apply the regression and index the calibrated BAM.
    Each stage is skipped when its output already exists (see
    output_file_needs_update)."""
    # generate the covariates
    print 'generating covariates'
    linear_regression_results = output_dir + 'linear_regression_results.out'
    if output_file_needs_update(linear_regression_results):
        generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        returncode = executeCommand(generate_covariates)
        if returncode != 0:
            exit('Unable to generate covariates',1)
        # compute the logistic regression
        if not dryRun:
            print 'computing the logistic regression'
            LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
    else:
        print 'Logistic recalibration data files already generated', linear_regression_results
    if output_file_needs_update(calibrated_bam):
        # apply the logistic regression, writing the output data to calibrated_bam
        print 'applying the correction to the reads'
        apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
        returncode = executeCommand(apply_logistic_regression)
        if returncode != 0:
            exit('Unable to apply logistic regression',1)
    else:
        print 'Recalibrated BAM file already generated', calibrated_bam
    # index the calibrated bam
    if output_file_needs_update(calibrated_bam + '.bai'):
        print 'indexing the calibrated bam'
        index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
        returncode = executeCommand(index_calibrated_bamfile)
        if returncode != 0:
            exit('Unable to index calibrated bamfile',1)
    else:
        print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'
    print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
def evaluate():
    'Evaluate recalibration results.'
    print 'Evaluating recalibration results'
    recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
    if output_file_needs_update(recalibrated_regression_results):
        # regenerate the covariates
        regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        print 'regenerating covariates'
        returncode = executeCommand(regenerate_covariates)
        if returncode != 0:
            exit('Unable to regenerate covariates',1)
        # Graph the empirical-vs-reported quality CSVs from before and
        # after recalibration for visual comparison.
        print 'graphing initial results'
        for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
        print 'graphing final results'
        for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
    else:
        print 'Evaluated files already generated', recalibrated_regression_results
def usage():
    # Print invocation help and terminate with an error status.
    exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
# Try to parse the given command-line arguments.
try:
    opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot='])
except getopt.GetoptError, err:
    usage()
# An input and an output file is required. Fail if left unspecified.
if len(args) < 2:
    usage()
# Determine whether to evaluate / recalibrate.
recalibrate_requested = False
evaluate_requested = False
reprocessFiles = False
dryRun = False
for opt,arg in opts:
    if opt == '--recalibrate':
        recalibrate_requested = True
    if opt == '--evaluate':
        evaluate_requested = True
    if opt == '--reprocess':
        reprocessFiles = True
    if opt == '--dry':
        dryRun = True
    if opt == '--outputRoot':
        output_root = arg
# Default to 'recalibrate' unless the user specified only evaluate.
do_recalibration = not (evaluate_requested and not recalibrate_requested)
# Only evaluate if the user specifically requested evaluation.
do_evaluation = evaluate_requested
print 'Looking for supplemental files in ' + resources
# check that the input bam file exists, and that the bam is indexed.
bam = args[0]
bam_index = bam + '.bai'
check_input_file_available(bam,'reads file')
check_input_file_available(bam_index,'reads index file')
# parse the user's calibration output file requirements
calibrated_bam = args[1]
calibrated_bam_index = calibrated_bam + '.bai'
# check that the fasta and supporting files are available
check_input_file_available(reference,'reference file')
check_input_file_available(reference_dict,'reference dictionary')
check_input_file_available(reference_fai,'reference index file')
# check that the dbsnp is available
check_input_file_available(dbsnp,'dbsnp file')
# sanity check that the software is available
check_input_file_available(samtools_exe,'samtools')
check_input_file_available(java_exe,'java')
check_input_file_available(R_exe,'R')
check_input_file_available(gatk,'Genome Analysis Toolkit')
check_input_file_available(logistic_regression_script,'logistic regression script')
# make an output directory for temporary files, named after the input bam.
output_dir=output_root+'output.' + bam[bam.rfind('/')+1:bam.rfind('.bam')] + '/'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
if not os.path.isdir(output_dir):
    exit('Unable to create output directory ' + output_dir,1)
# assemble the required program arguments shared by every GATK invocation.
gatk_base_cmdline = ' '.join((java_exe,jvm_args,'-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))
if do_recalibration:
    recalibrate()
if do_evaluation:
    evaluate()

View File

@ -1,237 +0,0 @@
import string
import operator
from pushback_file import pushback_file
# Specification table for SAM header records: one entry per
# (record type, tag) with whether it is required, its default, and a
# human-readable description.
# Type  TAG  Req?  Default Description
SAMHeaderFields = \
[['HD', 'VN', True, '0.1.0', 'File format version'], \
 ['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \
 ['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \
 ['SQ', 'LN', True, None, 'Sequence length'], \
 ['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \
 ['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \
 ['SQ', 'UR', False, None, 'URI of the sequence'], \
 ['SQ', 'SP', False, None, 'Species'], \
 ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \
 ['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \
 ['RG', 'LB', False, None, 'Library'], \
 ['RG', 'DS', False, None, 'Description'], \
 ['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \
 ['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \
 ['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \
 ['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \
 ['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \
 ['PG', 'ID', True, 'ProgramA','Program name'], \
 ['PG', 'VN', False, None, 'Program version'], \
 ['PG', 'CL', False, None, 'Command line']]
def SAMHeaderEncode( type, tag ):
    """Build the 'TYPE.TAG' key used to index SAM header metadata."""
    return '.'.join((type, tag))
# Index every (type, tag) spec by its 'TYPE.TAG' key for O(1) lookup of
# [required?, default, description].
SAMHeaderDict = dict()
for record in SAMHeaderFields:
    type, tag, req, default, desc = record
    SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]
# -----------------------------------------------------------------------------------------------
#
# A SAM header is a potentially complex object, so we just punt on it. The only really required
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
# data structure (defined above). All required fields are initialized to their default values
#
# -----------------------------------------------------------------------------------------------
class SAMHeader(dict):
    """A SAM file header, stored as a flat 'TYPE.TAG' -> value table.

    All fields from SAMHeaderFields are initialized to their defaults;
    the caller supplies the mandatory SQ sequence name and length, and
    may override any other field via 'TYPE.TAG' keyword keys.
    """
    def __init__(self, seqName, seqLen, **keys):
        # setup initial header: every known field starts at its default
        self.fields = dict()
        for record in SAMHeaderFields:
            type, tag, req, default, desc = record
            self.setField( type, tag, default )
        self.setField('SQ', 'SN', seqName )
        self.setField('SQ', 'LN', seqLen )
        # add keyword arguments, keyed as 'TYPE.TAG'
        # (items() instead of the Python-2-only iteritems())
        for key, value in keys.items():
            type, tag = key.split('.')
            self.setField( type, tag, value )
    def isReq( self, type, tag ):
        """True when the SAM spec marks this field as required."""
        return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]
    def setField( self, type, tag, value ):
        self.fields[SAMHeaderEncode(type, tag)] = value
    def getField( self, type, tag ):
        return self.fields[SAMHeaderEncode(type, tag)]
    def __str__( self ):
        # Emit one '@TYPE' line per record type, with only the required
        # tag:value pairs (tab separated).
        types = ['HD', 'SQ', 'RG']
        def formatType( type ):
            s = '@' + type + '\t'
            for record in SAMHeaderFields:
                qtype, tag, req, default, desc = record
                if type == qtype and self.isReq(type, tag):
                    value = self.getField(type, tag)
                    if value == None:
                        # Fixed: previously raised the undefined name
                        # 'Error' (a NameError) instead of a real exception.
                        raise ValueError('Unset required SAM header ' + type + ' ' + tag)
                    s += tag + ':' + str(value) + '\t'
            return s
        # '\n'.join works on both Python 2 and 3 (was string.join).
        return '\n'.join(map( formatType, types ))
# SAM alignment FLAG bits (column 2 of an alignment record).
SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair
SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped
SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1
SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand)
SAM_MATESTRAND = 0x0020 # strand of the mate 1
SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2
SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2
SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records)

# Map from each flag bit to its description, used by SAMFlagsDescs.
SAM_FLAGS = {
    SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair',
    SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1',
    SAM_UNMAPPED : 'the query sequence itself is unmapped',
    SAM_MATEUNMAPPED : 'the mate is unmapped 1',
    SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' ,
    SAM_MATESTRAND : 'strand of the mate 1' ,
    SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' ,
    SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2',
    SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'
}
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
    """Construct a SAMRecord directly from individual field values."""
    record = SAMRecord()
    record.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals,
                              pairContig, pairPos, insertSize )
    return record
def SAMRecordFromString( str ):
    """Construct a SAMRecord by parsing one SAM text line."""
    record = SAMRecord()
    record.setValuesFromString( str )
    return record
def SAMFlagValue( flags, testFlag ):
    """Return the bits of *testFlag* that are set in *flags* (0 when none)."""
    return flags & testFlag
def SAMFlagIsSet( flags, testFlag ):
    """True when any bit of *testFlag* is set in *flags*.

    Fixed: replaced the removed '<>' operator with '!=' (and inlined the
    bit mask) so the predicate is valid on both Python 2 and 3.
    """
    return (flags & testFlag) != 0
def SAMFlagsDescs( flags ):
    """Return [flagBit, isSet, description] triples for every SAM flag,
    sorted by flag bit value, describing which bits of *flags* are set."""
    def keepMe(p):
        flagKey, flagDesc = p
        return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]
    return sorted(map( keepMe, SAM_FLAGS.iteritems() ))
# -----------------------------------------------------------------------------------------------
#
# This is really the meat of the SAM I/O system.
#
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion
# of int quals to ASCII encoded quals.
#
# -----------------------------------------------------------------------------------------------
class SAMRecord:
    """One SAM alignment record, stored as a list of fields in SAM column
    order: qname, flags, rname, pos, mapq, cigar, mate contig, mate pos,
    insert size, sequence, ASCII-33 qualities."""
    def __init__( self, vals = []):
        # NOTE(review): the 'vals' argument is accepted but ignored; use
        # setValuesFromArgs / setValuesFromString to populate the record.
        self.vals = []
    def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
        # Store fields in SAM column order; integer quals are encoded to
        # ASCII-33 and the sequence is lower-cased.
        self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \
            cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]
    def setValuesFromString( self, line ):
        #print 'line is', line
        # Parse one whitespace-separated SAM line, converting each column
        # with its expected type; extra trailing columns are dropped.
        formats = [str, int, str, int, int, str, str, int, int, str, str]
        self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )
    def formatFlags( self, flags ):
        # OR an iterable of SAM_* bit constants into one integer flag word.
        b = reduce( operator.__or__, flags, 0 )
        #print 'FormatFlags', flags, b
        return b
    def getQName(self): return self.vals[0]
    def getFlags(self): return self.vals[1]
    def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)
    def getPos(self): return self.vals[3]
    def getMapq(self): return self.vals[4]
    def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"
    def getSeq(self): return self.vals[9]
    def getQuals(self): return self.fromASCII33(self.vals[10])
    def toASCII33( self, quals ):
        # Encode integer qualities to the SAM ASCII-33 string form.
        return string.join( map( lambda q: chr(q+33), quals ), '' )
    def fromASCII33( self, qualStr ):
        # Decode an ASCII-33 quality string back to a list of integers.
        return map( lambda q: ord(q)-33, qualStr)
    def __str__( self ):
        # Tab-separated SAM text form of the record.
        return string.join( map( str, self.vals ), '\t')
# -----------------------------------------------------------------------------------------------
#
# Wrapper class for reading and writing records to a SAM file
#
# -----------------------------------------------------------------------------------------------
class SAMIO:
    """Minimal reader/writer for SAM text files.

    Pass header=None to open *fileName* for reading (the '@' header lines
    are consumed immediately); pass a SAMHeader to open it for writing
    (the header is written immediately).  *func* is applied to every
    record produced during iteration.
    """
    def __init__( self, fileName, header = None, debugging = False, func = None ):
        self.header = header
        self.debugging = debugging
        if func == None:
            func = lambda x: x
        self.func = func
        if self.header == None:
            # Read mode: pushback_file lets readHeader un-read the first
            # non-header line.
            self.fileHandle = pushback_file(fileName, "r")
            self.readHeader()
        else:
            self.fileHandle = file(fileName, "w")
            self.writeHeader()
    def close(self):
        self.fileHandle.close()
    def readHeader(self):
        """Accumulate leading '@' lines into self.header; push the first
        non-header line back onto the stream."""
        self.header = ""
        while True:
            line = self.fileHandle.readline()
            if line.startswith("@"):
                self.header += line
            else:
                self.fileHandle.pushback(line)
                if self.debugging:
                    pass #print str(self.header)
                return True
        # NOTE(review): unreachable -- the loop above always returns True.
        return self.header
    def RecordGenerator( self ):
        # Yield func(record) for each remaining data line in the file.
        for line in self.fileHandle:
            #print 'Reading line', line
            line = line.rstrip("\n")
            yield self.func(SAMRecordFromString(line))
        raise StopIteration
        return
    def __iter__(self):
        return self.RecordGenerator()
    # NOTE(review): this iterator-accepting overload is shadowed by the
    # single-record writeRecord defined right below it (Python keeps only
    # the last definition), so it is dead code.
    def writeRecord( self, recordIterator ):
        for record in recordIterator:
            self.writeRecord(record)
    def writeRecord( self, record ):
        # Write one record as a SAM text line.
        if self.debugging:
            print record
        print >> self.fileHandle, record
        return True
    def writeHeader( self ):
        if self.debugging:
            print str(self.header)
        print >> self.fileHandle, str(self.header)
        return True

View File

@ -1,304 +0,0 @@
#!/util/bin/python
#
#
# This file really needs to be cleaned up and functions need to be improved
#
# (1) utility operations should be moved to a utility file
# (2) Need to add support for windows paired reads, information about where
# reads come from (which file), and n-base pair loci
# (3) Move to C++ based SAM/BAM i/o system
# (4) Figure out a way to get the reference to stream, and use a automatically pickled version
# (5) Add capability to walk over all reference bases, regardless of coverage. Currently
# we only walk over covered bases.
#
#
from optparse import OptionParser
import os
import sys
import random
import string
import os.path
import heapq
from itertools import *
from SAM import *
import SimpleSAM
#import SimulateReads # utility functions should be imported
# Global variable containing command line arguments
OPTIONS = None
#
# Trivial class that added pushback support to a stream
#
class peakStream(object):
    """A simple wrapper around a stream that adds an unget operation.

    Elements pushed back with unget() are returned by next() in FIFO
    order before the underlying stream is consumed again.
    """
    def __init__( self, stream ):
        self.ungot = []   # pushback buffer, consumed from the front
        self.s = stream
    def unget( self, elt ):
        """Put elt at the front of the underlying stream, making the next get call return it"""
        self.ungot.append(elt)
    # we hold the next() operation
    def __iter__( self ): return self
    def next(self):
        """For python iterator support.

        Fixed: 'self.ungot <> []' used the removed '<>' operator, and
        'self.s.next()' only worked on Python 2 iterators; next(self.s)
        works on both.
        """
        if self.ungot:
            return self.ungot.pop(0)
        return next(self.s)
    # Python 3 iterator protocol alias (harmless on Python 2).
    __next__ = next
#
# Returns a single, combined stream of SAM records in chromosome order from a list of
# input streams, each in reference order
#
def rawReadStream( ref, inputs ):
    """Returns a single iterable that returns records in reference order
    from each of the input streams"""
    def includePriorities( stream ):
        # Decorate each record with its position so heapq.merge can order
        # the (already individually sorted) input streams.
        return imap( lambda x: (x.getPos(), x), stream )
    def prunePriorities( stream ):
        # Strip the priority and convert each SAM record to a MappedRead.
        return imap( lambda p: SimpleSAM.MappedReadFromSamRecord(ref, p[1]), stream )
    with_priorities = map( includePriorities, inputs )
    return prunePriorities( heapq.merge( *with_priorities ) )
#
# Just wraps the raw read stream objects in a list as they come out
#
def readStream( ref, inputs ):
    """Returns a single iterable that returns SAM records in reference order
    from each of the input streams"""
    rs = rawReadStream( ref, inputs )
    # Each element is wrapped in a one-item list so record walkers receive
    # the same *args calling convention as loci walkers.
    return imap( lambda x: [x], rs )
#
# More complex stream object. Takes a list of input streams and creates a stream
# returning successive loci covered by the reads in the combined input stream.
#
class lociStream():
    """A streaming data structure that returns reads spanning each covered loci in
    the input reads, offsets into them
    where the bases are equivalent, and the position of the locus.
    """
    def __init__( self, ref, inputs ):
        self.ref = ref
        self.rs = peakStream(rawReadStream( ref, inputs ))
        self.window = []   # reads overlapping the current position
        self.chr = None    # current contig name
        self.pos = None    # next position to emit within self.chr
    def __iter__(self):
        return self
    def pushRead( self, read ):
        self.window.append(read)
    def cleanWindow( self, chr, pos ):
        """Walk over the window of reads, deleting those who no longer cover the current pos"""
        if OPTIONS.debug:
            print 'cleanWindow start:', chr, pos, self.window
        def keepReadP( read ):
            return read.chr == chr and pos >= read.start and pos <= read.end
        self.window = filter( keepReadP, self.window )
        if OPTIONS.debug:
            print 'cleanWindow stop:', chr, pos, self.window
    def expandWindow( self, chr, pos ):
        """Keep reading from the read stream until we've added all reads from the stream covering pos"""
        if OPTIONS.debug:
            print 'expandWindow start:', chr, pos, self.window
        for read in self.rs:
            #print 'read', read, pos
            if read.chr == chr and read.start <= pos and read.end >= pos:
                self.pushRead(read)
            else:
                # First read past pos: put it back for the next locus and
                # stop -- the merged stream is position sorted.
                self.rs.unget( read )
                #self.rs = chain( [read], self.rs )
                break
        if OPTIONS.debug:
            print 'expandWindow stop:', chr, pos, self.window
    #
    # This is the workhorse routine. It constructs a window of reads covering the
    # next locus in the genome, and returns the reference, the contig, its position
    # in the genome, along with a vector of offsets into the list of reads that denote
    # the equivalent base among all the reads and reference.
    #
    def next(self):
        if self.pos <> None:
            self.cleanWindow(self.chr, self.pos) # consume reads not covering pos
            self.expandWindow(self.chr, self.pos) # add reads to window covering pos
        if self.window == []:
            # the window is empty, we need to jump to the first pos of the first read in the stream:
            nextRead = self.rs.next()
            self.pushRead( nextRead )
            self.chr = nextRead.chr
            self.pos = nextRead.start
            return self.next()
        else:
            # at this point, window contains all reads covering the pos, we need to return them
            # and the offsets into each read for this loci
            def calcOffset( read ):
                offset = self.pos - read.start
                return offset
            offsets = map(calcOffset, self.window)
            currPos = self.pos
            self.pos += 1 # we are going to try to get the next position
            return self.ref, self.chr, currPos, offsets, self.window
#
# Reference reader
#
def readRef(referenceFasta):
    """Parse a FASTA file into a {header: lowercased sequence} dict.

    Headers are the '>' lines with the marker and surrounding whitespace
    stripped; sequence lines for one header are concatenated and
    lower-cased.  Fixes: an empty or headerless file previously stored a
    bogus entry under the key None, and a blank line crashed on line[0].
    """
    ref = {}
    header = None
    seq = ''
    for line in open(referenceFasta):
        if line.startswith('>'):
            # Flush the previous record, if any.
            if header is not None:
                ref[header] = seq.lower()
            seq = ''
            header = line[1:].strip()
        else:
            seq += line.strip()
    # Flush the final record; skip when no header was ever seen.
    if header is not None:
        ref[header] = seq.lower()
    #print ref
    return ref
#
# Main() procedure
#
def main():
    """Drive a walker over one or more SAM files.

    Parses options, dynamically imports the walker class named by the
    first positional argument, builds either a per-record or per-locus
    stream over the inputs, and folds every element through the walker's
    filter/map/reduce functions.
    """
    global OPTIONS, ROOT
    # ------------------------------------------------------------
    #
    # Setup command line options
    #
    # ------------------------------------------------------------
    usage = "usage: %prog [options] Walker [sam or bam file or file list]+"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                      type="string", default=None,
                      help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-m", "--maxRecords", dest="maxRecords",
                      type=int, default=None,
                      help="Max number of SAM records to process")
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                      action='store_true', default=False,
                      help="Verbose debugging output?")
    (OPTIONS, args) = parser.parse_args()
    print 'args', args
    if len(args) < 2:
        parser.error("Incorrect number of arguments")
    # ------------------------------------------------------------
    #
    # Dynamically load the walker class (the first cmd line arg)
    # and initialize a walker class object
    #
    #
    # load walkers from standard location
    #
    # ------------------------------------------------------------
    # Convention: the module and the class it contains share the same name.
    walkerName = args[0]
    walkerModule = __import__(walkerName, globals(), locals(), [], -1)
    walkerClass = walkerModule.__dict__[walkerName]
    walker = walkerClass()
    print walkerName
    print walkerModule
    print walkerClass
    print walker
    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print ' ref: ', OPTIONS.reference
    print ' walker: ', walker
    if walker.hasOption('name'):
        print ' -> ', walker.getOption('name')
    if walker.hasOption('desc'):
        print ' -> ', walker.getOption('desc')
    print '------------------------------------------------------------'
    # ------------------------------------------------------------
    #
    # Initialize the engine
    #
    # ------------------------------------------------------------
    # read the reference
    refs = readRef(OPTIONS.reference)
    # create the low-level SAMIO streams
    files = args[1:]
    inputs = map( lambda file: SAMIO( file, debugging=OPTIONS.debug ), files )
    # build the higher level stream object, either by record or by loci
    if walker.isRecordWalker():
        stream = readStream( refs, inputs )
    if walker.isLociWalker():
        stream = lociStream( refs, inputs )
    # ------------------------------------------------------------
    #
    # Move the walker object over the stream
    #
    # For each element in the record or loci stream, invoke the walker
    # object on it. Use filter, map, and reduce to construct the sum
    #
    # ------------------------------------------------------------
    sum = walker.reduceDefault
    counter = 0
    for elt in stream:
        counter += 1
        #print 'processing elt', counter, elt, OPTIONS.maxRecords
        if OPTIONS.maxRecords <> None:
            # Progress report every 10% of maxRecords, then stop at the cap.
            if (counter * 10) % OPTIONS.maxRecords == 0:
                print ' => ', (counter * 100.0) / OPTIONS.maxRecords, '% done'
            if counter > OPTIONS.maxRecords:
                break
        if walker.filterfunc( *elt ):
            x = walker.mapfunc( *elt )
            sum = walker.reducefunc( x, sum )
    print 'Result is', sum
if __name__ == "__main__":
    # Debug toggle: flip to False to run main() without the profiler.
    if True:
        # Profile main() and report the 20 most time-consuming functions,
        # writing raw profile data to the file 'prof'.
        import cProfile, pstats
        cProfile.run('main()', 'prof')
        stats = pstats.Stats('prof')
        stats.sort_stats('time')
        stats.print_stats(20)
    else:
        main()

View File

@ -1,31 +0,0 @@
#!/util/bin/python
#from SamWalk import *
from SimpleSAM import *
import Walker
class SamWalkTest(object, Walker.Walker):
    """Minimal example walker that counts SAM records.

    filterfunc keeps every read, mapfunc maps each read to 1, and reducefunc
    sums the mapped values, so walking a stream with this class yields the
    total number of records.

    NOTE(review): the base-class order (object, Walker.Walker) raises an MRO
    TypeError if Walker.Walker is itself a new-style class -- confirm against
    the Walker module.
    """
    def __init__(self):
        # 'byRecord' walker: the engine feeds one SAM record at a time.
        Walker.Walker.__init__( self, 'byRecord', {}, name = 'SamWalkTest' )
    def filterfunc( self, read ):
        """Return true if you want to keep the read for mapping"""
        return True
        #return datum.getPos() % 2 == 0
    def mapfunc( self, read ):
        """Do something to the read, and return a result for reduction"""
        #print datum
        return 1
        #return len(datum.getSeq())
        #return datum.getSeq() + '/'
        #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())
    #reduceDefault = ''
    def reducefunc( self, x, sum ):
        """Take the result of mapping, and previous reduce results in sum, and combine them"""
        #print x
        return x + sum

View File

@ -1,69 +0,0 @@
from SAM import *
def read2align( refSeq, readSeq, chr, pos, cigar ):
    """Build the (reference, read) alignment-string pair for a mapped read.

    Looks up the contig named *chr* in *refSeq*, slices out the stretch of
    reference bases covered by *readSeq* starting at *pos*, and returns that
    slice together with the read bases.  *cigar* is currently ignored.

    NOTE: only correct for reads without indels -- the reference window is
    taken to be exactly len(readSeq) bases long.
    """
    span = len(readSeq)
    referenceWindow = refSeq[chr][pos:pos + span]
    return referenceWindow, readSeq
def MappedReadFromSamRecord(ref, r):
    """Convert a SAM record *r* into a MappedRead object.

    Temporary shim while waiting for Ted's C++ SAM/BAM io system to come
    online.

    BUG FIX: the original passed the bound method ``r.getCigar`` (missing
    call parentheses) to read2align instead of the CIGAR string itself; the
    accessor is now called.
    """
    refAlign, readAlign = read2align(ref, r.getSeq(), r.getRname(), r.getPos(), r.getCigar())
    return MappedRead( r.getQName(), r.getFlags(), r.getRname(), r.getPos(), r.getMapq(),
                       r.getCigar(), r.getSeq(), refAlign, readAlign, r.getQuals() )
class MappedRead(object):
    """Higher level object representing a mapped read.
    Attempts to be more accessible than a raw SAM record, doing a lot of processing on the record w.r.t.
    the reference to make analysis of the reads easier. This is the fundamental data structure
    used to represent reads through the walker engine"""
    def __init__( self, qname, flags, chr, pos, mapq, cigar, readBases, refAlign, readAlign, qualsByBase ):
        # basic information about the read
        self.qname = qname
        self.mapq = mapq
        self.cigar = cigar
        self.bases = readBases # actual bases of reads, unaligned, may be fw or reverse
        self.quals = qualsByBase # as a vector of floats, by base position
        # dealing with position information, more than a standard record Hasbro
        self.chr = chr
        self.start = pos
        self.readlen = len(readBases)
        # end is inclusive: last reference base covered by the alignment
        self.end = pos + len(refAlign) - 1
        # print 'refAlign', refAlign, len(refAlign)
        # dealing with stand
        # Decode the SAM flag bitfield into convenience booleans; the
        # SAM_* constants come from the SAM module.
        self.flags = flags
        # NOTE(review): forward == SAM_QUERYSTRAND set; in standard SAM the
        # strand bit set usually means *reverse* -- confirm against the SAM
        # module's constant definitions.
        self.isForwardStrand = SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        self.isReverseStrand = not SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        # NOTE(review): isPrimary is True when SAM_NOTPRIMARY is set, which
        # looks inverted -- verify intended semantics.
        self.isPrimary = SAMFlagIsSet(flags, SAM_NOTPRIMARY)
        self.isUnmapped = SAMFlagIsSet(flags, SAM_UNMAPPED)
        self.isPaired = SAMFlagIsSet(flags, SAM_SEQPAIRED)
        self.isMapPaired = SAMFlagIsSet(flags, SAM_MAPPAIRED)
        self.isFirstReadInPair = SAMFlagIsSet(flags, SAM_ISFIRSTREAD)
        self.isSecondReadInPair = SAMFlagIsSet(flags, SAM_ISSECONDREAD)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATEUNMAPPED)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATESTRAND)
        # everything will be reverse complemented already
        self.refAlign = refAlign #
        self.readAlign = readAlign # all should be forward strand
    def __str__(self):
        return '<MappedRead %s:%d-%d>' % (self.chr, self.start, self.end)
    __repr__ = __str__

View File

@ -1,754 +0,0 @@
#!/util/bin/python
from optparse import OptionParser
import os
import sys
#import set
import random
import string
from SAM import *
def all(iterable):
    """Compatibility shim for the all() builtin (pre-Python-2.5).

    True when every element of *iterable* is truthy; an empty iterable
    yields True.  Deliberately shadows the builtin of the same name.
    """
    for item in iterable:
        if item:
            continue
        return False
    return True
def any(iterable):
    """Compatibility shim for the any() builtin (pre-Python-2.5).

    True as soon as one truthy element is seen; an empty iterable yields
    False.  Deliberately shadows the builtin of the same name.
    """
    found = False
    for item in iterable:
        if item:
            found = True
            break
    return found
# Global options namespace, populated by OptionParser.parse_args() in main().
OPTIONS = None
# Canonical lowercase DNA alphabet accepted by the mutation simulator.
bases = set(['a', 't', 'g', 'c'])
class Mutation:
    """Representation of a change from one sequence to another.

    A mutation lives on a contig at a position and has one of three types:
    SNP, Insertion, or Deletion.  `original` and `mutated` hold the affected
    bases as strings (one side may be empty for indels).

    Examples:
        Mutation('chr20', 123, Mutation.SNP, 'a', 't')        a->t at 123
        Mutation('chr20', 123, Mutation.INSERTION, '', 'atg') insert 3bp
        Mutation('chr20', 123, Mutation.DELETION, 'atg', '')  delete 3bp

    FIX: replaced the Python-3-removed string.join(['-'] * n, '') idiom with
    the equivalent '-' * n in execute().
    """
    SNP = "SNP"
    INSERTION = "Insertion"
    DELETION = "Deletion"
    TYPES = set([SNP, INSERTION, DELETION])
    # CIGAR operation character for each mutation type.
    CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }
    def __init__( self, contig, pos, type, original, mutated ):
        self.contig = contig          # contig of the mutation
        self._pos = pos               # position of the change in contig coordinates
        self._type = type             # one of TYPES
        assert(self._type in Mutation.TYPES)
        self.original = original      # String of the original genotype
        self.mutated = mutated        # String of the mutated genotype
    def pos(self, offset=0):
        """Position of the mutation, optionally shifted left by *offset*."""
        return self._pos - offset
    def type(self): return self._type
    def __len__(self):
        """Number of bases affected (max of original/mutated lengths)."""
        return max( len(self.mutated), len(self.original) )
    def mutString( self ):
        """Short human-readable description: 'P:a->t', 'I:atg', or 'D:atg'."""
        if self._type == Mutation.SNP:
            return 'P:%s->%s' % (self.original, self.mutated)
        elif self._type == Mutation.INSERTION:
            return 'I:%s' % (self.mutated)
        elif self._type == Mutation.DELETION:
            return 'D:%s' % (self.original)
        else:
            raise Exception('Unexpected mutation type ' + self._type)
    def CIGARChar( self ):
        return Mutation.CIGARChars[self.type()]
    def __str__( self ):
        return '%s:%d %s' % (self.contig, self._pos, self.mutString())
    def __repr__( self ):
        return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())
    def execute( self, offset, refseq, mutseq ):
        """Apply this mutation in place to the aligned (refseq, mutseq) pair.

        Both sequences are mutable (list / MutableSeq) and are kept equal in
        length by padding with '-' where one side gains bases.  *offset* is
        the contig coordinate of the first base of the window.  Returns the
        (refseq, mutseq) pair.
        """
        p = self.pos(offset)
        if self.type() == Mutation.SNP:
            mutseq[p] = self.mutated
        elif self.type() == Mutation.INSERTION:
            # gap out the reference and splice the inserted bases into mutseq
            refseq[p:p] = '-' * len(self)
            mutseq[p:p] = self.mutated
        elif self.type() == Mutation.DELETION:
            # gap out the deleted span in mutseq
            mutseq[p:p+len(self)] = '-' * len(self)
            # NOTE(review): extends both sequences with self.mutated, which is
            # usually empty for a deletion as constructed upstream; preserved
            # from the original logic.
            refseq.extend(self.mutated)
            mutseq.extend(self.mutated)
        #print refseq, mutseq
        return refseq, mutseq
# ---------------------------------------------------------------------------
#
# Code for dealing with CIGAR strings
#
# ---------------------------------------------------------------------------
def flatten(l):
    """Recursively flatten nested lists/tuples into one flat list."""
    flat = []
    for element in l:
        if not isinstance(element, (list, tuple)):
            flat.append(element)
        else:
            flat.extend(flatten(element))
    return flat
def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
    """Compute the CIGAR string for a simulated read.

    readStart  -- read start offset within the mutated alignment window
    alignStart -- contig coordinate where the alignment window begins
    readLen    -- length of the read
    mutations  -- list of Mutation objects present in the window

    A single non-SNP mutation yields a (left)M (mut)I-or-D (right)M CIGAR
    with zero-length parts dropped; any number of SNPs yields '<readLen>M';
    mixing multiple non-SNP mutations is unsupported and raises.

    FIXES: replaced the Python-3-removed '<>' operator and string.join();
    materialized the filter as a list comprehension.
    """
    if len(mutations) == 1 and mutations[0].type() != Mutation.SNP:
        mut = mutations[0]
        internalPos = mut.pos() - alignStart
        leftLen = max(internalPos - readStart, 0)
        # clamp the indel footprint to what the read actually covers
        mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
        rightLen = readLen - leftLen - mutLen
        #print mut, readStart, alignStart, internalPos, leftLen, mutLen, rightLen
        parts = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ]
        parts = [ p for p in parts if p[0] > 0 ]
        return ''.join(map(str, flatten(parts)))
    if not all( map( lambda m: m.type() == Mutation.SNP, mutations ) ):
        # bail
        raise Exception('Currently only supports multiple SNPs CIGARS')
    return str(readLen) + 'M'
# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ):
# sortedMutations = sorted(mutations, key=Mutation.pos)
# CIGAR = []
# pos = readStart
#
# for mutation in sortedMutations:
# internalMutationStart = mutation.pos() - refStart
# if internalMutationStart >= readStart and internalMutationStart < readStop
# delta = mutation.pos() - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if mutation.type() <> Mutation.SNP and delta > 0:
# CIGAR.append([delta, 'M'])
# pos = mutation.pos()
#
# if mutation.type == Mutation.INSERTION:
# CIGAR.append(['I', len(mutation)])
# if mutation.type == Mutation.DELETION:
# CIGAR.append(['D', len(mutation)])
#
# delta = refSeqPos + refLen - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if delta > 0:
# CIGAR.append([delta, 'M'])
#
# s = string.join(map( str, flatten(CIGAR) ), '')
# print 'CIGAR:', CIGAR, s
# return s
# ---------------------------------------------------------------------------
#
# mutating the reference
#
# ---------------------------------------------------------------------------
def executeMutations( mutations, seqIndex, refseq, mutseq ):
    """Apply each Mutation in *mutations* to the aligned (refseq, mutseq)
    pair, where *seqIndex* is the contig coordinate of the first base of the
    window.  Returns the updated (refseq, mutseq).

    FIX: dropped the unused 'internalSite' local computed per iteration.
    """
    for mutation in mutations:
        refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
    return refseq, mutseq
def isBadSequenceForSim(seq):
    """True when *seq* (a Biopython Seq) contains any character outside the
    lowercase DNA alphabet in the module-level `bases` set, i.e. it cannot
    be used for read simulation."""
    for letter in seq.tostring().lower():
        if letter not in bases:
            return True
    return False
def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
    """Cut a window of the reference around *mutSite* and mutate it.

    ref         -- whole reference sequence (Biopython Seq)
    contig      -- contig name recorded in the generated Mutation objects
    mutSite     -- contig coordinate of the primary mutation
    mutType     -- 'NONE', 'SNP', 'INSERTION' or 'DELETION'
    mutParams   -- unused (kept for interface compatibility)
    nBasesToPad -- read length; the window spans roughly 2*nBasesToPad bases

    Returns (good, mutations, refsub, mutseq): a success flag, the list of
    Mutation objects applied, and the mutable reference/mutated window pair.
    Fails (False, [], None, None) when the window runs off the left edge or
    contains non-ACGT characters.
    """
    #print 'mutateReference'
    # start and stop
    refStartIndex = mutSite - nBasesToPad + 1
    internalStartIndex = nBasesToPad - 1
    # SNP/NONE windows are one base wider than indel windows
    if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
        refStopIndex = mutSite + nBasesToPad
    else:
        refStopIndex = mutSite + nBasesToPad - 1
    refsub = ref[refStartIndex : refStopIndex]
    mutseq = refsub.tomutable()
    refsub = refsub.tomutable()
    mutations = []
    def success():
        return True, mutations, refsub, mutseq
    def fail():
        return False, [], None, None
    if refStartIndex < 0:
        return fail()
    if isBadSequenceForSim(refsub):
        #print 'rejecting seq', refsub.tostring()
        #print map( lambda b: b not in bases, refsub.tostring().lower() )
        return fail()
    if not OPTIONS.quiet:
        print ref
        print refsub
    # candidate positions for the extra mutations (mutationDensity > 1)
    otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
    # print 'otherSites', otherSites
    for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
        internalSite = site - refStartIndex
        if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
            pass
        elif mutType == 'SNP':
            # pick a random base different from the reference base
            mutBase = ref[site].lower()
            otherBases = bases - set(mutBase)
            otherBase = random.choice(list(otherBases))
            assert mutBase <> otherBase
            if not OPTIONS.quiet:
                print mutBase, ' => ', otherBase
            mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
        elif mutType == 'INSERTION':
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
            mutations.append( mutation )
        elif mutType == 'DELETION':
            # NOTE(review): inSeq is computed but unused here, and the
            # Mutation is built with original='' (so mutString prints 'D:')
            # while the deleted bases are passed as 'mutated' -- looks
            # swapped; preserved as-is.
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
            mutations.append( mutation )
        else:
            raise Exception('Unexpected mutation type', mutType)
    # process the mutations
    refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )
    return success()
# version 1.0 -- prototype
# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad):
# #print 'mutateReference'
# refStartIndex = mutSite - nBasesToPad + 1
# refStopIndex = mutSite + nBasesToPad
# internalStartIndex = nBasesToPad - 1
#
# if refStartIndex < 0:
# return False, None, None
#
# refsub = ref[refStartIndex : refStopIndex]
# print ref
# print refsub
#
# if mutType == 'SNP' or mutType == 'None':
# #print ' In IF'
# mutBase = ref[mutSite]
# bases = set(['A', 'T', 'G', 'C'])
# if mutBase in bases:
# otherBases = bases - set(mutBase)
# otherBase = random.choice(list(otherBases))
# assert mutBase <> otherBase
# mutseq = refsub.tomutable()
#
# if mutType == 'SNP':
# print mutBase, ' => ', otherBase
# mutseq[internalStartIndex] = otherBase
#
# print refsub
# print mutseq
# align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-'))
# align.add_sequence("ref", refsub.tostring())
# align.add_sequence("mut", mutseq.tostring())
# print str(align)
#
# return True, refsub, mutseq
#
# return False, None, None
# ---------------------------------------------------------------------------
#
# sample reads from alignments
#
# ---------------------------------------------------------------------------
def accumulateBases( mutSeq, start, readLen ):
    """Collect *readLen* non-gap bases from *mutSeq* beginning at *start*.

    Scans forward counting only non-'-' characters until readLen bases have
    been gathered (or the sequence ends).  Returns (read, stop) where *read*
    is the gap-free base string and *stop* is the exclusive end index of the
    scanned window.

    FIX: replaced the Python-3-removed string.join(filter(...)) with a join
    over a comprehension (the gap test is inlined from the trivial notDash
    helper).
    """
    count = 0
    for stop in range(start, len(mutSeq)):
        if mutSeq[stop] != '-':
            count += 1
            #print stop, count
            if count == readLen:
                break
    stop += 1
    read = ''.join([ c for c in mutSeq[start:stop] if c != '-' ])
    return read, stop
# doesn't support paired reads
#
# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations):
# #print 'REF:', refSeq.tostring()
# #print 'MUT:', mutSeq.tostring()
# #print ''
#
# # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
# # right in the middle
# lastGoodStartSite = readLen - 1
# if not OPTIONS.quiet:
# print refSeq.tostring(), 'ref'
# print mutSeq.tostring(), 'mut'
#
# # we can potentially start at any site in mutSeq
# nPotentialStarts = len(mutSeq) - readLen + 1
# potentialStarts = range(nPotentialStarts)
#
# if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
# if OPTIONS.uniqueReadsOnly:
# starts = potentialStarts
# else:
# starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
# else:
# starts = random.sample(potentialStarts, nPotentialStarts)
#
# #print 'potentialStarts', potentialStarts
# #print 'Starts', starts
#
# def sampleRead( start ):
# if mutSeq[start] <> '-':
# read, stop = accumulateBases( mutSeq, start, readLen )
# remaining = len(mutSeq) - stop
# refForRead = refSeq[start:stop]
# #print 'XXXX', start, stop, refForRead, read
# cigar = mutationsCIGAR( alignStart, readLen, mutations )
# if not OPTIONS.quiet:
# leftPads = string.join(start * ['-'], '')
# rightPads = string.join(remaining * ['-'], '')
# print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar
# return filter(notDash, read), alignStart + start, cigar
# else:
# return False, False, False
#
# reads = map( sampleRead, starts )
# return [read for read in reads if read[0] <> False]
class refSite:
    """Bundle describing one simulated alignment window: the reference
    slice, its mutated counterpart, the contig coordinate where the window
    starts, and the Mutation objects applied to it."""
    def __init__( self, refSeq, mutSeq, alignStart, mutations ):
        self.mutations = mutations
        self.alignStart = alignStart
        self.mutSeq = mutSeq
        self.refSeq = refSeq
def sample1Read( refSite, start, readLen, reverseComplementP = False ):
    """Sample one read of *readLen* bases from *refSite* starting at *start*.

    Returns a 5-tuple (good, read, pos, cigar, ppReadStr): success flag, the
    gap-free read bases (reverse complemented when reverseComplementP), the
    contig position, the CIGAR string, and a '-'-padded pretty-print string.
    Fails with (False, None, None, None, None) when *start* sits on a gap.
    """
    if refSite.mutSeq[start] <> '-':
        read, stop = accumulateBases( refSite.mutSeq, start, readLen )
        mutSubSeq = refSite.mutSeq[start:stop]
        if reverseComplementP:
            s = Seq( read, IUPAC.unambiguous_dna )
            #print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
            read = s.reverse_complement().tostring()
            # NOTE(review): relies on old Biopython MutableSeq.complement()
            # mutating mutSubSeq in place (return value is discarded) so the
            # pretty-print below shows the complemented strand -- confirm.
            mutSubSeq.complement()
        remaining = len(refSite.mutSeq) - stop
        refForRead = refSite.refSeq[start:stop]
        #print 'XXXX', start, stop, refForRead, read
        cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
        leftPads = string.join(start * ['-'], '')
        rightPads = string.join(remaining * ['-'], '')
        ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
        # filter() on a str returns a str in Python 2, stripping any gaps
        return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
    else:
        return False, None, None, None, None
def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
    """Sample reads (or read pairs) from a mutated alignment window.

    wholeRef      -- full fasta record (used only for paired pretty-print)
    refSeq/mutSeq -- reference window and its mutated counterpart
    alignStart    -- contig coordinate of the window start
    nReads        -- string: a count, or 'tile' for every possible start
    pairedOffset  -- distance to the mate window (paired mode)
    mutations2/refSeq2/mutSeq2 -- mate-window equivalents (may be None)

    Returns a list of [(read1, pos1, cigar1), (read2, pos2, cigar2)] pairs;
    in unpaired mode the second element is (None, 0, None).  Starts that
    land on a gap are dropped.
    """
    #print 'REF:', refSeq.tostring()
    #print 'MUT:', mutSeq.tostring()
    #print ''
    # pairedSite, mutations2, refSeq2, mutSeq2 can all be None
    refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
    refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )
    #print refSite1.alignStart, refSite2.alignStart, pairedOffset
    # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
    # right in the middle
    lastGoodStartSite = readLen - 1
    if not OPTIONS.quiet:
        if OPTIONS.generatePairedEnds:
            r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
            print r.tostring(), 'ref'
            print r.complement().tostring(), 'ref, complement'
        else:
            print refSeq.tostring(), 'ref'
            print mutSeq.tostring(), 'mut'
    # we can potentially start at any site in mutSeq
    nPotentialStarts = len(mutSeq) - readLen + 1
    potentialStarts = range(nPotentialStarts)
    #print 'potentialStarts', potentialStarts
    if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
        if OPTIONS.uniqueReadsOnly:
            starts = potentialStarts
        else:
            # sample with replacement: duplicates allowed
            starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
    else:
        starts = random.sample(potentialStarts, int(nReads))
    #print 'Starts', starts
    def sampleRead( start ):
        # sample the forward read and, in paired mode, its reversed mate
        good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )
        if OPTIONS.generatePairedEnds:
            good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
            if not OPTIONS.quiet:
                offsetDashes = string.join(['-'] * (pairedOffset), '')
                print
                print ppstr1 + offsetDashes, pos1, cigar1
                print offsetDashes + ppstr2, pos2, cigar2
            return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
        else:
            if not OPTIONS.quiet:
                print ppstr1, pos1, cigar1
            return [(read1, pos1, cigar1), (None, 0, None)]
    reads = map( sampleRead, starts )
    # drop pairs whose first read failed (started on a gap)
    return [read for read in reads if read[0][0] <> None]
def notDash( c ):
    """True when character *c* is not the '-' gap/pad symbol.

    FIX: replaced the Python-3-removed '<>' operator with '!='.
    """
    return c != '-'
def fakeQuals( seq ):
    """Fabricate a flat quality vector: one Q30 value per base of *seq*."""
    return [30] * len(seq)
def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
    """Package one sampled read (and optionally its mate) as SAM records.

    siteID/readID/fastaID are used to build the query name
    '<fastaID>:<siteID>.read.<readID>'.  Returns [record1, record2];
    record2 is None in unpaired mode.
    """
    rname = fastaID
    # constant mapping quality for all simulated reads
    mapq = 40
    quals = fakeQuals( read )
    qname = '%s:%d.read.%d' % (fastaID, siteID, readID)
    if not OPTIONS.generatePairedEnds:
        # not in paired mode
        flags = []
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
        record2 = None
    else:
        flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
        insertSize = pos2 - pos - len(read)
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
        # NOTE(review): the mate record reuses 'quals' computed from read
        # (not read2); harmless while quals are flat Q30 but verify.
        record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )
        #print 'read1, read2', record1.getSeq(), record2.getSeq()
    return [record1, record2]
# ---------------------------------------------------------------------------
#
# paired end reads
#
# ---------------------------------------------------------------------------
def pairedReadSite( ref, leftMutSite, readLen ):
    """Pick the mate-pair site for a read ending at leftMutSite + readLen.

    Draws an insert size from N(OPTIONS.insertSize, OPTIONS.insertDev) and
    returns (True, site) when the mate window fits inside *ref* and contains
    only simulatable bases; otherwise (False, None).
    """
    offset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))
    if offset >= 0:
        site = offset + leftMutSite + readLen
        #print 'pairedSite', offset, leftMutSite, site
        if site + readLen < len(ref):
            window = ref[site - readLen:site + readLen + 1]
            if not isBadSequenceForSim( window ):
                return True, site
    return False, None
# ---------------------------------------------------------------------------
#
# build allowed regions and sampling starts from region
#
# ---------------------------------------------------------------------------
def parseSamplingRange( arg, ref, readLen ):
    """Parse the -x/--range restriction into a list of [start, stop] regions.

    *arg* may be a single 'start:stop' string, the name of a file holding
    one such range per line, or None (meaning the whole reference minus one
    read length).  Returns a list of [start, stop] integer pairs.

    FIXES: '<>' replaced for Python 3; the bare 'except:' now catches only
    Exception (KeyboardInterrupt/SystemExit propagate); map/filter results
    are materialized as lists; the range file is closed via 'with'.
    """
    def one(x):
        # 'start:stop' -> [start, stop]; blank lines -> False
        elts = x.split()
        #print 'elts', elts
        if len(elts) > 0:
            return [int(field) for field in elts[0].split(':')]
        else:
            return False
    if arg is not None:
        try:
            return [ one(arg) ]
        except Exception:
            # not a literal range: treat arg as a file of ranges, one per line
            with open(arg) as rangeFile:
                return [ r for r in map(one, rangeFile.readlines()) if r ]
    else:
        return [[0, len(ref.seq) - readLen]]
def sampleStartSites( sampleRanges, nsites ):
    """Choose up to *nsites* read start positions from the allowed ranges.

    sampleRanges -- list of [start, end) pairs from parseSamplingRange
    nsites       -- requested number of sites

    When the ranges contain no more than *nsites* positions, every position
    is used (tiling); otherwise *nsites* positions are sampled uniformly
    without replacement.  Returns the chosen sites (a set/xrange/list).
    """
    print 'sampleStartSites', sampleRanges, nsites
    # build a set of potential sites
    if len(sampleRanges) > 1:
        potentialSites = set()
        for start, end in sampleRanges:
            #print 'potentialSites', start, end, potentialSites
            potentialSites = potentialSites.union(set(xrange(start, end)))
    else:
        # single range: keep the lazy xrange, no need to materialize
        start, end = sampleRanges[0]
        potentialSites = xrange(start, end)
    #print 'potentialSites', potentialSites
    # choose sites from potentials
    if len(potentialSites) <= nsites:
        # tile every site
        sites = potentialSites
    else:
        # we need to sample from the potential sites
        sites = random.sample( potentialSites, nsites )
    # print 'Sites', sites
    print 'Number of potential start sites:', len(potentialSites)
    print 'Number of start sites that will be generated:', len(sites)
    return sites
def readRef(referenceFasta):
    """Generator over the fasta records in *referenceFasta* (via SeqIO),
    logging each record's id, name and length as it is yielded.

    NOTE(review): because this is a generator, handle.close() only runs if
    the caller exhausts it completely.
    """
    handle = open(referenceFasta)
    for seq_record in SeqIO.parse(handle, "fasta"):
        print seq_record.id
        print seq_record.name
        #print repr(seq_record.seq)
        print len(seq_record.seq)
        yield seq_record
    handle.close()
import os.path
def outputFilename():
    """Derive the output file root and SAM filename from OPTIONS.

    Encodes the simulation parameters (mutation type and density/indel
    size, read length, site count, coverage, range, pairedness) into a
    filename rooted at OPTIONS.outputPrefix, e.g.
    '<ref>__mut.SNP_den.1_36.bp_nsites.1_cov.1_range.None_paired.No'.
    Returns (root, root + '.sam').

    FIX: replaced the Python-3-removed string.join with str.join.
    """
    root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0]
    if OPTIONS.generatePairedEnds:
        pairedP = 'Yes'
    else:
        pairedP = 'No'
    params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \
              ['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]]
    filename = root + '__' + '_'.join([ '.'.join(map(str, p)) for p in params ])
    root = os.path.join(OPTIONS.outputPrefix, filename)
    return root, root + '.sam'
def main():
    """Command-line entry point for the read simulator.

    Parses options, seeds the RNG, then for every fasta record in the
    reference: picks mutation sites, mutates a window around each site,
    samples reads (or read pairs) from the mutated alignment, and writes
    SAM, fasta, quals, ref-list and mutation-log outputs whose names encode
    the simulation parameters (see outputFilename).
    """
    global OPTIONS, ROOT
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                      type="string", default=None,
                      help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-o", "--outputPrefix", dest="outputPrefix",
                      type="string", default='',
                      help="Output Prefix SAM file")
    parser.add_option("-c", "--coverage", dest="coverage",
                      type="string", default='1',
                      help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.")
    parser.add_option("-n", "--nsites", dest="nSites",
                      type=int, default=1,
                      help="Number of sites to generate mutations in the reference")
    parser.add_option("-l", "--readlen", dest="readLen",
                      type=int, default=36,
                      help="Length of reads to generate")
    parser.add_option("-s", "--seed", dest="seed",
                      type=int, default=123456789,
                      help="Random number seed. If 0, then system clock is used")
    parser.add_option("-t", "--muttype", dest="mutationType",
                      type="string", default='None',
                      help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion")
    parser.add_option("-e", "--mutdensity", dest="mutationDensity",
                      type=int, default=1,
                      help="Number of mutations to introduce from the reference")
    parser.add_option("-x", "--range", dest="range",
                      type="string", default=None,
                      help="Sampling range restriction, in the form of x:y without a space")
    parser.add_option("-u", "--unique", dest="uniqueReadsOnly",
                      action='store_true', default=True,
                      help="Assumes that the user wants unique reads generated. If the coverage is greater than the number of unique reads at the site, the region is just tiled")
    parser.add_option("-i", "--indelsize", dest="indelSize",
                      type=int, default=0,
                      help="Size in bp of the insertion or deletion to introduce")
    # Paired end options
    parser.add_option("", "--paired", dest="generatePairedEnds",
                      action='store_true', default=False,
                      help="Should we generate paired end reads?")
    parser.add_option("", "--insertSize", dest="insertSize",
                      type=int, default=280,
                      help="Size in bp of the paired library insertion")
    parser.add_option("", "--insertDev", dest="insertDev",
                      type=int, default=5,
                      help="Standard deviation in the size in bp of the paired library insertion")
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                      action='store_true', default=False,
                      help="Verbose debugging output?")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")
    root, outputSAM = outputFilename()
    # seed the RNG (seed 0 means: use the system clock)
    if OPTIONS.seed == 0:
        random.seed(None)
    else:
        random.seed(OPTIONS.seed)
    OPTIONS.mutationType = OPTIONS.mutationType.upper()
    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print ' ref: ', OPTIONS.reference
    print ' output: ', outputSAM
    print ''
    print ' mutation type: ', OPTIONS.mutationType
    print ' mutation density: ', OPTIONS.mutationDensity
    print ' indel size: ', OPTIONS.indelSize
    print ' readLen: ', OPTIONS.readLen
    print ' nSites: ', OPTIONS.nSites
    print ' coverage: ', OPTIONS.coverage
    print ' range restriction: ', OPTIONS.range
    print ''
    print ' paired ends?: ', OPTIONS.generatePairedEnds
    print ' insert size: ', OPTIONS.insertSize
    print ' insert stddev: ', OPTIONS.insertDev
    print ''
    print ' Debugging?: ', OPTIONS.debug
    print '------------------------------------------------------------'
    if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1:
        raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType)
    readLen = OPTIONS.readLen
    fastaRecords = [seq for seq in readRef(OPTIONS.reference)]
    # the SAM header is built from the first reference record only
    header = SAMHeader( fastaRecords[0].id, len(fastaRecords[0].seq) )
    SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug )
    mutationsout = open(root + '.mutations.txt', 'w')
    qualsout = open(root + '.quals.txt', 'w')
    refout = open(root + '.ref', 'w')
    fastaout = open(root + '.fasta', 'w')
    if OPTIONS.generatePairedEnds:
        fastaout2 = open(root + '.pairs.fasta', 'w')
        qualsout2 = open(root + '.pairs.quals.txt', 'w')
    counter = 0
    refLen = 0
    for ref in fastaRecords:
        refLen = len(ref.seq)
        # write the crazy ref file info needed by samtools
        print >> refout, ref.id + '\t' + str(refLen)
        sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen )
        sites = sampleStartSites( sampleRanges, OPTIONS.nSites )
        for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ):
            #print 'Sampling site:', mutSite, refLen
            #mutSite = readLen-1
            # progress report roughly every 1% of sites
            nSitesSelected = max(int(round(len(sites)/100.0)), 1)
            #print siteCounter, len(sites), nSitesSelected)
            if siteCounter % nSitesSelected == 0:
                print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites))
            good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen)
            pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None
            if good and OPTIONS.generatePairedEnds:
                # the mate window only ever carries SNPs; indels stay on the left read
                if OPTIONS.mutationType == 'SNP':
                    pairedMutType = 'SNP'
                else:
                    pairedMutType = 'NONE'
                good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen )
                print 'pairedReadSite', good, pairedSite, good
                if good:
                    good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen)
            #print 'Good', good, mutations, refSeq, mutSeq
            if good:
                print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t')
                #print 'Mutations', mutations
                refSeqPos = mutSite - readLen + 1
                readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2)
                for i in range(len(readPairs)):
                    counter += 1
                    read1, read2 = readPairs[i]
                    seq, pos, cigar = read1
                    seq2, pos2, cigar2 = read2
                    # write out the sam and fasta files
                    def write1( record, recordi ):
                        if record <> None:
                            SAMout.writeRecord( record )
                            sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')]
                            #print sequences
                            # even index -> first-of-pair outputs, odd -> mate outputs
                            if recordi % 2 == 0:
                                localFastaOut, localQualsOut = fastaout, qualsout
                            else:
                                localFastaOut, localQualsOut = fastaout2, qualsout2
                            SeqIO.write(sequences, localFastaOut, "fasta")
                            print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ')
                    #print i, read1, read2
                    records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 )
                    map( write1, records, range( len(records) ) )
    SAMout.close()
    qualsout.close()
    fastaout.close()
    refout.close()
    mutationsout.close()
    if OPTIONS.generatePairedEnds:
        qualsout2.close()
        fastaout2.close()
# for record in SAMIO( outputSAM ):
# print record
if __name__ == "__main__":
    # Biopython is imported lazily here so merely importing this module does
    # not require Bio; the names become module globals used by readRef,
    # sample1Read, mutateReference and main's write1.
    import Bio
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped
    main()

View File

@ -1,194 +0,0 @@
#!/usr/bin/env python
import getopt, sys, os, string
from farm_commands import *
#FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py"
FastaQuals2Fastq_exe = "FastaQuals2Fastq.py"
def isFastaB(filename):
    """Is *filename* already a fastb file (i.e. has the '.fastb' extension)?"""
    extension = os.path.splitext(filename)[1]
    return extension == '.fastb'
def readListOfLanes( listFile ):
    """Read the first whitespace-delimited token of every line in *listFile*.

    Used to load the list of lane/FASTA files to process; any extra columns
    on a line are ignored.

    FIXES: replaced the Python-3-removed string.split/string.strip module
    functions with str methods; the file is now closed via 'with'.
    """
    with open(listFile) as fh:
        return [ line.split()[0] for line in fh ]
def run_swmerlin(input_file, input_head, farm=""):
    """Convenience wrapper: run Merlin in Smith-Waterman mode (see run_merlin)."""
    run_merlin(input_file, input_head, farm, sw=True)
def run_merlin(input_file, input_head, farm="", sw=False):
    """Align *input_file* with Merlin against the hg18 reference.

    Converts FASTA input to fastb first (skipped when the fastb already
    exists, unless the module-level regenExistingFiles is set), then submits
    the Merlin command via cmd(), farming to LSF queue *farm* when given.

    sw = Merlin Smith-Waterman option
    """
    if isFastaB(input_file):
        input_fastb = input_file
    else:
        input_fastb = input_head+".fastb"
        if regenExistingFiles or not os.path.exists(input_fastb):
            cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
    if sw:
        output_head = input_head+".swmerlin"
    else:
        output_head = input_head+".merlin"
    cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head
    if sw:
        cmd_str += " SW=True"
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
USE_BATCH = False
def run_ilt(input_file, input_head, farm=""):
    """Align *input_file* with ImperfectLookupTable against hg18.

    Converts FASTA input to fastb first (cached unless regenExistingFiles).
    When the module-level USE_BATCH is set, the batched lookup wrapper
    script is used instead of a single ILT invocation.
    """
    #print 'isFastaB', input_file, isFastaB(input_file)
    if isFastaB(input_file):
        input_fastb = input_file
    else:
        input_fastb = input_head+".fastb"
        if regenExistingFiles or not os.path.exists(input_fastb):
            cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
    output_head = input_head+".ilt"
    if USE_BATCH:
        cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head
        cmd(cmd_str, False, input_head, just_print_commands=justPrintCommands)
    else:
        cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head
        cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
def run_BWA32(input_file, input_head, farm=""):
    """Convenience wrapper: run BWA with a 32bp seed (see run_BWA)."""
    run_BWA(input_file, input_head, farm, 32)
def run_BWA(fasta, input_head, farm="", seed='inf'):
    """Align *fasta* with BWA (via the bwahuman samse wrapper).

    Builds a fastq from the fasta + quals pair first (cached unless
    regenExistingFiles); a finite *seed* is appended to the output head so
    different seed lengths do not collide.
    """
    output_head = input_head+".bwa"
    quals = input_head+".quals.txt"
    fastq = input_head+".fastq"
    if regenExistingFiles or not os.path.exists(fastq) :
        cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
        cmd(cmd_str)
    if seed <> 'inf':
        output_head += str(seed)
    output_sam = output_head + ".sam"
    cmd_str = "bwahuman samse "+str(seed)+" "+fastq+" "+output_sam
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
def run_MAQ(input_fasta, head, farm=""):
    """Align *input_fasta* with MAQ 0.7.1 against the hg18 bfa reference.

    Builds fastq then bfq intermediates (each cached unless
    regenExistingFiles), then submits 'maq map' via cmd(), farming to LSF
    queue *farm* when given.
    """
    maq_exe = "/seq/dirseq/maq-0.7.1/maq"
    bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa"
    fasta = input_fasta
    quals = head+".quals.txt"
    fastq = head+".fastq"
    if regenExistingFiles or not os.path.exists(fastq) :
        cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
        cmd(cmd_str)
    bfq = head+".bfq"
    if regenExistingFiles or not os.path.exists(bfq):
        cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq, just_print_commands=justPrintCommands )
    out_head = head+".maq"
    maq_out = out_head+".out.aln.map"
    cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq
    cmd(cmd_str, farm, out_head, just_print_commands=justPrintCommands)
def usage():
    """Print command-line help for this script to stdout."""
    print "Required arguments:"
    print " -i Input FASTA head (*.fasta, *.qualb)"
    print " OR"
    print " -d Directory to grab all FASTA files from"
    print " OR"
    print " -l List of FASTA/FASTB files to process"
    print
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print
    print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)"
    print
    print " -x Don't execute commands, just print them"
    print
    print " -w Output files to current directory (strip path from input file/dir/list"
    print
def get_all_fasta_files(fasta_dir):
    """Return paths of every non-empty *.fasta file directly inside fasta_dir,
    in os.listdir order."""
    entries = os.listdir(fasta_dir)
    prefix = fasta_dir if fasta_dir.endswith("/") else fasta_dir + "/"
    result = []
    for name in entries:
        if not name.endswith(".fasta"):
            continue
        path = prefix + name
        # Skip zero-byte placeholder files.
        if os.path.getsize(path) > 0:
            result.append(path)
    return result
# Global flags consulted by the run_* helpers above; flipped by the
# -x / -r command-line options in the __main__ block.
justPrintCommands = False   # -x: echo commands instead of executing them
regenExistingFiles = False  # -r: rebuild outputs even if they already exist
if __name__ == "__main__":
    # Driver: run one or more mappers over a FASTA file, a directory of
    # FASTA files, or a list file of lanes.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw:r", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "regenExistingFiles", "outputInWorkingDirectory"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    input_head = ""
    fasta_dir = ""
    mapper_str = "all"
    farm_sub = False
    listOfLanes = None
    outputInWorkingDirectory = False
    # NOTE(review): "w:" in the getopt spec declares -w as taking an
    # argument, but it is handled as a boolean flag below -- confirm intent.
    for opt, arg in opts:
        print opt, arg
        if opt in ("-i", "--input"):
            input_head = arg
        if opt in ("-l", "--listOfLanes"):
            listOfLanes = arg
        if opt in ("-d", "--fasta_dir"):
            fasta_dir = arg
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-r", "--regenExistingFiles"):
            regenExistingFiles = True
        if opt in ("-m", "--mapper"):
            mapper_str = arg
        if opt in ("-x", "--dontexe"):
            justPrintCommands = True
        if opt in ("-w", "--outputInWorkingDirectory"):
            outputInWorkingDirectory = True
    # Exactly one input source (-i, -d or -l) is required.
    if (input_head == "") and (fasta_dir == "") and (listOfLanes == None):
        print input_head, fasta_dir, listOfLanes
        usage()
        sys.exit(2)
    # Select function(s) for mapper
    mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ, "bwa":run_BWA, "bwa32":run_BWA32}
    if mapper_str.lower() == "all":
        mapper_list = mapper_func_list.values()
    else:
        mapper_list = [mapper_func_list.get(mapper_str.lower())]
    if mapper_list == [None]:
        sys.exit("Don't know of mapper argument: "+mapper_str)
    if input_head:
        input_heads = [None]
        # NOTE(review): 'fasta' is appended with no dot -- probably meant
        # input_head + '.fasta'; confirm before reuse.
        input_files = [input_head + 'fasta']
    elif listOfLanes <> None:
        # NOTE(review): readListOfLanes is not defined in this file's
        # visible portion -- presumably imported/defined elsewhere.
        input_files = readListOfLanes(listOfLanes)
        input_heads = [None] * len(input_files)
    else:
        input_files = [file for file in get_all_fasta_files(fasta_dir)]
        input_heads = [None] * len(input_files)
    # Run every selected mapper over every input file.
    for input_file, input_head in zip(input_files, input_heads):
        if input_head == None:
            # Derive the output head from the input file name.
            file_head = os.path.splitext(input_file)[0]
            if outputInWorkingDirectory:
                file_head = os.path.split(file_head)[1]
        else:
            file_head = input_head
        for mapper in mapper_list:
            mapper( input_file, file_head, farm=farm_sub )
            print

View File

@ -1,92 +0,0 @@
#!/usr/bin/env python
import getopt, sys, os, string
from farm_commands import *
def picardCMD(name, **keywords):
    """Build a Picard-style command line: NAME KEY=VALUE KEY=VALUE ...

    Keyword arguments become KEY=VALUE tokens (values run through str()),
    appended in the keyword dict's iteration order.
    """
    cmd = name
    # .items() works on both Python 2 and 3; .iteritems() was Python-2-only.
    for key, value in keywords.items():
        cmd += ' ' + key + "=" + str(value)
    return cmd
def spawnValidationJob(input_file, output_head, farm, maxErrors):
    """Submit (or run locally) one ValidateSAM job for a SAM/BAM file.

    Skips the job when <output_head>.stdout already exists, unless the
    regenExistingFiles global forces a rerun.
    """
    output_file = output_head + '.stdout'
    if not regenExistingFiles and os.path.exists(output_file):
        return
    command = picardCMD("ValidateSAM", I=input_file, M=maxErrors)
    # When running locally (no farm queue) capture stdout ourselves.
    if farm == "":
        command = command + " > " + output_file
    cmd(command, farm, output_head, just_print_commands=justPrintCommands)
def usage():
    # Print command-line help for the SAM/BAM validation batch driver.
    print "Required arguments:"
    print " -d Directory to grab all sam/bam files from"
    print
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print
    print " -m MAXERRORS Maximum number of errors to detect before aborting"
    print
def get_all_sam_files(dir):
    """Recursively collect every *.sam / *.bam file under dir
    (extension match is case-insensitive)."""
    matches = []
    wanted = ('.sam', '.bam')
    for dirpath, dirnames, filenames in os.walk(dir):
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            if ext.lower() in wanted:
                matches.append(os.path.join(dirpath, filename))
    return matches
def output_filename(input_file):
    """Flatten a path into a dotted validation-output file name.

    e.g. "/seq/foo/bar.bam" -> "seq.foo.bar.bam.validation".
    """
    # '<>' and filter(lambda ...) were Python-2-isms; a comprehension is
    # equivalent and also valid Python 3.
    parts = [p for p in input_file.split("/") if p.strip() != '']
    # Debug trace of the path components (print() with one argument behaves
    # the same under Python 2 and 3).
    print(parts)
    return ".".join(parts) + ".validation"
# Global flags consulted by spawnValidationJob; flipped by command-line
# options in the __main__ block.
justPrintCommands = False   # echo commands instead of executing them
regenExistingFiles = False  # -r: rerun jobs whose output already exists
if __name__ == "__main__":
    # Batch driver: run ValidateSAM over every SAM/BAM found under -d DIR.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d:f:m:r", ["dir","farm","maxErrors", "regenExistingFiles"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    dir = ""
    mapper_str = "all"
    farm_sub = False
    maxErrors = 1000
    for opt, arg in opts:
        print opt, arg
        if opt in ("-d", "--dir"):
            dir = arg
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-m", "--maxErrors"):
            # NOTE(review): kept as a string (no int() conversion); it is only
            # interpolated into the command line, so this works but is untyped.
            maxErrors = arg
        if opt in ("-r", "--regenExistingFiles"):
            regenExistingFiles = True
    # -d is mandatory.
    if dir == "":
        usage()
        sys.exit(2)
    input_files = get_all_sam_files(dir)
    print 'Processing files: N=', len(input_files)
    for input_file in input_files:
        print ' ->', input_file
    # Launch one validation job per discovered file.
    for input_file in input_files:
        output_file = output_filename( input_file )
        print input_file, "=>", output_file
        spawnValidationJob( input_file, output_file, farm_sub, maxErrors )

View File

@ -1,66 +0,0 @@
import farm_commands
import os.path
import sys
import getopt
# GATK walkers run when no -c option is given.
defaultCommands = ['CountReads', 'Pileup']
def usage():
    # Print command-line help for this batch GATK driver.
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print " -c cmd1,cmd2 Walkers to execute, otherwise", ' '.join(defaultCommands)
    print " -e Ignore existing files", ' '.join(defaultCommands)
if __name__ == "__main__":
    # Batch driver: run one or more GATK walkers over each lane listed in a
    # file.  Positional args: args[0] = lane list (one SAM/BAM per line,
    # optionally followed by a reference path); args[1] = output directory.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:c:a:e", ["farm", "commands", "args", "ignoreExistingFiles"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    farm_sub = False
    commandsList = defaultCommands
    ignoreExistingFiles = False
    extraArgs = ''
    for opt, arg in opts:
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-c", "--commands"):
            commandsList = arg.split(',')
        if opt in ("-e", "--ignoreExistingFiles"):
            ignoreExistingFiles = True
        if opt in ("-a", "--args"):
            extraArgs = arg
    directory = args[1]
    for line in open(args[0]):
        lineParts = line.split()
        lane = lineParts[0].strip()
        # Default to the hg18 reference unless the lane line names one.
        ref = '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta'
        if ( len(lineParts) > 1 ):
            ref = lineParts[1].strip()
        if not os.path.exists(lane):
            print 'Input SAM/BAM file: "', lane, '" does not exist, skipping...'
            continue
        head, lane_filename = os.path.split(lane)
        filebase = os.path.splitext(lane_filename)[0]
        # make the pileup
        # 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam
        # 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam
        # convert the fasta
        # Run each requested walker, skipping outputs that already exist
        # (unless -e was given).
        for analysis in commandsList:
            output = os.path.join(directory, filebase + '.' + analysis + '.output')
            if ignoreExistingFiles or not os.path.exists(output):
                cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + lane + " -R " + ref + " -o " + output + " -l INFO " + extraArgs
                print cmd
                farm_commands.cmd(cmd, farm_sub, output)

View File

@ -1,201 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
# NOTE(review): appears unused in this script -- likely copy-paste residue
# from the batch GATK driver; verify before removing.
defaultCommands = ['CountReads', 'Pileup']
def indexBAM(reads, queue=None):
    """Build a .bai index for a BAM file via `samtools index`,
    optionally on the given farm queue."""
    command = "samtools index " + reads
    farm_commands.cmd(command, queue, None,
                      just_print_commands=OPTIONS.justPrint)
def readIntervalFile(file):
    """Parse a Picard-style .interval_list into {contig: [[start, stop], ...]}.

    SAM-style header lines (first token beginning with '@') are skipped;
    intervals are kept in file order per contig.

    Fixes over the original: '<>' and the module-level string.split function
    were Python-2-only, and the file handle was never closed.
    """
    byContig = {}
    with open(file) as fh:
        for raw in fh:
            tokens = raw.split()
            # Skip header lines such as "@HD ..." / "@SQ ...".
            if tokens[0][0] == '@':
                continue
            elt = [int(tokens[1]), int(tokens[2])]
            byContig.setdefault(tokens[0], []).append(elt)
    #print byContig
    return byContig
def isIntervalFile(filename):
    """True when filename carries the Picard '.interval_list' extension."""
    ext = os.path.splitext(filename)[1]
    return ext == '.interval_list'
def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile):
    """Stream a pileup file, writing only lines whose position falls inside
    one of the per-contig [start, stop] intervals of intervalMap.

    Assumes both the pileup and each contig's interval list are sorted by
    position; offsetMap tracks, per contig, the first interval not yet
    passed, so the scan is a single merge-like pass.
    """
    print 'Intervals', intervalMap
    def maybeWrite1(line, offsetMap, out):
        # Write `line` to `out` iff its position is inside the current
        # interval for its contig; advance past intervals already passed.
        debug = False
        parts = line.split()
        contig = parts[0]
        pos = int(parts[1])
        if debug: print 'pileup', contig, pos, line,
        if contig in intervalMap:
            intervals = intervalMap[contig]
            while (offsetMap[contig] < len(intervals)):
                offset = offsetMap[contig]
                #print intervals[offset]
                curStart, curStop = intervals[offset][0:2]
                if debug: print contig, pos, curStart, curStop
                if pos >= curStart:
                    if pos <= curStop:
                        # Inside the current interval: keep the line.
                        #print line,
                        print >> out, line,
                        break
                    else:
                        # Past this interval: move the cursor forward.
                        if debug: print 'Progressing', contig, pos, curStart, curStop
                        offsetMap[contig] += 1
                else:
                    # Before the current interval: nothing to emit yet.
                    break
        return offsetMap
    def allDone( offsetMap, intervalMap ):
        # True once every contig's interval list has been fully consumed.
        def oneDone( contig ):
            return offsetMap[contig] >= len(intervalMap[contig])
        return all( map(oneDone, offsetMap.iterkeys()) )
    i = 0
    offsetMap = dict([(key, 0) for key in intervalMap.keys()])
    out = open(filteredPileupFile, 'w')
    for line in open(unfilteredPileupFile):
        offsetMap = maybeWrite1(line, offsetMap, out)
        i += 1
        # Check for early exit only every 10k lines to keep the scan cheap.
        if i % 10000 == 0 and allDone(offsetMap, intervalMap):
            break
    out.close()
def main():
    """Stage validation inputs and run the GATK ValidatingPileup walker.

    For each line "<reads.bam> <ref.fasta> <region>" of the validate file:
    link the BAM into the data dir, index it, subset it to the region,
    build a samtools pileup (interval-filtered when the region is an
    .interval_list), then launch the GATK comparison job.
    """
    global OPTIONS, ROOT
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--file", dest="validateInfoFile",
                      type="string", default=None,
                      help="File name of the validation set")
    parser.add_option("-d", "--dataDir", dest="dataDir",
                      type="string", default=None,
                      help="Directory to the data files")
    parser.add_option("-o", "--outputDir", dest="outputDir",
                      type="string", default=None,
                      help="Directory to put the output")
    parser.add_option("-q", "--farmQueue", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to submit jobs to. Leave blank for local processing")
    parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores the existing files")
    parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
                      action='store_true', default=False,
                      help="If provided, all intermediate files (BAM and pileups) will be regenerated")
    parser.add_option("-n", "--dontValidate", dest="dontValidate",
                      action='store_true', default=False,
                      help="If provided, won't try to validate, just make sure the validation data files are ready")
    parser.add_option("-g", "--gatk", dest="gatkPath",
                      type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar",
                      help="Path to the gatk")
    parser.add_option("-x", "--args", dest="extraArgs",
                      type="string", default="",
                      help="Extra args to pass to the GATK")
    parser.add_option("-p", "--justPrint", dest="justPrint",
                      action='store_true', default=False,
                      help="Don't actually run GATK, just setup data files")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")
    if OPTIONS.validateInfoFile == None:
        parser.error("f option is required")
    if OPTIONS.dataDir == None:
        parser.error("d option is required")
    if OPTIONS.outputDir == None:
        parser.error("o option is required")
    # Create the working directories if needed.
    if not os.path.exists(OPTIONS.outputDir):
        os.mkdir(OPTIONS.outputDir)
    if not os.path.exists(OPTIONS.dataDir):
        os.mkdir(OPTIONS.dataDir)
    for line in open(OPTIONS.validateInfoFile):
        if line.strip() == "" or line[0] == '#':
            # comment line
            continue
        # line is of the format:
        # <reads.bam> <ref.fasta> <processingRegion>
        originalReads, ref, region = line.split()
        originalReads = os.path.expanduser(originalReads)
        ref = os.path.expanduser(ref)
        if not os.path.exists(originalReads) or not os.path.exists(ref):
            print 'Input files do not exist!', originalReads, ref
            sys.exit(1)
        head, read_filename = os.path.split(originalReads)
        filebase = os.path.splitext(read_filename)[0]
        # Derived paths inside the data/output directories.
        reads = os.path.join(OPTIONS.dataDir, read_filename)
        readsIndex = reads + '.bai'
        subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam')
        pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup')
        validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output')
        #print 'reads', reads
        # Stage 1: link + index the source BAM inside the data directory.
        if not os.path.exists(reads) or OPTIONS.rebuildAllFiles:
            farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads, just_print_commands=OPTIONS.justPrint)
        if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles:
            indexBAM(reads)
        # Stage 2: restrict to the requested region.  '*' or an interval
        # list means "whole file" -- just link the BAM and its index.
        if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
            if region == '*' or isIntervalFile(region):
                farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM, just_print_commands=OPTIONS.justPrint)
            else:
                cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
                farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
        subBAMIndex = subBAM+'.bai'
        if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles:
            if region == '*' or isIntervalFile(region):
                farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex, just_print_commands=OPTIONS.justPrint)
            else:
                indexBAM(subBAM)
        # Stage 3: build the consensus pileup, interval-filtered when needed.
        if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
            if isIntervalFile(region):
                # NOTE(review): despite its name this variable holds the
                # *unfiltered* pileup path, and the samtools command that
                # would create it is commented out below -- so the file must
                # already exist.  Confirm before relying on this path.
                filteredPileup = pileup + ".unfiltered"
                if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles:
                    cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup
                    #farm_commands.cmd(cmd, None, None)
                filterByInterval(readIntervalFile(region), filteredPileup, pileup)
            else:
                cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
                farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
        # Stage 4: launch the GATK ValidatingPileup comparison job.
        if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles):
            print validationOutput, 'does not exist'
            analysis = "ValidatingPileup"
            cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs
            if isIntervalFile(region):
                cmd += " --intervals " + region
            cmd += " -B pileup,SAMPileup," + pileup
            print cmd
            farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)

View File

@ -1,58 +0,0 @@
#!/util/bin/python
import Walker
from SimpleSAM import *
class WalkLociTest(object, Walker.Walker):
    """Demonstration locus walker: accepts every locus and counts them
    (mapfunc returns 1 per site; reducefunc sums).

    NOTE(review): inheriting from both object and the classic-style
    Walker.Walker is a Python-2-ism -- confirm the intended MRO.
    """
    def __init__(self):
        Walker.Walker.__init__( self, 'byLoci', {}, name = 'WalkLociTest' )
        self.debug = False  # set True for verbose per-locus tracing
    def filterfunc( self, *data ):
        # Accept every locus.
        return True
        #return datum.getPos() % 2 == 0
    def mapfunc( self, ref, chrom, pos, offsets, reads ):
        if self.debug:
            print '------------------------------'
            print ' locus', chrom, pos
            print ' offsets = ', offsets
            print ' reads = ', reads
        #
        # This is the heart of the test walker. You've got the reference, contig,
        # pos, offsets, and reads. The line below generate pile-ups
        #
        def getField(field):
            # Pull each read's per-offset attribute value (e.g. bases, quals).
            return map( lambda r, o: r.__getattribute__(field)[o], reads, offsets )
        def toASCII33( quals ):
            # Phred quality ints -> ASCII-33 (Sanger FASTQ) string.
            return ''.join( map( lambda q: chr(q+33), quals ))
        if False:
            # actually do some useful work
            refbase = ref[chrom][pos-1]
            bases = ''.join(getField('bases'))
            quals = toASCII33(getField('quals'))
            print chrom, pos, refbase, len(reads), all(map(lambda b: b == refbase, bases)), bases, quals
        if self.debug:
            for offset, read in zip( offsets, reads ):
                if self.debug:
                    print ' offset, read', offset, read
                    print read.bases[offset], read.quals[offset]
        #print datum
        return 1
        #return len(datum.getSeq())
        #return datum.getSeq() + '/'
        #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())
    #reduceDefault = ''
    def reducefunc( self, x, sum ):
        # Accumulate the per-locus counts.
        #print x
        return x + sum

View File

@ -1,88 +0,0 @@
class Walker:
    """Represents a functional walking object to be applied to SAM records.
    Actually useful programs will inherit from this walker object to implement
    one or more of the blank operators provided by this module. Effectly, this
    engine applies a walker object in a standard order to each record in a SAM/BAM
    file:

        walker = MyWalker(args)
        samRecords = parseReadsInFiles( files ):
        results = []
        foreach record in files:
            if walker.filterfunc( record ):
                x = walker.mapfunc( record )
                results.append(x)
        reduced = reduce( walker.reducefunc, results, walker.reduceDefault )

    where:

        filterfunc( data ) -> returns true or false if the data should be
            included or excluded from mapping
        mapfunc( data ) -> applied to each record, locus, or pair of records, depending
            on the walker type
        reducefunc( x, sum ) -> combines partial results x with the accumulating
            results sum across all the data
    """
    def __init__(self, walkerType, options, name = None, desc = None ):
        """Record the walker type/name/desc into the options dict.

        Note: the caller's `options` dict is stored by reference and
        mutated by setOption.
        """
        self.options = options
        self.setOption( 'walkerType', walkerType )
        self.setOption( 'name', name )
        self.setOption( 'desc', desc )
        # print(x) with a single argument is equivalent to the old Python 2
        # print statement, and also valid Python 3.
        print(self.options)
    # Initial accumulator handed to reduce().
    reduceDefault = 0
    #
    # walker types
    #
    def isRecordWalker( self ):
        return self.getOption('walkerType') == 'byRecord'
    def isPairedRecordWalker( self ):
        return self.getOption('walkerType') == 'byRecordPairs'
    def isLociWalker( self ):
        return self.getOption('walkerType') == 'byLoci'
    #
    # Option processing
    #
    def getOption( self, flag ):
        # Returns None (rather than raising) for unknown flags.
        if self.hasOption( flag ):
            return self.options[flag]
        else:
            return None
    def setOption( self, flag, value ):
        self.options[flag] = value
    def hasOption(self, flag):
        return flag in self.options
    def optionEnabled(self, flag):
        # Present AND not explicitly False ('<>' was Python-2-only syntax;
        # '!=' is equivalent).
        return flag in self.options and self.options[flag] != False
    def optionDisabled(self, flag):
        return not self.optionEnabled(flag)
    #
    # Default functions -- subclasses override these.
    #
    def filterfunc( self, datum ):
        # Default: include everything.
        return True
    def mapfunc( self, datum ):
        # Default: identity.
        return datum
    def reducefunc( self, mapResult, sum ):
        # Default: count the mapped results.
        return sum + 1
if __name__ == "__main__":
    # NOTE(review): the original called main(), which is not defined anywhere
    # in this module and raised NameError when run as a script.  The module
    # only provides the Walker base class, so running it directly is a no-op.
    pass

View File

@ -1,87 +0,0 @@
#!/usr/bin/env python
import string, sys
class aln_record:
    "Stores one record of data from a MAQ .aln.txt file"
    # Column names of MAQ's 16-field alignment text output, in order.
    field_names = (
        "read name",
        "chromosome",
        "position",
        "strand",
        "insert size from the outer coorniates of a pair",
        "paired flag",
        "mapping quality",
        "single-end mapping quality",
        "alternative mapping quality",
        "number of mismatches of the best hit",
        "sum of qualities of mismatched bases of the best hit",
        "number of 0-mismatch hits of the first 24bp",
        "number of 1-mismatch hits of the first 24bp on the reference",
        "length of the read",
        "read sequence",
        "sequence quality"
    )
    # Width used to right-align labels in __str__.
    max_field_name_len = max(map(len, field_names))

    def __init__(self, obj, parse = True):
        """Parse one .aln.txt line; parse=False keeps every field a string."""
        self.fields = []
        if isinstance(obj, str):
            self.setValuesFromString( obj, parse )
        else:
            raise TypeError("aln_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line, parse = True ):
        if parse:
            formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
            # Build an explicit list so self.fields stays indexable: the
            # original's bare 2-arg map() is a lazy iterator on Python 3,
            # and on Python 2 it padded short lines with None (crashing int()).
            self.fields = [f(v) for f, v in zip(formats, line.split())]
        else:
            self.fields = line.split()

    def __str__(self):
        # One "label : value" line per field, labels right-aligned.
        s = ""
        for n,v in zip(aln_record.field_names, self.fields):
            s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
        return s
        #return string.join( map( str, self.fields ), ' ')

    # Field accessors; offset() is 0-based, pos() is MAQ's 1-based position.
    def id(self): return self.fields[0]
    def contig(self): return self.fields[1]
    def offset(self): return self.fields[2]-1
    def pos(self): return self.fields[2]
    # Quality of read mapping (only maq gives this field)
    def map_qual(self): return self.fields[6]
    #def offset_end(self): return self.fields[8]
    #def pos_end(self): return self.fields[8]+1
    #def linear_start(self): return self.fields[9]
class aln_file:
    """Iterable over the aln_record entries of a MAQ .aln.txt file."""
    def __init__(self, filename, parse=True):
        self.filename = filename
        self.parse = parse
        # NOTE(review): the handle is held for the object's lifetime and
        # never explicitly closed.
        self.faln = open(self.filename)

    def RecordGenerator(self):
        # Yield one parsed record per line.
        for line in self.faln:
            yield aln_record(line, self.parse)
        # FIX: the original ended with `raise StopIteration`, which inside a
        # generator is a RuntimeError since PEP 479 (Python 3.7).  Falling
        # off the end terminates iteration identically on all versions.

    def __iter__(self):
        return self.RecordGenerator()
if __name__ == "__main__":
    # Smoke test: dump every record of the .aln.txt file named on the
    # command line.
    if len(sys.argv) != 2:
        print "To test aln_file class:\naln_file.py ALN_FILE"
    else:
        count = 0
        for aln in aln_file(sys.argv[1]):
            print aln
            count += 1
            #if count > 5:
            # break

View File

@ -1,87 +0,0 @@
#!/usr/bin/env python
import string, sys
class aln_record:
    "Stores one record of data from a MAQ .aln.txt file"
    # Column names of MAQ's 16-field alignment text output, in order.
    field_names = (
        "read name",
        "chromosome",
        "position",
        "strand",
        "insert size from the outer coorniates of a pair",
        "paired flag",
        "mapping quality",
        "single-end mapping quality",
        "alternative mapping quality",
        "number of mismatches of the best hit",
        "sum of qualities of mismatched bases of the best hit",
        "number of 0-mismatch hits of the first 24bp",
        "number of 1-mismatch hits of the first 24bp on the reference",
        "length of the read",
        "read sequence",
        "sequence quality"
    )
    # Width used to right-align labels in __str__.
    max_field_name_len = max(map(len, field_names))

    def __init__(self, obj, parse = True):
        """Parse one .aln.txt line; parse=False keeps every field a string."""
        self.fields = []
        if isinstance(obj, str):
            self.setValuesFromString( obj, parse )
        else:
            raise TypeError("aln_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line, parse = True ):
        if parse:
            formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
            # Build an explicit list so self.fields stays indexable: the
            # original's bare 2-arg map() is a lazy iterator on Python 3,
            # and on Python 2 it padded short lines with None (crashing int()).
            self.fields = [f(v) for f, v in zip(formats, line.split())]
        else:
            self.fields = line.split()

    def __str__(self):
        # One "label : value" line per field, labels right-aligned.
        s = ""
        for n,v in zip(aln_record.field_names, self.fields):
            s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
        return s
        #return string.join( map( str, self.fields ), ' ')

    # Field accessors; offset() is 0-based, pos() is MAQ's 1-based position.
    def id(self): return self.fields[0]
    def contig(self): return self.fields[1]
    def offset(self): return self.fields[2]-1
    def pos(self): return self.fields[2]
    # Quality of read mapping (only maq gives this field)
    def map_qual(self): return self.fields[6]
    #def offset_end(self): return self.fields[8]
    #def pos_end(self): return self.fields[8]+1
    #def linear_start(self): return self.fields[9]
class aln_file:
    """Iterable over the aln_record entries of a MAQ .aln.txt file."""
    def __init__(self, filename, parse=True):
        self.filename = filename
        self.parse = parse
        # NOTE(review): the handle is held for the object's lifetime and
        # never explicitly closed.
        self.faln = open(self.filename)

    def RecordGenerator(self):
        # Yield one parsed record per line.
        for line in self.faln:
            yield aln_record(line, self.parse)
        # FIX: the original ended with `raise StopIteration`, which inside a
        # generator is a RuntimeError since PEP 479 (Python 3.7).  Falling
        # off the end terminates iteration identically on all versions.

    def __iter__(self):
        return self.RecordGenerator()
if __name__ == "__main__":
    # Smoke test: dump every record of the .aln.txt file named on the
    # command line.
    if len(sys.argv) != 2:
        print "To test aln_file class:\naln_file.py ALN_FILE"
    else:
        count = 0
        for aln in aln_file(sys.argv[1]):
            print aln
            count += 1
            #if count > 5:
            # break

View File

@ -1,37 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
if __name__ == "__main__":
    # For each output file given on the command line, report the first/last
    # "chr:pos" seen and the fraction of the -s region covered so far.
    usage = "usage: %prog files"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--size", dest="regionSize",
        type="int", default=None, help="")
    (OPTIONS, args) = parser.parse_args()
    def extract(line):
        # Parse "chr:pos ..." -> (chr, int(pos)); (None, None) if the first
        # token is not of that form.
        s = line.split()
        t = s[0].split(":")
        if len(t) == 2:
            chr, pos = t
            return chr, int(pos)
        else:
            return None, None
    for file in args:
        chr, lastPos, startPos, calledBases, progress = None, None, None, None, None
        lastLine = None
        # Scan the whole file, remembering the first parsable position and
        # the final line.
        for line in open(file):
            if startPos == None:
                chr, startPos = extract(line)
            lastLine = line
        if lastLine <> None and startPos <> None:
            lastPos = extract(lastLine)[1]
            calledBases = lastPos - startPos
            # Percent of the -s region covered between first and last call.
            progress = "%.2f" % (float(calledBases) / OPTIONS.regionSize * 100)
        print file, chr, startPos, lastPos, calledBases, progress

View File

@ -1,591 +0,0 @@
#!/usr/bin/env python
import sys, string
import os
import re
from itertools import *
from optparse import OptionParser
from memo import DiskMemoize, time_func
class ref_genome:
    """Reads reference genome in FASTA format into a dict"""
    # NOTE(review): sequences are stored on the *class* attribute chr_offset
    # (45 slots, indexed by contig load order), so every instance shares the
    # most recently loaded genome.
    def __init__(self, ref_genome_file):
        ref_genome.chr_offset = [[] for i in range(45)]
        chr_id = 0
        seq = ""
        for line in open(ref_genome_file):
            if line.startswith(">"):
                # Echo each contig header as a progress indicator.
                print line[1:],
                if line.startswith(">chrM"): # skip first > line
                    continue
                # A new header: store the previous contig's sequence and
                # start accumulating the next one.
                ref_genome.chr_offset[chr_id] = seq
                chr_id += 1
                seq = " " # make it 1 indexed instead of 0 indexed
                #if chr_id > 2:
                # break
            else:
                seq += line.rstrip().upper()
        # Store the final contig.
        ref_genome.chr_offset[chr_id] = seq
    def __getitem__(self, key):
        # Index by contig id (see AffyChr2Index for the name -> id mapping).
        return ref_genome.chr_offset[key]
# Short Affy-style chromosome name -> index into ref_genome.chr_offset
# (MT=0, '1'..'22' numeric, X=23, Y=24).
AffyChr2Index = dict()
for i in range(1,23):
    AffyChr2Index[str(i)] = i
AffyChr2Index['MT'] = 0
AffyChr2Index['X'] = 23
AffyChr2Index['Y'] = 24
class GenotypeCall:
    """One genotype call at a genomic site ('chr:pos') with a confidence LOD.

    Equality and hashing are by site only, so calls at the same position
    compare equal regardless of genotype.
    """
    # NOTE(review): the class-level reference genome is commented out; the
    # refbase()/ref_het_hom() paths only work if GenotypeCall.ref is
    # assigned first.
    #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
    def __init__( self, chr, pos, genotype, snpP, lod ):
        self.chr = chr
        self.pos = int(pos)
        self._site = chr + ':' + str(self.pos)
        self._isSNP = snpP
        # Canonical diploid genotype string, e.g. "A/C": bases upper-cased
        # and sorted so heterozygote order is deterministic.  ('/'.join on a
        # generator replaces the Python-2-only string.join/string.upper.)
        self.genotype = '/'.join(b.upper() for b in sorted(genotype))
        self.lod = lod
    def refbase(self):
        # Requires GenotypeCall.ref (see NOTE above).
        return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]
    def __hash__(self):
        return hash(self._site)
    def __eq__(self, other):
        return self._site == other._site
    def site(self): return self._site
    def isSNP(self): return self._isSNP
    def ref_het_hom(self):
        """Classify the call: 0 = hom-ref, 1 = het, 2 = hom non-ref."""
        if self.genotype[0] != self.genotype[2]:
            return 1 # het(erozygous non-ref)
        else:
            # homozygous something
            # BUG FIX: the original compared against the *bound method*
            # self.refbase (always unequal to a base character); the call
            # was clearly intended.
            if self.genotype[0] == self.refbase():
                return 0 # ref
            else:
                return 2 # hom(ozygous non-ref)
    def isHET(self): return self.genotype[0] != self.genotype[2]
    def isHOM(self): return self.genotype[0] == self.genotype[2]
    def __str__(self):
        return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
# MAQ/IUB one-letter consensus code -> list of constituent bases
# (e.g. 'M' = A/C heterozygote); plain bases map to homozygous pairs.
MAQGenotypeEncoding = {
    'A' : ['A', 'A'],
    'C' : ['C', 'C'],
    'T' : ['T', 'T'],
    'G' : ['G', 'G'],
    "M" : ['A', 'C'],
    'K' : ['G', 'T'],
    'Y' : ['C', 'T'],
    'R' : ['A', 'G'],
    'W' : ['A', 'T'],
    'S' : ['C', 'G'],
    'D' : ['A', 'G', 'T'],
    'B' : ['C', 'G', 'T'],
    'H' : ['A', 'C', 'T'],
    'V' : ['A', 'C', 'G'],
    'N' : ['A', 'C', 'G', 'T'] }
# MAQ contig name ('chr1', 'chrM', ...) -> short Affy-style name ('1', 'MT', ...).
MAQ2STDChr = dict()
for i in range(1,23):
    MAQ2STDChr['chr'+str(i)] = str(i)
MAQ2STDChr['chrM'] = 'MT'
MAQ2STDChr['chrX'] = 'X'
MAQ2STDChr['chrY'] = 'Y'
def convertMAQChr(maqChr):
    """Translate a MAQ contig name ('chr1', 'chrX', ...) into the short
    Affy-style name ('1', 'X', ...); unknown contigs map to '?'."""
    return MAQ2STDChr.get(maqChr, '?')
def convertMAQGenotype( oneBaseCode ):
    # Expand MAQ's one-letter IUB consensus code (e.g. 'M') into its list of
    # underlying bases (e.g. ['A', 'C']); raises KeyError on unknown codes.
    return MAQGenotypeEncoding[oneBaseCode]
def internalReadSNPFile( parse1, filename ):
    """Shared reader: apply parse1 to each line of filename, keeping the
    truthy results (parse1 returns False for filtered-out lines), stopping
    after OPTIONS.debug_lines accepted records."""
    result = []
    snps_extracted = 0
    for snp in imap( parse1, open(filename) ):
        if snp:
            result.append(snp)
            snps_extracted += 1
        if snps_extracted > OPTIONS.debug_lines:
            break
    print len(result),"genotypes extracted"
    return result
def snpMAP(snps):
    """Index a list of genotype calls by their 'chr:pos' site string.

    If two calls share a site, the later one wins (same as the original's
    insertion loop)."""
    return dict((snp.site(), snp) for snp in snps)
def overlappingSites( snps1, snps2 ):
    # Return the set of 'chr:pos' sites present in both call lists,
    # printing summary counts along the way.
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    shared = set(map1.keys()) & set(map2.keys())
    print 'Number of snp1 records', len(map1)
    print 'Number of snp2 records', len(map2)
    print 'Number of shared sites', len(shared)
    # NOTE(review): dumps every snp1 record -- very noisy for large sets.
    print "\n".join(map(str,snps1))
    return shared
def readMAQSNPs(filename):
    """Parse MAQ cns2snp output into GenotypeCalls, dropping calls whose
    consensus quality is below 10 * OPTIONS.lod."""
    # Each line consists of:
    # chromosome
    # position
    # reference base
    # consensus base
    # Phred-like consensus quality
    # read depth
    # the average number of hits of reads covering this position
    # the highest mapping quality of the reads covering the position
    # the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
    # the second best call
    # log likelihood ratio of the second best and the third best call
    # and the third best call.
    #
    # Also, note that:
    #
    # What do those "S", "M" and so on mean in the cns2snp output?
    # They are IUB codes for heterozygotes. Briefly:
    #
    # M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
    def read1(line):
        # Parse the first six columns; returns False for filtered lines.
        formats = [str, int, str, str, int, int]
        vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
        alignQual = vals[4]
        # MAQ quality is Phred-scaled, hence the factor of 10 vs. LOD.
        if alignQual >= (10*OPTIONS.lod):
            # A site is a SNP when consensus differs from the reference base.
            return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )
        else:
            #print 'Filtering', alignQual, vals
            return False
    return internalReadSNPFile( read1, filename )
# Global option namespace -- presumably populated by an OptionParser in the
# script's entry point (not visible in this chunk); readers consult
# OPTIONS.lod, OPTIONS.debug_lines and OPTIONS.verbose.
OPTIONS = None
def MerlinChr(index):
    """Map Merlin's numeric chromosome index to a name: 0 -> 'MT',
    23 -> 'X', 24 -> 'Y', anything else to its decimal string."""
    special = {0: 'MT', 23: 'X', 24: 'Y'}
    if index in special:
        return special[index]
    return str(index)
def readMerlinSNPs(filename):
    """Parse Merlin caller output into GenotypeCalls, keeping only sites
    classified as SNPs with bestVsRef LOD >= OPTIONS.lod."""
    # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
    # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
    # chr:pos ref genotype bestVsRef bestVsNextBest class ...
    def read1(line):
        # Column 0 is "chrIndex:pos"; the genotype string is sorted so the
        # base order is canonical.  Returns False for filtered lines.
        formats = [lambda x: x.split(':'), str, sorted, float, float, str]
        vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
        bestVsRef, bestVsNext = vals[3:5]
        # The class column carries a '-SNP' suffix for variant sites.
        isSNP = vals[5].find('-SNP') <> -1
        if bestVsRef >= OPTIONS.lod and isSNP:
            # Merlin positions are 0-based; +1 converts to 1-based.
            return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )
        else:
            return False
    return internalReadSNPFile( read1, filename )
def readSNPfile(filename, format):
    """Dispatch to the reader for `format` ('merlin' or 'maq', any case) and
    return its calls as a list; raises Exception for unknown formats."""
    readers = {'merlin': readMerlinSNPs, 'maq': readMAQSNPs}
    key = format.lower()
    if key not in readers:
        raise Exception('Unknown SNP file format ' + format)
    return list(readers[key](filename))
def readAffyFile(filename):
    """Parse an Affy genotype table (header + one call per line) into a
    list of GenotypeCalls, capped at OPTIONS.debug_lines entries."""
    # chrom position genotype probe_set_id dbsnp_id
    # 1 84647761 TC SNP_A-1780419 rs6576700
    # 5 156323558 GG SNP_A-1780418 rs17054099
    def read1(line):
        formats = [str, int, sorted, str, str]
        vals = map( lambda f, x: f(x), formats, line.split() )
        try:
            # Numeric chromosome names pass through unchanged...
            chr = str(int(vals[0]))
        except:
            # ...otherwise assume a MAQ-style name like 'chrX'.
            chr = convertMAQChr(vals[0])
        #print 'CHR', chr, vals[0]
        # Affy calls are treated as truth: isSNP=False, LOD fixed at 100.
        return GenotypeCall( chr, vals[1], vals[2], False, 100 )
    file = open(filename)
    file.readline() # skip header
    #affyData = map( read1, file )
    affyData = []
    for index, line in enumerate(file):
        affyData.append(read1(line))
        if index > OPTIONS.debug_lines:
            break
        if index % 10000 == 0:
            # Progress indicator for large tables.
            print index
    # Give a chance to use list before creating dictionary
    return affyData
    #print "1111111"
    #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )
def equalSNPs(snp1, snp2):
    """Two calls agree when their canonical genotype strings match."""
    left = snp1.genotype
    right = snp2.genotype
    return left == right
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
# counts = dict(states)
#
# def incShared(state, equalP ):
# counts[state] += 1
# if equalP:
# counts[state + '-equal'] += 1
#
# nTestSites = 0
# for i, testSNP in izip( count(), testSet ):
# if includeVector <> None and not includeVector[i]:
# # we are skiping this site
# continue
#
# nTestSites += 1
#
# if testSNP.isSNP():
# counts['test-snp'] += 1
#
# #print testSNP.site()
# if testSNP.site() in truthSet:
# truth = truthSet[testSNP.site()]
# eql = equalSNPs( testSNP, truth )
#
# incShared( 'shared', eql )
# if testSNP.isSNP():
# if truth.isHOM(): incShared( 'hom-snp', eql )
# else: incShared( 'het-snp', eql )
# else:
# if truth.isHOM(): incShared( 'hom-ref', eql )
# else: incShared( 'het-ref', eql )
#
# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, counts
#
# counts['tested'] = nTestedSites
#
# return counts
class ConcordanceData:
    """Tally of genotype agreement between two call sets at shared sites."""

    def __init__(self, name, file1count, file2count):
        self.name = name
        self.nFile1Sites = file1count   # num sites in file 1
        self.nFile2Sites = file2count   # num sites in file 2
        self.nSharedSites = 0           # pairs mapping to the same genome position
        self.nEqualSites = 0            # pairs with identical genotypes

    def inc(self, truthSNP, testSNP):
        """Record one site present in both sets."""
        self.nSharedSites += 1
        if equalSNPs(testSNP, truthSNP):
            self.nEqualSites += 1

    def rate(self):
        """Percent of shared sites with matching genotypes (0 when nothing
        is shared -- the max() guards against division by zero)."""
        denom = max(self.nSharedSites, 1)
        return (100.0 * self.nEqualSites) / denom

    def __str__(self):
        return '%d %d %.2f' % (self.nSharedSites, self.nEqualSites, self.rate())
def concordance( truthSet, testSet, sharedSites = None ):
    """Compare testSet calls against a truthSet dict (site -> call),
    returning (nTestSites, allData, sharedData, uniqueData)."""
    # calculates a bunch of useful stats about the two
    # data genotype call sets above
    #
    # The 2 calls in main work like this:
    # affy, snp1, snp1_snp2_shared
    # affy, snp2, snp1_snp2_shared
    nTestSites = 0
    # Now for each of the calls to concordance, we generate 3 sets:
    # - allData: all SNP1 sites that are also in Affy
    allData = ConcordanceData('all', len(truthSet), len(testSet))
    # - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy
    sharedData = ConcordanceData('shared', len(truthSet), len(testSet))
    # - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy
    uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))
    for i, testSNP in izip( count(), testSet ):
        nTestSites += 1
        if testSNP.site() in truthSet:
            truthSNP = truthSet[testSNP.site()]
            allData.inc( truthSNP, testSNP )
            # When a shared-site set is supplied, split the tally into
            # shared vs unique sites as well.
            if sharedSites <> None:
                if testSNP.site() in sharedSites:
                    sharedData.inc( truthSNP, testSNP )
                else:
                    uniqueData.inc( truthSNP, testSNP )
        # Periodic progress line for long runs.
        if OPTIONS.verbose and nTestSites % 100000 == 0:
            #print nTestSites, nSharedSites, nEqualSites
            print nTestSites, allData, sharedData, uniqueData
    return nTestSites, allData, sharedData, uniqueData
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
# counts = dict(states)
#
# nTestSites = 0
# nSharedSites = 0
# nEqualSites = 0
# for i, testSNP in izip( count(), testSet ):
# nTestSites += 1
# #print testSNP.site()
# if testSNP.site() in truthSet:
# nSharedSites += 1
# if equalSNPs( testSNP, truthSet[testSNP.site()] ):
# nEqualSites += 1
# #else:
# # print '~', testSNP, truthSet[testSNP.site()]
# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
#
# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]
def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
nTestSites, allData, sharedData, uniqueData = results
def print1(data):
print '------------------------------------------------------------'
print 'Concordance results', data.name, 'sites'
print ' -> Genotype file1', filename1
print ' -> Genotype file2', filename2
print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites)
print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites)
print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites))
print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites))
print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites)
print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate())
print1(allData)
if hasSharedSites:
print1( sharedData ) # shared between SNP1 and SNP2
print1( uniqueData ) # unique to SNP1 or to SNP2 only
def dump_shared_snps( affys, snp_list1, snp_list2 ):
print len(affys), len(snp_list1), len(snp_list2)
snps1 = snpMAP(snp_list1)
snps2 = snpMAP(snp_list2)
snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)
snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)
affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)
snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)
print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)
fsnp = open ("snp.tab","w")
print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"
for site in snp1and2_affy_sites:
snp1 = snps1[site]
snp2 = snps2[site]
affy = affys[site]
print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
try:
snp1div2 = snp1.lod / snp2.lod
except ZeroDivisionError:
snp1div2 = 1000
print >>fsnp, "%5.2f" % snp1div2,
print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,
print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),
# Calculte meta_lod from the two lods
if snp1.genotype == snp2.genotype:
meta_lod = snp1.lod + snp2.lod
meta_genotype = snp1.genotype
else:
if snp1.lod > snp2.lod:
meta_lod = snp1.lod
meta_genotype = snp1.genotype
else:
meta_lod = snp2.lod
meta_genotype = snp2.genotype
meta_inc = meta_genotype != affy.genotype
print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
print >>fsnp, affy.ref_het_hom(),
print >>fsnp, affy.refbase()
def intersection_union_snps( affy, snps1, snps2 ):
    """Interactively plot (via rpy2) the LODs of sites shared by both call
    sets, colored by which caller(s) agree with the Affy truth genotype.

    NOTE(review): the first 'plots' assignment below is immediately
    overwritten, and everything after the bare 'return' is unreachable --
    both look like leftover experiment toggles and are preserved as-is.
    """
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    # Sites called by both callers but absent from the Affy truth set.
    shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
    nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
    # Sites called by both callers AND genotyped on the Affy chip.
    shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())
    #shared = []
    #for site in shared_sites:
    #    shared.append((map1[site], map2[site]))
    #print "Shared:",len( shared )
    #shared_x = [s[0].lod for s in shared]
    #shared_y = [s[1].lod for s in shared]
    both_corr = []
    snp1_corr = []
    snp2_corr = []
    neither_corr = []
    # given two bools telling whether snp1 and snp2 are correct,
    # return the correct object
    #
    #              snp2 incorrect, snp2 correct
    truth_list = [[neither_corr, snp2_corr],  # snp1 incorrect
                  [snp1_corr, both_corr]]     # snp1 correct
    for site in shared_affy_sites:
        snp1_true = equalSNPs(map1[site], affy[site])
        snp2_true = equalSNPs(map2[site], affy[site])
        # The bools index the 2x2 truth table above (False=0, True=1).
        truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )
    print "Beginning plot..."
    import rpy2.robjects as robj
    robj.r('X11(width=15, height=15)')
    XY_MAX = 25
    plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    robj.r.plot([0, XY_MAX], [0, XY_MAX], \
                xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
                ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
                main = "Shared SNP site LODs", \
                xlim = robj.FloatVector([0,XY_MAX]), \
                ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
                pch = 19, \
                cex = 0.0)
    # Overlay one point cloud per agreement category.
    for xy, color in plots:
        print "X"
        robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
        print "Color:",color
        print "Len:",len(xy)
        #print "\n".join(["%25s %25s" % pt for pt in xy])
    #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
    #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
    raw_input("Press enter to continue")
    return
    # ---- unreachable below this point: older text-only summary ----
    ss1 = set(snps1)
    ss2 = set(snps2)
    print "Shared:",len( ss1.intersection(ss2) )
    print "Snp1 only:",len( ss1.difference(ss2) )
    print "Snp2 only:",len( ss2.difference(ss1) )
    print "Snp1 total:",len( ss1 )
    print "Snp2 total:",len( ss2 )
def count_het_sites(snp_list):
hets = 0
for snp in snp_list:
if snp.isHET():
hets += 1
print hets,"hets,",len(snp_list),"total,",
print "%.1f" % (float(hets)/len(snp_list)*100)
def main(argv):
global OPTIONS, ROOT
usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
parser = OptionParser(usage=usage)
parser.add_option("-t", "--truth", dest="affy",
type="string", default=None,
help="Affy truth file")
parser.add_option("-1", "--snp1", dest="snp1",
type="string", default=None,
help="Affy truth file")
parser.add_option("-2", "--snp2", dest="snp2",
type="string", default=None,
help="Affy truth file")
parser.add_option("", "--f1", dest="format1",
type="string", default=None,
help="File type of snpfile1")
parser.add_option("", "--f2", dest="format2",
type="string", default=None,
help="File type of snpfile2")
parser.add_option("-l", "--lod", dest="lod",
type="float", default=5.0,
help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="Verbose output")
parser.add_option("-d", "--debug_lines", dest="debug_lines",
type='float', default=sys.maxint,
help="Number of input data lines to process for debugging")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
if OPTIONS.affy == None:
parser.error("No affy data specified")
if OPTIONS.format1 == None:
parser.error("First format cannot be none")
if OPTIONS.snp2 <> None and OPTIONS.format2 == None:
parser.error("snp2 file given but format was not specified")
# Load reference genome
#ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
#sys.exit()
if 1:
print "Reading Affy truth data..."
#readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )
readAffyFile2 = time_func(readAffyFile)
affy_list = readAffyFile2( filename=OPTIONS.affy )
#count_het_sites(affy_list)
#sys.exit()
affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )
print 'Read affy truth data:'
print ' -> number of genotyped loci', len(affy)
#readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )
readSNPfile2 = time_func(readSNPfile)
print "Reading SNPs 1 file..."
snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )
if OPTIONS.snp2 <> None:
print "Reading SNPs 2 file..."
snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )
dump_shared_snps( affy, snps1, snps2 )
#intersection_union_snps( affy, snps1, snps2 )
#sys.exit()
sharedSites = None
if OPTIONS.snp2 <> None:
sharedSites = overlappingSites( snps1, snps2 )
results1 = concordance( affy, snps1, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
results2 = concordance( affy, snps2, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )
else:
results = concordance( affy, snps1 )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )
# Script entry point.  main() ignores its argv argument and re-parses the
# command line itself via optparse.
if __name__ == "__main__":
    main(sys.argv)

View File

@ -1,81 +0,0 @@
#!/usr/bin/env python
# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output
import sys, re, os, farm_commands
# The GATK/Sting build location comes from the environment; bail out early
# with a readable message instead of failing later on a bad jar path.
sting_dir = os.environ.get("STING_DIR")
if sting_dir == None:
    print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
    sys.exit()
# Exactly one positional argument (the concordance VCF) is required.
if len(sys.argv) < 2:
    print "Usage: PROG CONCORDANCE_FILE"
    sys.exit()
class callset_data:
    """Running tally for one Venn call set: total SNPs, dbSNP hits, Ti/Tv."""
    def __init__(self, snps, in_dbSNP, titv):
        self.snps = snps
        self.in_dbSNP = in_dbSNP
        self.titv = titv
    def inc_snps(self):
        # One more SNP observed in this call set.
        self.snps = self.snps + 1
    def inc_dbsnps(self):
        # One more SNP carrying a dbSNP rsID.
        self.in_dbSNP = self.in_dbSNP + 1
    def __str__(self):
        template = "SNPs: %10d in_dbSNP: %d TiTv: %d"
        return template % (self.snps, self.in_dbSNP, self.titv)
sets = dict()
concordance_filename = os.path.abspath(sys.argv[1])
for line in file(concordance_filename):
fields = line.split("\t")
filter = 0
if not re.search('HybridSelectionVariantFilter', line):
match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
if match:
my_set = match.groups()[0]
if sets.has_key(my_set):
sets[my_set].inc_snps()
if fields[2] != ".":
sets[my_set].inc_dbsnps()
else:
sets[my_set] = callset_data(1, 0, 0)
print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])
# Second pass: split the concordance VCF into one VCF per Venn set and queue
# a VariantEval run on each split (skipped when the .varianteval file exists).
print "Splitting concordance file\n"
splits_dir = concordance_filename+".splits"
for vennSet in sets.keys():
    print vennSet
    output_head = splits_dir+"_"+vennSet
    vcf_file = splits_dir+"_"+vennSet+".vcf"
    varianteval_file = splits_dir+"_"+vennSet+".varianteval"
    if not os.path.exists(varianteval_file):
        # Copy header lines plus the records annotated with this Venn set.
        fout = open(vcf_file, "w")
        for line in file(concordance_filename):
            if line.startswith("#"):
                fout.write(line)
            else:
                match = re.search('Venn='+vennSet, line)
                if match:
                    fout.write(line)
        # NOTE(review): varianteval_out is assigned but never used below, and
        # fout is never explicitly closed.
        varianteval_out = os.path.basename(concordance_filename)+".vcf"
        # NOTE(review): sting_dir is passed straight after -jar; presumably
        # STING_DIR points at the GATK jar itself -- confirm.
        farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
                          +"-T VariantEval "
                          +"-B eval,VCF,"+vcf_file+" "
                          +"-o "+varianteval_file+" "
                          +"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
                          "gsa")

View File

@ -1,80 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from gatkConfigParser import *
import glob
if __name__ == "__main__":
    # Command-line driver: recalibrate base qualities for one BAM via the GATK
    # CountCovariates -> TableRecalibration pipeline, dispatching each step to
    # the farm with LSF job-dependency chaining.
    usage = """usage: %prog [-c config.cfg]* input.bam output.bam"""
    parser = OptionParser(usage=usage)
    parser.add_option("-A", "--args", dest="args",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-m", "--mode", dest="RecalMode",
                      type="string", default="SEQUENTIAL",
                      help="Mode argument to provide to table recalibrator")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-d", "--dir", dest="scratchDir",
                      type="string", default="./",
                      help="Output directory")
    # NOTE(review): "providedm" below is a typo in the user-facing help text.
    parser.add_option("-w", "--wait", dest="initialWaitID",
                      type="string", default=None,
                      help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments")
    config = gatkConfigParser(OPTIONS.configs)
    inputBAM = args[0]
    outputBAM = args[1]
    # Derived names: <scratch>/<output-root>.{init,recal}.recal_data.csv
    rootname = os.path.split(os.path.splitext(outputBAM)[0])[1]
    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'
    initDataFile = covariateInitial + '.recal_data.csv'
    covariateRecal = covariateRoot + '.recal'
    recalDataFile = covariateRecal + '.recal_data.csv'
    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)
    def covariateCmd(bam, outputDir):
        # Build the GATK CountCovariates command line for one BAM.
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add
    def recalibrateCmd(inputBAM, dataFile, outputBAM):
        # Build the GATK TableRecalibration command line.
        return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
    def runCovariateCmd(inputBAM, dataFile, dir, jobid):
        # Dispatch CountCovariates unless its output already exists; returns
        # the farm job id (or None when skipped).
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
            cmd = covariateCmd(inputBAM, dir)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
    #
    # Actually do some work here
    #
    jobid = OPTIONS.initialWaitID
    if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
    if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
        cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
        jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
    # Final pass: covariates on the recalibrated BAM, for before/after comparison.
    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)

View File

@ -1,24 +0,0 @@
import os.path
from optparse import OptionParser
def main():
global OPTIONS
usage = "usage: %prog [options] files"
parser = OptionParser(usage=usage)
parser.add_option("-c", "--category", dest="category",
type='string', default="Anonymous",
help="If provided, catagory name")
(OPTIONS, args) = parser.parse_args()
if len(args) == 0:
parser.error("incorrect number of arguments")
print '<Category name="%s">' % OPTIONS.category
for arg in args:
path = os.path.abspath(arg)
name = os.path.basename(arg)
print ' <Resource name="%s" path="%s" serverURL="http://www.broad.mit.edu/webservices/igv"/>' % (name, path)
print '</Category>'
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,107 +0,0 @@
import os.path
import sys
from optparse import OptionParser
from vcfReader import *
from itertools import *
import faiReader
def main():
global OPTIONS
usage = "usage: %prog [options] file1 ... fileN"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--f", dest="fai",
type='string', default=None,
help="FAI file defining the sort order of the VCF")
parser.add_option("-o", "--o", dest="output",
type='string', default=None,
help="if provided, output will go here instead of stdout")
parser.add_option("-a", "--assumeSorted", dest="assumeSorted",
action='store_true', default=False,
help="If provided, this assumes the input VCF files are themselves sorted, enabling a simple efficent merge")
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="If provided, verbose progress will be enabled")
(OPTIONS, args) = parser.parse_args()
if len(args) == 0:
parser.error("Requires at least 1 VCF to merge")
order = None
if OPTIONS.fai <> None:
if OPTIONS.verbose: print 'reading FAI', OPTIONS.fai
order = faiReader.readFAIContigOrdering(OPTIONS.fai)
#print 'Order', order
if OPTIONS.output != None:
out = open(OPTIONS.output,'w')
else:
out = sys.stdout
if OPTIONS.assumeSorted:
mergeSort(out, args, order)
else:
memSort(out, args, order)
def cmpVCFRecords(order, r1, r2):
    """cmp()-style comparator for two VCF records: order by contig rank (when
    an FAI ordering dict is supplied), then by position.

    Returns a negative, zero, or positive int.  Fix: the deprecated '<>'
    operator is replaced with 'is not None' / '!='.
    """
    if order is not None:
        c1 = order[str(r1.getChrom())]
        c2 = order[str(r2.getChrom())]
        orderCmp = cmp(c1, c2)
        if orderCmp != 0:
            return orderCmp
    # Same contig (or no explicit contig ordering): coordinate order decides.
    return cmp(r1.getPos(), r2.getPos())
def mergeSort(out, args, order):
    """Concatenate pre-sorted VCF files to 'out' in genomic order.

    Reads only the FIRST record of each input to decide the file order, then
    streams each whole file in turn -- assumes every input is itself sorted
    and that the files do not interleave coordinates.
    """
    #print 'MergeSort', args, order
    header = None
    orderMap = []
    for file in args:
        #print file
        openedFile = open(file)
        # Grab the header and first record only, then stop reading this file.
        for header, record, counter in lines2VCF(openedFile, extendedOutput = True, decodeAll = False):
            orderMap.append([record, file])
            break
        openedFile.close()
    #print orderMap
    # Sort the files by the position of their first record.
    sortedOrderMap = sorted(orderMap, key=lambda x: x[0], cmp = lambda r1, r2: cmpVCFRecords(order, r1, r2))
    #print sortedOrderMap
    # NOTE(review): 'header' is whichever file the loop above read LAST; this
    # assumes all inputs carry an equivalent header -- confirm.
    for headerLine in header: print >> out, headerLine
    i = 0
    n = len(sortedOrderMap)
    for file in map( lambda x: x[1], sortedOrderMap):
        if OPTIONS.verbose:
            i += 1
            print 'Processing', file, ':', i, 'of', n
        for record in lines2VCF(open(file), extendedOutput = False, decodeAll = False):
            print >> out, record.format()
def memSort(out, args, order):
    """Load every record from every input VCF, sort them in memory, and write
    the merged VCF to 'out'.

    Bug fix: 'out' is now a parameter.  The function was previously defined as
    memSort(args, order) but called as memSort(out, args, order) from main(),
    and its body referenced an undefined name 'out' (NameError at runtime).
    """
    header = None
    records = []
    for file in args:
        #print file
        for header, record, counter in lines2VCF(open(file), extendedOutput = True, decodeAll = False):
            records.append(record)
    # In-place comparator sort using the shared contig/position ordering.
    records.sort(lambda r1, r2: cmpVCFRecords(order, r1, r2))
    for line in formatVCF(header, records):
        print >> out, line
# Flip to True to run under cProfile and print the hottest call sites.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()

View File

@ -1,332 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import faiReader
import math
import shutil
# Shared GATK invocation prefixes (b36 reference); GATK picks which build
# (stable vs dev) the rest of the pipeline uses.
GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
GATK = GATK_STABLE
class CallTarget:
    """One sample (or trio) the pipeline calls SNPs for, plus its settings.

    name            -- sample/callset name, used to derive file names
    hetero          -- heterozygosity prior passed to the caller
    minQ            -- minimum call confidence
    depth           -- depth cutoff used in the GATK_STANDARD filter
    truthGFFName    -- HapMap3 truth GFF basename (defaults to name)
    variantEvalArgs -- extra VariantEval args (e.g. --sampleName)
    otherVCF        -- external call set to union with; when given, unionVCF
                       holds the derived union file name used by later stages

    Fix: the deprecated '<>' operator and '!= None' are replaced with
    identity tests against None.
    """
    def __init__(self, name, hetero, minQ = 50, depth = 120, truthGFFName = None, variantEvalArgs = '', otherVCF = None):
        self.name = name
        self.hetero = hetero
        self.minQ = minQ
        self.depth = depth
        if truthGFFName is not None:
            self.truthGFFName = truthGFFName
        else:
            self.truthGFFName = name
        self.variantEvalArgs = variantEvalArgs
        self.otherVCF = otherVCF
        self.unionVCF = None
        if otherVCF is not None:
            self.unionVCF = self.name + '.gatk.glftrio.union.filtered.vcf'
        # Filled in later by the pipeline stages.
        self.listFile = None
        self.releaseVCF = None
# Expected heterozygosity priors per population (CEU vs YRI).
CEU_HET = 0.79e-3
YRI_HET = 1.0e-3
# Every call target the pipeline knows about: single samples plus the two
# trios (called jointly, evaluated on the child sample via --sampleName).
targets = [
    CallTarget('NA12878', CEU_HET),
    CallTarget('NA12891', CEU_HET),
    CallTarget('NA12892', CEU_HET),
    CallTarget('NA19238', YRI_HET),
    CallTarget('NA19239', YRI_HET),
    CallTarget('NA19240', YRI_HET),
    CallTarget('ceu.trio', CEU_HET, 50, 360, 'NA12878', '--sampleName NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf'),
    CallTarget('yri.trio', YRI_HET, 50, 360, 'NA19240', '--sampleName NA19240', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/YRI_1kg_pilot2.vcf'),
    # CallTarget('NA19240.alltechs.solid_original', YRI_HET, 50, 120, 'NA19240'),
    # CallTarget('NA12878.alltechs.solid_original', CEU_HET, 50, 120, 'NA12878'),
    CallTarget('NA19240.alltechs.solid_recal', YRI_HET, 50, 120, 'NA19240'),
    CallTarget('NA12878.alltechs.solid_recal', CEU_HET, 50, 120, 'NA12878'),
]
# Venn subsets of the GATK/glfTrio union evaluated separately; list entries
# pair a display name with a selector pattern.
# NOTE(review): 'Intersection' is listed twice -- likely unintended.
sets = ['Intersection', 'filteredInBoth', 'gatk', 'gatk-filteredInOther', 'glftrio', 'glftrio-filteredInOther', 'Intersection', ['gatk-unique', 'gatk.*'], ['glftrio-unique', 'glftrio.*']]
def main():
    """Drive the per-sample SNP calling pipeline.

    The single positional argument is a comma-separated list of stage names
    (CALL, MERGE, FILTER, UNION, SELECT_INTERSECT, EVAL, RELEASE, CLEAN).
    For each target the requested stages are chained via farm-job
    dependencies and finally submitted in one batch.
    """
    global OPTIONS
    usage = "usage: %prog stage [options]"
    parser = OptionParser(usage=usage)
    # parser.add_option("-q", "--farm", dest="farmQueue",
    #                   type="string", default=None,
    #                   help="Farm queue to send processing jobs to")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-s", "--sample", dest="useSample",
                      type='string', default=None,
                      help="If provided, only run pipeline for this sample")
    parser.add_option("-c", "--minQ", dest="minQ",
                      type='float', default=None,
                      help="If provided, will actually use this Q threshold for calls")
    parser.add_option("-d", "--dir", dest="dir",
                      type='string', default="",
                      help="If provided, this is the root where files are read and written")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    stages = args[0].split(",")
    allJobs = []
    for callTarget in targets:
        # Run every target, or just the one named by --sample.
        if callTarget.name == OPTIONS.useSample or OPTIONS.useSample == None:
            lastJobs = None
            if OPTIONS.minQ != None:
                callTarget.minQ = OPTIONS.minQ
            for stage in stages:
                print 'STAGE', stage
                target = callTarget.name
                callTarget.listFile = os.path.join("lists", target + ".list")
                # Derived file names for this target's call set.
                unfilteredVCFBaseName = target + '.gatk.ug.vcf'
                unfilteredVCF = os.path.join(OPTIONS.dir, unfilteredVCFBaseName)
                filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf')
                tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCFBaseName + ".scatter")
                if not os.path.exists(tmpdir):
                    os.makedirs(tmpdir)
                if callTarget.unionVCF != None:
                    callTarget.releaseVCF = os.path.join(OPTIONS.dir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")
                print 'Heading into stage', stage, lastJobs
                newJobs = []
                if stage == 'CALL':
                    newJobs = callSNPs(target, callTarget, callTarget.listFile, tmpdir)
                if stage == 'MERGE':
                    newJobs = mergeSNPs(target, lastJobs, unfilteredVCF, tmpdir)
                if stage == 'FILTER':
                    newJobs = filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)
                if stage == 'UNION':
                    newJobs = unionSNPs(callTarget, lastJobs, filteredVCF )
                if stage == 'SELECT_INTERSECT':
                    if ( callTarget.unionVCF != None ):
                        # we are one of the release targets
                        newJobs = finalize1KGRelease(callTarget, lastJobs, os.path.join(OPTIONS.dir, callTarget.unionVCF ), callTarget.releaseVCF)
                if stage == 'EVAL':
                    newJobs = evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)
                if stage == 'RELEASE':
                    dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2'
                    subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y")
                    releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF )
                if stage == 'CLEAN':
                    shutil.rmtree("intermediates", True)
                    for file in [filteredVCF, unfilteredVCF]:
                        if os.path.exists(file): os.remove(file)
                print 'New jobs'
                for job in newJobs:
                    print ' ', job
                allJobs.append(newJobs)
                # Chain the next stage onto this stage's jobs, if any were made.
                if newJobs != []:
                    lastJobs = newJobs
    print 'EXECUTING JOBS'
    farm_commands.executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
# Autosomes plus X in hg18 ('chr1'..'chrX') and b36 ('1'..'X') naming.
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX']
b36 = [str(i) for i in range(1,23)] + ['X']
def autosomePlusX(name):
    """True when name is an autosome (1-22) or X, in either b36 or hg18 naming."""
    for contigs in (b36, hg18):
        if name in contigs:
            return True
    return False
def partition(faiRecords, maybeChrom, n):
    """Yield (outputVCF, chrom, chrStart, chrEnd, -L arg) bins splitting each
    autosome/X contig from an FAI index into pieces.

    faiRecords -- parsed FAI rows, e.g. ['1', '247249719', '3', '60', '61']
    maybeChrom -- if not None, restrict to this single contig
    n          -- requested split factor per contig

    Fix: the deprecated '<>' operator is replaced with identity/'!=' tests.
    NOTE(review): for n > 1 the loop 'range(1, n)' emits only n-1 bins (a
    single whole-chromosome bin when n == 2); this looks like an off-by-one
    but is preserved as-is so pipeline output is unchanged -- confirm intent.
    """
    # Example FAI rows (name, length, offset, line-bases, line-width):
    # 1       247249719       3         60      61
    # 2       242951149       251370554 60      61
    chromAndSize = [[x[0], int(x[1])] for x in faiRecords if autosomePlusX(x[0])]
    #print chromAndSize
    if maybeChrom is not None:
        chromAndSize = filter(lambda x: x[0] == maybeChrom, chromAndSize)
    def r(chrom, chrStart, chrEnd):
        # Package one bin as (vcf name, coords, GATK -L interval argument).
        outputVCF = 'calls.chr%s.good.%s.vcf' % (chrom, chrStart)
        L = '-L %s:%d-%d' % (chrom, chrStart, chrEnd)
        return outputVCF, chrom, chrStart, chrEnd, L
    for chrom, size in chromAndSize:
        #print 'SIZE', chrom, size
        if n == 1:
            yield r(chrom, 1, size)
        else:
            idealBp = float(size-1) / n
            chrEnd = None
            chrStart = 1
            for i in range(1, n):
                chrEnd = 1 + int(round(idealBp * (i + 1)))
                #print chrom, chrStart, chrEnd, idealBp
                result = r(chrom, chrStart, chrEnd)
                chrStart = chrEnd + 1
                yield result
            # Sanity check: the last bin must end exactly at the contig end.
            if chrEnd != size:
                raise Exception('X')
def callSNPs(target, callTarget, listFile, tmpdir):
    """Queue scatter jobs running the UnifiedGenotyper -- one farm job per
    (contig, interval) bin produced by partition()."""
    # Caller settings: mapping/base-quality floors, SOLiD platform, and the
    # target-specific heterozygosity prior.
    extras = "-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e" % (callTarget.hetero)
    fai = "/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai"
    NWaysParallel = 5
    bins = partition(faiReader.readFAI(fai), None, NWaysParallel)
    jobs = list()
    for outputVCF, chrom, chrStart, chrEnd, L in bins:
        outputVCF = os.path.join(tmpdir, outputVCF)
        #print outputVCF, L, os.path.exists(outputVCF)
        #if not os.path.exists(outputVCF):
        print 'Enqueuing job for', outputVCF, L
        cmd = 'java -Xmx2048m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar ' + \
              '-T UnifiedGenotyper -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -mrl 500000 ' + \
              '-I %s -confidence %d %s -varout %s -vf VCF -L %s -gm JOINT_ESTIMATE' % (listFile, callTarget.minQ, extras, outputVCF, L)
        #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
        jobs.append(farm_commands.FarmJob(cmd, jobName = "CALL_%s_c%s_s%s" % (target, str(chrom), str(chrStart))))
    return jobs
def mergeSNPs(target, lastJobs, snpFile, tmpdir):
    """Queue one job merging the scattered per-interval VCFs into snpFile."""
    #print 'LastJobs = ', lastJobs
    template = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai %s/*.vcf > %s"
    mergeJob = farm_commands.FarmJob(template % (tmpdir, snpFile),
                                     jobName = "MERGE_%s" % (target),
                                     dependencies = lastJobs)
    return [mergeJob]
def filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
    """Queue a GATK VariantFiltration job applying the standard and
    hard-to-validate filter expressions to unfilteredVCF."""
    target = callTarget.name
    #expression = "AB > 0.75 || DP > %s" % depth
    # Two named filters: the standard hard filter (depth cutoff comes from
    # the target) and the MQ0-fraction 'hard to validate' filter.
    expression1 = ['GATK_STANDARD', "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % callTarget.depth]
    expression2 = ['HARD_TO_VALIDATE', "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)"]
    cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s ' % (unfilteredVCF, filteredVCF)
    for name, exp in [expression1, expression2]:
        cmd += '--filterName %s --filterExpression "%s" ' % ( name, exp )
    #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
    return [farm_commands.FarmJob(cmd, jobName = "FILTER_%s" % (target), dependencies = lastJobs)]
def evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
    """Queue VariantEval jobs for every flavor of this target's calls:
    filtered, unfiltered, the glfTrio union (plus each Venn subset), and the
    final release VCF when present."""
    evalRoot = os.path.join(OPTIONS.dir, "eval")
    if not os.path.exists(evalRoot):
        os.makedirs(evalRoot)
    def eval1(vcfFullPath, namePostfix = "", args = ""):
        # Build one VariantEval job for vcfFullPath.  Comparison tracks are
        # dbSNP 129 plus the 1kg pilot-1 CEU/YRI call sets, restricted to the
        # autosomes + X; a HapMap3 truth GFF is added when one exists on disk.
        vcf = os.path.split(vcfFullPath)[1]
        out = os.path.join(OPTIONS.dir, "eval", vcf + namePostfix + ".eval")
        cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcfFullPath, out, '\;'.join(map(str, b36)) )
        hapmap3 = os.path.join("../../hapmap3Genotypes", callTarget.truthGFFName + ".b36.gff")
        if os.path.exists(hapmap3):
            cmd += " -B hapmap-chip,GFF,%s %s %s" % (hapmap3, callTarget.variantEvalArgs, args)
        #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
        return farm_commands.FarmJob(cmd, jobName = "EVAL_%s_%s" % (callTarget.name, namePostfix), dependencies = lastJobs)
    jobs = []
    jobs.append(eval1(filteredVCF))
    jobs.append(eval1(unfilteredVCF))
    if callTarget.unionVCF != None:
        jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), ".union"))
        # One extra evaluation per Venn subset of the union ('set' INFO key);
        # list-valued entries pair a display name with a selector pattern.
        for set in sets:
            if type(set) == list:
                name, selector = set
            else:
                name, selector = set, set
            jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), "." + name, " -vcfInfoSelector set=\"" + selector + "\""))
    if callTarget.releaseVCF != None:
        jobs.append(eval1(callTarget.releaseVCF, ".release"))
    return jobs
def unionSNPs(callTarget, lastJobs, filteredVCF ):
    """Queue a GATK VCFCombine job that unions our filtered calls with the
    target's external glfTrio call set.

    Returns [] when the target has no otherVCF configured.  Fix: '!= None'
    replaced with 'is not None'.
    """
    if callTarget.otherVCF is not None:
        cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, callTarget.otherVCF, os.path.join(OPTIONS.dir, callTarget.unionVCF) )
        return [farm_commands.FarmJob(cmd, jobName = "UNION_%s" % (callTarget.name), dependencies = lastJobs)]
    else:
        return []
    # jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
def finalize1KGRelease(callTarget, lastJobs, inputVCF, outputVCF):
    """Queue a VCFSelect job extracting the GATK/glfTrio intersection records
    (set == 'Intersection' or 'filteredInBoth') into the release VCF.

    The downstream annotation/filtering steps are currently disabled
    (commented out below).
    """
    commands = []
    lastJob = lastJobs
    # subdir = os.path.join(OPTIONS.dir, "1kg_release")
    # if not os.path.exists(subdir):
    #     os.makedirs(subdir)
    # first, filter out the intersection
    cmd = GATK + '-T VCFSelect -B variant,VCF,%s -o %s -match "(set eq \'Intersection\' || set eq \'filteredInBoth\')" ' % (inputVCF, outputVCF)
    lastJob = farm_commands.FarmJob(cmd, jobName = "SELECT_%s" % (callTarget.name), dependencies = lastJob)
    commands.append(lastJob)
    # call variant annotator to annotate all of the calls
    # annotatedVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.vcf")
    # cmd = GATK + '-T VariantAnnotator -I %s -standard -B variant,VCF,%s -vcf %s -L 1:1-1,000,000 ' % (callTarget.listFile, intersectVCF, annotatedVCF)
    # lastJob = farm_commands.FarmJob(cmd, jobName = "ANNOTATE_%s" % (callTarget.name), dependencies = lastJob)
    # commands.append(lastJob)
    #
    # # filter the calls
    # filteredVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")
    # commands = commands + filterSNPs(callTarget, lastJob, annotatedVCF, filteredVCF)
    #
    return commands
def releaseSNPs(dir, callTarget, filteredVCF ):
if not os.path.exists(dir):
os.makedirs(dir)
print 'Copying files into ', dir, filteredVCF, callTarget.unionVCF, callTarget.releaseVCF
if not OPTIONS.dry: shutil.copy(filteredVCF, dir)
if callTarget.unionVCF != None:
if not OPTIONS.dry: shutil.copy(os.path.join(OPTIONS.dir, callTarget.unionVCF), dir)
if callTarget.releaseVCF != None:
if not OPTIONS.dry: shutil.copy(callTarget.releaseVCF, dir)
# Script entry point.
if __name__ == "__main__":
    main()
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A
# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered
# if ( $1 == 3.1 ) then
# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf
# endif
#
# if ( $1 == 4 ) then
# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only )
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf
# end
# endif
#
# if ( $1 == 5 ) then
# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710
#
# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf )
# echo $file
# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710
# end
#
# endif

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python
import string
class align_block:
    """One alignment block from a qltout QUERY line: indel count, block
    length, and mismatch count (all coerced to int)."""
    def __init__(self, indel_count, length, mismatches):
        self.indel_count = int(indel_count)
        self.length = int(length)
        self.mismatches = int(mismatches)
    def __str__(self):
        # str.join replaces the deprecated string.join module function.
        return ' '.join( map( str, [self.indel_count, self.length, self.mismatches]) )
class qltout_record:
    """One QUERY alignment record from an Arachne qltout file.

    The first 11 whitespace-delimited columns are fixed typed fields; the
    remainder of the line is consumed three columns at a time as
    align_block entries.  (Python 2 only: two-sequence map, string.join.)
    """

    def __init__(self, obj):
        # Only construction from a raw qltout line (str) is supported.
        self.fields = []        # the 11 fixed leading columns
        self.align_blocks = []  # trailing align_block triplets
        if type(obj) == str:
            self.setValuesFromString( obj );
        else:
            raise TypeError("qltout_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line ):
        """Parse the 11 typed leading columns, then align-block triplets."""
        # column types: the QUERY tag (str) followed by ten ints
        formats = [str, int, int, int, int, int, int, int, int, int, int]
        split_line = line.split()
        # Python 2 two-sequence map: applies formats[i] to split_line[i]
        self.fields = map( lambda f, v: f(v), formats, split_line[:11] )
        next_align_block = 11
        while next_align_block < len(split_line):
            self.align_blocks.append( align_block(*split_line[next_align_block:next_align_block+3]) )
            next_align_block += 3

    def indelled_bases(self):
        """Total indel count summed over all align blocks."""
        return sum([ab.indel_count for ab in self.align_blocks])

    def __str__(self):
        return string.join( map( str, self.fields ), ' ')+' '+string.join( map( str, self.align_blocks ), ' ')

    # Accessors into the fixed columns; offsets are 0-based, pos is 1-based.
    def id(self): return self.fields[1]
    def contig(self): return self.fields[6]
    def offset(self): return self.fields[7]
    def pos(self): return self.fields[7]+1
    def offset_end(self): return self.fields[8]
    def pos_end(self): return self.fields[8]+1
    def linear_start(self): return self.fields[9]
    def map_qual(self): return 100 # This only exists in maq files
    # for ILT and Merlin, we assume the same confidence for all mappings
class qltout_file:
    """Iterable wrapper around a qltout alignment file; iterating yields one
    qltout_record per QUERY line, skipping all other lines."""

    def __init__(self, filename):
        self.filename = filename
        self.fqlt = open(self.filename)

    def RecordGenerator(self):
        """Generate a qltout_record for each QUERY line of the file."""
        for line in self.fqlt:
            if not line.startswith("QUERY"):
                continue
            yield qltout_record(line)
        # FIX: the original ended with 'raise StopIteration', which inside a
        # generator becomes a RuntimeError under PEP 479 (Python 3.7+).  A
        # bare return ends the generator identically on Python 2 as well.
        return

    def __iter__(self):
        return self.RecordGenerator()

View File

@ -1,20 +0,0 @@
#!/bin/tcsh
# Simulate reads against a small reference, then run the basic samtools
# import -> sort -> index -> pileup cycle on the resulting SAM file.
setenv refroot ~/work/refmut/
setenv refname test2
setenv ref "${refroot}/${refname}"
# simulation parameters: coverage depth and number of mutated sites
setenv cov 100
setenv sites 1
setenv x '' # '-x 6696745:6708910'
# testing samtools
./SimulateReads.py --ref ${ref}.fasta --coverage ${cov} -n ${sites} -l 10 -t None ${x}
# running samtools
samtools import ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.ref ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.sam samtest.bam
samtools sort samtest.bam samtestSorted
samtools index samtestSorted.bam
samtools pileup -f ${ref}.fasta samtestSorted.bam

View File

@ -1,728 +0,0 @@
import os.path
import sys
from optparse import OptionParser
from vcfReader import *
#import pylab
import operator
from itertools import *
import math
import random
DEBUG = False
class Range:
    """A possibly-unbounded interval.  Either endpoint may be Range.ANY
    ('*'), meaning unbounded on that side; each side is independently open
    (endpoint excluded) or closed (endpoint included, the default)."""
    ANY = '*'

    def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False):
        self.left = left
        self.right = right
        self.leftOpen = leftOpen
        self.rightOpen = rightOpen

    def __str__(self):
        # Standard interval notation: [ ] for closed ends, ( ) for open ends.
        lb = '(' if self.leftOpen else '['
        rb = ')' if self.rightOpen else ']'
        return '%s%s, %s%s' % (lb, self.left, self.right, rb)

    __repr__ = __str__

    def dashedString(self):
        """Like str(), but with '-' instead of ', ' (filename friendly)."""
        return str(self).replace(", ", "-")

    def contains(self, v):
        """True if v lies inside the range, honoring ANY and open ends."""
        def side_ok(bound, beyond, is_open):
            # ANY always passes; otherwise v must be strictly beyond the
            # bound, or equal to it when the side is closed.
            return bound == Range.ANY or beyond(v, bound) or (not is_open and v == bound)
        return (side_ok(self.left, operator.__gt__, self.leftOpen)
                and side_ok(self.right, operator.__lt__, self.rightOpen))
class CallCovariate:
    """Pairs a (feature range, quality range) cell with the empirical false
    positive rate estimated for calls falling in that cell."""

    def __init__(self, feature, featureRange, qualRange, FPRate = None):
        self.feature = feature            # VCF field the covariate bins on
        self.featureRange = featureRange  # Range over that field's values
        self.qualRange = qualRange        # Range over QUAL
        self.FPRate = FPRate              # filled in during calibration

    def containsVariant(self, call):
        """True when the call's feature value and QUAL both fall in range."""
        in_feature = self.featureRange.contains(call.getField(self.feature))
        in_qual = self.qualRange.contains(call.getQual())
        return in_feature and in_qual

    def getFPRate(self): return self.FPRate

    def getFeature(self): return self.feature

    def getCovariateField(self):
        """INFO key under which this covariate's recalibrated Q is stored."""
        return self.getFeature() + '_RQ'

    def __str__(self):
        return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange)
class RecalibratedCall:
    """Wraps a VCF call plus per-feature false-positive-rate estimates and
    combines them into a single joint FP rate for the recalibrated QUAL."""

    def __init__(self, call, features):
        self.call = call
        # feature name -> FP rate; None until recalFeature assigns it
        self.features = dict([[feature, None] for feature in features])

    def recalFeature( self, feature, FPRate ):
        """Record the FP rate for one feature; each may be set only once."""
        assert self.features[feature] == None, "Feature " + feature + ' has value ' + str(self.features[feature]) + ' for call ' + str(self.call) # not reassigning values
        assert FPRate <= 1 and FPRate >= 0
        self.features[feature] = FPRate

    def getFeature( self, feature, missingValue = None, phredScaleValue = False ):
        """FP rate for feature (optionally phred-scaled); missingValue if unset."""
        v = self.features[feature]
        if v == None:
            return missingValue
        elif phredScaleValue:
            return phredScale(v)
        else:
            return v

    def jointFPErrorRate(self):
        """Combine the per-feature TP rates in log10 space into one joint
        FP rate, capped near 1e-3 ("approximation from het of 0.001").
        (Python 2 only: dict.itervalues and the <> operator.)"""
        logTPRates = [math.log10(1-r) for r in self.features.itervalues() if r <> None]
        logJointTPRate = reduce(lambda x, y: x + y, logTPRates, 0)
        logJointTPRate = min(logJointTPRate, 1e-3 / 3) # approximation from het of 0.001
        jointTPRate = math.pow(10, logJointTPRate)
        return 1 - jointTPRate

    def featureStringList(self):
        # e.g. "DP=Q30,AB=Q17"; '*' marks features never recalibrated
        return ','.join(map(lambda feature: '%s=Q%d' % (feature, self.getFeature(feature, '*', True)), self.features.iterkeys()))

    def __str__(self):
        return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))
def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, maxQScore = 10000000, mustBeVariant = False, skip = None ):
    """Stream VCF records from 'file', applying filter/QUAL/skip/downsample
    cuts; returns (header, lazy iterator of records).

    NOTE: the parameter 'filter' shadows the builtin; when None it defaults
    to (not OPTIONS.unfiltered).  (Python 2 only: <>, ifilter/imap.)"""
    if filter == None:
        filter = not OPTIONS.unfiltered
    f = open(file)
    header, columnNames, lines = readVCFHeader(f)
    nLowQual = 0
    counter = 0
    def parseVariant(args):
        # NOTE(review): 'global nLowQual, counter' cannot rebind the
        # *enclosing function's* locals, so the module-level nLowQual stays
        # untouched and the low-qual report below never fires -- confirm.
        global nLowQual, counter
        header1, VCF, counter = args
        counter += 1
        if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
            return None
        elif VCF.getQual() <= minQScore or VCF.getQual() > maxQScore:
            # outside the accepted QUAL window
            return None
        elif skip <> None and counter % skip <> 0:
            # keep only every skip-th record
            return None
        elif random.random() <= downsampleFraction:
            return VCF
        else:
            return None
    variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
    if nLowQual > 0:
        print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore)
    return header, variants
def selectVariants( variants, selector = None ):
    """Return the variants passing selector; with no selector, return the
    input unchanged."""
    if selector is None:
        return variants
    return filter(selector, variants)
def titv(variants):
    """Transition/transversion ratio of a call set (denominator floored
    at 1 to avoid division by zero)."""
    transitions = sum(1 for v in variants if VCFRecord.isTransition(v))
    transversions = len(variants) - transitions
    return transitions / (1.0 * max(transversions, 1))
def dbSNPRate(variants):
    """Fraction of variants already known to dbSNP (denominator floored
    at 1)."""
    known = sum(1 for v in variants if VCFRecord.isKnown(v))
    return float(known) / max(len(variants), 1)
def gaussian(x, mu, sigma):
    """Normal density N(mu, sigma**2) evaluated at x."""
    scale = 1 / math.sqrt(2 * math.pi * sigma**2)
    power = -1 * ( x - mu )**2 / (2 * sigma**2)
    return scale * math.exp(power)
# if target = T, and FP calls have ti/tv = 0.5, we want to know how many FP calls
# there are in N calls with ti/tv of X.
#
def titvFPRateEstimate(variants, target):
    """Estimate the FP rate of a call set from its ti/tv ratio vs. target.

    Returns (observed ti/tv ratio, estimated FP rate).  The model assumes
    false positives have an expected ti/tv of 0.5 (random substitutions).
    (Python 2 only: print statements; reads globals OPTIONS/DEBUG.)"""
    titvRatio = titv(variants)
    # f <- function(To,T) { (To - T) / (1/2 - T) + 0.001 }
    def theoreticalCalc():
        # Linear interpolation: ti/tv >= target means 0% FP, ti/tv of 0.5
        # means 100% FP; the TP rate is clamped by maxQScoreForCovariate.
        if titvRatio >= target:
            FPRate = 0
        else:
            FPRate = (titvRatio - target) / (0.5 - target)
        FPRate = min(max(FPRate, 0), 1)
        TPRate = max(min(1 - FPRate, 1 - dephredScale(OPTIONS.maxQScoreForCovariate)), dephredScale(OPTIONS.maxQScoreForCovariate))
        if DEBUG: print 'FPRate', FPRate, titvRatio, target
        assert FPRate >= 0 and FPRate <= 1
        return TPRate
    # gaussian model (alternative; disabled below)
    def gaussianModel():
        LEFT_HANDED = True
        sigma = 1 # old value is 5
        constant = 1 / math.sqrt(2 * math.pi * sigma**2)
        exponent = -1 * ( titvRatio - target )**2 / (2 * sigma**2)
        TPRate = gaussian(titvRatio, target, sigma) / gaussian(target, target, sigma)
        if LEFT_HANDED and titvRatio >= target:
            TPRate = 1
        TPRate -= dephredScale(OPTIONS.maxQScoreForCovariate)
        if DEBUG: print 'TPRate', TPRate, constant, exponent, dephredScale(OPTIONS.maxQScoreForCovariate)
        return TPRate
    FPRate = 1 - theoreticalCalc()
    #FPRate = 1 - gaussianModel()
    nVariants = len(variants)
    if DEBUG: print ':::', nVariants, titvRatio, target, FPRate
    return titvRatio, FPRate
def phredScale(errorRate):
    """Convert an error probability to a Phred score, capped at Q100
    (rates below 1e-10 are clamped to avoid log10(0))."""
    clamped = max(errorRate, 1e-10)
    return -10 * math.log10(clamped)
def dephredScale(qscore):
    """Convert a Phred score back to an error probability."""
    return 10.0 ** (float(qscore) / -10)
def frange6(*args):
    """A float range generator: frange6(end), frange6(start, end) or
    frange6(start, end, step), mirroring range() semantics for floats.

    Raises ValueError for a zero step and TypeError for a bad arity.
    """
    start = 0.0
    step = 1.0
    l = len(args)
    if l == 1:
        end = args[0]
    elif l == 2:
        start, end = args
    elif l == 3:
        start, end, step = args
        if step == 0.0:
            # FIX: 'raise ValueError, "msg"' is Python-2-only syntax;
            # calling the exception class is valid on both 2 and 3.
            raise ValueError("step must not be zero")
    else:
        raise TypeError("frange expects 1-3 arguments, got %d" % l)
    v = start
    while True:
        if (step > 0 and v >= end) or (step < 0 and v <= end):
            # FIX: ending with 'return' instead of 'raise StopIteration'
            # keeps PEP 479 / Python 3.7+ generator semantics correct.
            return
        yield v
        v += step
def compareFieldValues( v1, v2 ):
    """cmp()-style three-way comparison that orders mixed-type values by
    their type first, then by value within a type.

    (Python 2 only: the builtin cmp() and the <> operator were removed
    in Python 3.)"""
    if type(v1) <> type(v2):
        # different types: order deterministically by type
        c = cmp(type(v1), type(v2))
    else:
        c = cmp(v1, v2)
    return c
def calculateBins(variants, field, minValue, maxValue, partitions):
    """Extract 'field' from every variant and delegate to the value binner."""
    field_values = [v.getField(field) for v in variants]
    return calculateBinsForValues(field_values, field, minValue, maxValue, partitions)
def calculateBinsForValues(values, field, minValue, maxValue, partitions):
    """Group values into roughly equal-count bins for covariate binning.

    Returns a list of [left, right, count] entries.  Adjacent distinct
    values are merged into the current bin until it would exceed
    len(values)/partitions entries; a fresh bin is also forced at the first
    numeric value after non-numeric placeholders.  minValue/maxValue are
    accepted but not used by the current implementation."""
    sortedValues = sorted(values)
    # record the smallest numeric value for recal.log printing
    captureFieldRangeForPrinting(field, sortedValues)
    targetBinSize = len(values) / (1.0*partitions)
    uniqBins = groupby(sortedValues)
    # [distinct value, multiplicity] pairs in sorted order
    binsAndSizes = map(lambda x: [x[0], len(list(x[1]))], uniqBins)
    def bin2Break(bin): return [bin[0], bin[0], bin[1]]
    bins = [bin2Break(binsAndSizes[0])]
    for bin in binsAndSizes[1:]:
        curLeft = bins[-1][0]
        curSize = bin[1]
        prevSize = bins[-1][2]
        if curSize + prevSize > targetBinSize or (not isNumber(curLeft) and isNumber(bin[0])):
            # current bin is full (or we just crossed from non-numeric to
            # numeric values): start a new bin at this value
            bins.append(bin2Break(bin))
        else:
            # absorb this value into the current bin
            bins[-1][1] = bin[0]
            bins[-1][2] += curSize
    return bins
def fieldRange(variants, field):
    """Compute (min, max, validated bins) for one VCF field across variants.

    Bin count comes from the --partitions option; validateBins raises on
    overlapping bins."""
    values = map(lambda v: v.getField(field), variants)
    minValue = min(values)
    maxValue = max(values)
    bins = calculateBins(variants, field, minValue, maxValue, OPTIONS.partitions)
    validateBins(bins)
    return minValue, maxValue, bins
def validateBins(bins):
    """Sanity check that no two bins overlap; raises Exception on overlap.

    Each bin is (left, right, count).  For every pair of distinct bins we
    verify that neither endpoint of bin1 lies strictly inside bin2; pairs
    involving an unbounded (None) endpoint are skipped.
    """
    for left1, right1, count1 in bins:
        for left2, right2, count2 in bins:
            def contains2(x):
                # strictly inside the open interval (left2, right2)
                return left2 < x and x < right2
            if left1 != left2 and right1 != right2:
                if None in [left1, left2, right1, right2]:
                    pass # unbounded bins cannot be compared; assume ok
                elif contains2(left1) or contains2(right1):
                    # BUG FIX: the original tested contains2(right2), which
                    # is always False (right2 is never strictly inside its
                    # own interval), so overlaps via bin1's right edge were
                    # never detected.  Test right1 instead.
                    raise Exception("Bad bins", left1, right1, left2, right2)
def printFieldQualHeader():
    """Print the calibration-table column header to stdout and, when open,
    to the RECAL_LOG file.  Truth-comparison columns are appended when a
    truth set was loaded.  (Python 2 only: print >> syntax.)"""
    more = ""
    if TRUTH_CALLS <> None:
        more = CallCmp.HEADER
    def p(stream):
        if stream <> None:
            print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more
    p(sys.stdout)
    p(RECAL_LOG)
def printFieldQual( category, field, cc, variants ):
    """Print one calibration-table row (plus truth-comparison columns when
    TRUTH_CALLS is set) to stdout and RECAL_LOG.
    (Python 2 only: print >> syntax.)"""
    more = ""
    if TRUTH_CALLS <> None:
        callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
        more = str(callComparison)
    novels = selectVariants(variants, VCFRecord.isNovel)
    def p(stream):
        if stream <> None:
            print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more
    p(sys.stdout)
    p(RECAL_LOG)
# field name -> smallest numeric value seen, used by binString for plotting
FIELD_RANGES = dict()

def captureFieldRangeForPrinting(field, sortedValues):
    """Finds the minimum float value in sortedValues for convenience printing in recal.log"""
    floatValues = filter(isNumber, sortedValues)
    if floatValues <> []:
        # sortedValues is sorted, so the first numeric value is the minimum
        FIELD_RANGES[field] = floatValues[0]
def isNumber(x):
    """True for int/long/float values.  (Python 2 only: 'long' exists
    only there.)"""
    return isinstance(x, (int, long, float))
def binString(field, cc):
    """Format a feature Range's endpoints for the calibration table.

    With --plottable, non-numeric endpoints (e.g. Range.ANY) are replaced
    by a value just below the field's smallest observed number so that the
    log rows remain plottable."""
    epsilon = 1e-2
    left, right = cc.left, cc.right
    leftStr = str(left)
    rightStr = "%5s" % str(right)
    if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
        left = right = FIELD_RANGES[field] - epsilon
    if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
        left = right - epsilon
    if isNumber(left): leftStr = "%.4f" % left
    if isNumber(right): rightStr = "%.4f" % right
    return '%12s %12s' % (leftStr, rightStr)
#
#
#
def recalibrateCalls(variants, fields, callCovariates):
    """Generator: apply matching covariates to each variant and rewrite QUAL.

    For every covariate containing the variant, the per-feature phred score
    is stored under FEATURE_RQ; the combined joint FP rate replaces QUAL,
    and the original QUAL is preserved in the OQ field."""
    def phred(v): return phredScale(v)
    for variant in variants:
        recalCall = RecalibratedCall(variant, fields)
        originalQual = variant.getField('QUAL')
        for callCovariate in callCovariates:
            if callCovariate.containsVariant(variant):
                FPR = callCovariate.getFPRate()
                recalCall.recalFeature(callCovariate.getFeature(), FPR)
                recalCall.call.setField(callCovariate.getCovariateField(), phred(FPR))
        recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate()))
        recalCall.call.setField('OQ', originalQual)
        yield recalCall.call
#
#
#
def optimizeCalls(variants, fields, titvTarget):
    """Derive covariates from the variants, then recalibrate those same
    calls with them.  Returns (lazy recalibrated-call generator, list of
    CallCovariate)."""
    callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True)
    recalCalls = recalibrateCalls(variants, fields, callCovariates)
    return recalCalls, callCovariates
def printCallQuals(field, recalCalls, titvTarget, info = ""):
    """Report per-bin and cumulative calibration tables for recalCalls over
    one field.  (Python 2 only: print statements.)"""
    print '--------------------------------------------------------------------------------'
    print info
    calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False, category = "optimized-calls" )
    print 'Cumulative'
    calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False, category = "optimized-calls" )
def all( p, l ):
    """True iff predicate p holds for every element of l (vacuously True
    for an empty l).

    NOTE: shadows the builtin all() for the rest of this module."""
    for item in l:
        if not p(item):
            return False
    return True
def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]):
    """Yield (CallCovariate, variants-in-bin) for the cross product of the
    field's value bins and the supplied QUAL ranges.

    When cumulative, each bin's right edge is opened to ANY so the bins
    nest.  NOTE: the mutable default breakQuals=[Range()] is shared across
    calls -- harmless here because it is never mutated."""
    minValue, maxValue, featureBins = fieldRange(variants, field)
    bins = [(x,y) for x in featureBins for y in breakQuals]
    def variantsInBin(featureBin, qualRange):
        right = featureBin[1]
        if cumulative:
            right = Range.ANY
        cc = CallCovariate(field, Range(featureBin[0], right), qualRange)
        return cc, selectVariants(variants, lambda v: cc.containsVariant(v))
    return starmap( variantsInBin, bins )
def qBreaksRanges(qBreaks, useBreaks):
    """Turn a comma-separated QUAL break string into a list of half-open
    Ranges covering (*, b1), [b1, b2), ..., [bn, *).  With no breaks given
    (or useBreaks False) everything falls into one catch-all Range."""
    if qBreaks is None or not useBreaks:
        return [Range()] # include everything in a single range
    cutpoints = [float(b) for b in qBreaks.split(',')]
    lefts = chain([Range.ANY], cutpoints)
    rights = chain(cutpoints, [Range.ANY])
    return [Range(lo, hi, rightOpen = True) for lo, hi in zip(lefts, rights)]
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ):
    """Estimate an FP rate for each (field bin x QUAL range) cell.

    Returns the list of CallCovariate cells whose bins held more than
    --minVariantsPerBin variants (or everything with forcePrint), printing
    one table row per accepted cell.  (Python 2 only: print, <>.)"""
    covariates = []
    if printHeader: printFieldQualHeader()
    for field in fields:
        if DEBUG: print 'Optimizing field', field
        titv, FPRate = titvFPRateEstimate(variants, titvTarget)
        # note: the local name 'titv' shadows the module-level titv() here
        for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')):
            if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
                titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
                cc.FPRate = FPRate
                covariates.append(cc)
                printFieldQual( category, prefix + field, cc, selectedVariants )
            else:
                print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants)
    return covariates
class CallCmp:
    """Counts of true positives, false positives, and false negatives from
    comparing a call set against a truth set, plus derived statistics."""

    HEADER = "TP FP FN FNRate Sensitivity PPV"

    def __init__(self, nTP, nFP, nFN):
        self.nTP = nTP
        self.nFP = nFP
        self.nFN = nFN

    def FNRate(self):
        """FN / (TP + FN), the miss rate (denominator floored at 1)."""
        denom = max(self.nTP + self.nFN, 1)
        return (1.0 * self.nFN) / denom

    def sensitivity(self):
        """TP / (TP + FN) (denominator floored at 1)."""
        denom = max(self.nTP + self.nFN, 1)
        return (1.0 * self.nTP) / denom

    def PPV(self):
        """TP / (TP + FP), positive predictive value (denominator floored
        at 1)."""
        denom = max(self.nTP + self.nFP, 1)
        return (1.0 * self.nTP) / denom

    def __str__(self):
        return '%6d %6d %6d %.3f %.3f %.3f' % (
            self.nTP, self.nFP, self.nFN,
            self.FNRate(), self.sensitivity(), self.PPV())
def variantInTruth(variant, truth):
    """Return the truth record at the variant's locus, or False when the
    locus is absent from the truth dict."""
    return truth.get(variant.getLoc(), False)
def isVariantInSample(t, sample):
    """True when the truth record's genotype for 'sample' is anything other
    than hom-ref "0/0"."""
    genotype = t.getField(sample)
    return genotype != "0/0"
def variantsInTruth(truth):
    """Count truth records with a variant genotype for OPTIONS.useSample.

    fixme (from original): assumes OPTIONS.useSample is configured."""
    return len([t for t in truth if isVariantInSample(t, OPTIONS.useSample)])
def sensitivitySpecificity(variants, truth):
    """Classify each call as TP or FP against the truth dict; count FNs
    from the truth records that were never matched.

    Side effects: sets TP=1/0 on each call and FN=0 on matched truth
    records.  Returns (CallCmp, list of FP calls).
    (Python 2 only: itervalues.)"""
    nTP, nFP = 0, 0
    FPs = []
    for variant in variants:
        t = variantInTruth(variant, truth)
        isTP, isFP = False, False
        if OPTIONS.useSample or OPTIONS.onlyAtTruth:
            if t: # we have a site
                # TP when the call agrees with the sample's genotype status
                isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant())
                isFP = not isTP
        else:
            # site-level mode: any truth record at this locus counts as TP
            isTP = t
            isFP = not t
        if isTP:
            t.setField("FN", 0)
            variant.setField("TP", 1)
            nTP += 1
        elif isFP:
            nFP += 1
            variant.setField("TP", 0)
            FPs.append(variant)
    nRef = 0 # len(filter(lambda x: not x.isVariant(), truth.itervalues()))
    nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
    return CallCmp(nTP, nFP, nFN), FPs
def markTruth(calls):
    """Initialize TP=0 on every truth record (skipped in per-sample mode).
    (Python 2 only: itervalues.)"""
    if not OPTIONS.useSample:
        for variant in calls.itervalues():
            variant.setField("TP", 0) # set the TP field to 0
def compareCalls(calls, truthCalls):
    """Print truth-comparison tables (per-bin and cumulative) over the
    QUAL and OQ fields.

    truthCalls is not referenced in this body; the actual comparison
    happens inside printFieldQual via the global TRUTH_CALLS."""
    def compare1(name, cumulative):
        for field in ["QUAL", "OQ"]:
            for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
                printFieldQual("truth-comparison-" + name, field, cc, selectedVariants )
    print 'PER BIN nCalls=', len(calls)
    compare1('per-bin', False)
    print 'CUMULATIVE nCalls=', len(calls)
    compare1('cum', True)
def randomSplit(l, pLeft):
    """Randomly partition l into two lists, preserving order; each element
    lands in the first list with probability pLeft (one random draw per
    element)."""
    first, second = [], []
    for elt in l:
        if random.random() < pLeft:
            first.append(elt)
        else:
            second.append(elt)
    return first, second
def setup():
    """Parse the command line into the global OPTIONS and return the
    positional arguments (the input VCF, optionally one more)."""
    global OPTIONS, header
    usage = "usage: %prog files.list [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--f", dest="fields",
                        type='string', default="QUAL",
                        help="Comma-separated list of fields (either in the VCF columns of as INFO keys) to use during optimization [default: %default]")
    parser.add_option("-t", "--truth", dest="truth",
                        type='string', default=None,
                        help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
    parser.add_option("-l", "--recalLog", dest="recalLog",
                        type='string', default="recal.log",
                        help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
    parser.add_option("-u", "--unfiltered", dest="unfiltered",
                        action='store_true', default=False,
                        help="If provided, unfiltered calls will be used in comparisons [default: %default]")
    parser.add_option("", "--plottable", dest="plottableNones",
                        action='store_true', default=False,
                        help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
    parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth",
                        action='store_true', default=False,
                        help="If provided, we only consider TP/FP/FN at truth sites[default: %default]")
    parser.add_option("-p", "--partitions", dest="partitions",
                        type='int', default=25,
                        help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]")
    parser.add_option("", "--maxRecordsForCovariates", dest="maxRecordsForCovariates",
                        type='int', default=2000000,
                        help="Derive covariate information from up to this many VCF records. For files with more than this number of records, the system downsamples the reads [default: %default]")
    parser.add_option("-m", "--minVariantsPerBin", dest="minVariantsPerBin",
                        type='int', default=10,
                        help="Don't include any covariates with fewer than this number of variants in the bin, if such a thing happens. NEEDS TO BE FIXED")
    parser.add_option("-M", "--maxRecords", dest="maxRecords",
                        type='int', default=None,
                        help="Maximum number of input VCF records to process, if provided. Default is all records")
    parser.add_option("-q", "--qMin", dest="minQScore",
                        type='int', default=-1,
                        help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
    parser.add_option("-Q", "--qMax", dest="maxQScore",
                        type='int', default=1000000,
                        help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
    parser.add_option("", "--QBreaks", dest="QBreaks",
                        type='string', default=None,
                        help="Breaks in QUAL for generating covarites [default: %default]")
    parser.add_option("", "--maxQScoreForCovariate", dest="maxQScoreForCovariate",
                        type='int', default=60,
                        help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
    parser.add_option("-o", "--outputVCF", dest="outputVCF",
                        type='string', default="recal.vcf",
                        help="If provided, a VCF file will be written out to this file [default: %default]")
    parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
                        type='string', default=None,
                        help="If provided, VCF file will be written out to this file [default: %default]")
    parser.add_option("", "--titv", dest="titvTarget",
                        type='float', default=None,
                        help="If provided, we will optimize calls to the targeted ti/tv rather than that calculated from known calls [default: %default]")
    parser.add_option("-b", "--bootstrap", dest="bootStrap",
                        type='float', default=None,
                        help="If provided, the % of the calls used to generate the recalibration tables. [default: %default]")
    parser.add_option("-k", "--skip", dest="skip",
                        type=int, default=None,
                        help="If provided, we'll only take every nth record [default: %default]")
    parser.add_option("-s", "--useSample", dest="useSample",
                        type='string', default=False,
                        help="If provided, we will examine sample genotypes for this sample, and consider TP/FP/FN in the truth conditional on sample genotypes [default: %default]")
    parser.add_option("-r", "--dontRecalibrate", dest="dontRecalibrate",
                        action='store_true', default=False,
                        help="If provided, we will not actually do anything to the calls, they will just be assessed [default: %default]")
    (OPTIONS, args) = parser.parse_args()
    if len(args) > 2:
        parser.error("incorrect number of arguments")
    return args
def assessCalls(file):
    """Load calls from a VCF (downsampling when it exceeds the covariate
    record budget) and report ti/tv statistics.

    Returns (header, list of calls, ti/tv target); the target defaults to
    the ti/tv of the known (dbSNP) calls when --titv was not given.
    (Python 2 only: print statements, <>.)"""
    print 'Counting records in VCF', file
    numberOfRecords = 1
    downsampleFraction = 1
    if OPTIONS.maxRecords <> None:
        numberOfRecords = quickCountRecords(open(file))
        if OPTIONS.maxRecords < numberOfRecords:
            numberOfRecords = OPTIONS.maxRecords
    downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
    allCalls = list(allCalls)
    print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
    print 'Number of selected VCF records', len(allCalls)
    titvtarget = OPTIONS.titvTarget
    if titvtarget == None:
        # no explicit target: use the ti/tv of the known calls
        titvtarget = titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv all ', titv(allCalls)
    print 'Ti/Tv known', titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv novel', titv(selectVariants(allCalls, VCFRecord.isNovel))
    return header, allCalls, titvtarget
def determineCovariates(allCalls, titvtarget, fields):
    """Build covariates from (optionally bootstrapped) calls.

    With --bootstrap p, covariates are trained on a random fraction p of
    the calls and evaluated on the remainder.  Returns the covariates."""
    if OPTIONS.bootStrap:
        callsToOptimize, recalEvalCalls = randomSplit(allCalls, OPTIONS.bootStrap)
    else:
        callsToOptimize = allCalls
    recalOptCalls, covariates = optimizeCalls(callsToOptimize, fields, titvtarget)
    printCallQuals("QUAL", list(recalOptCalls), titvtarget, 'OPTIMIZED CALLS')
    if OPTIONS.bootStrap:
        recalibatedEvalCalls = recalibrateCalls(recalEvalCalls, fields, covariates)
        printCallQuals("QUAL", list(recalibatedEvalCalls), titvtarget, 'BOOTSTRAP EVAL CALLS')
    return covariates
def writeRecalibratedCalls(file, header, calls):
    """Write 'calls' as VCF to 'file' (no-op when file is falsy), logging
    progress every 10000 records.  (Python 2 only: print >> syntax.)"""
    if file:
        f = open(file, 'w')
        i = 0
        for line in formatVCF(header, calls):
            if i % 10000 == 0: print 'writing VCF record', i
            i += 1
            print >> f, line
        f.close()
def readTruth(truthVCF):
    """Read the truth VCF into a dict keyed by locus string (getLoc);
    records at a duplicate locus collapse to the last one seen."""
    print 'Reading truth file', truthVCF
    rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1])
    truth = dict( [[v.getLoc(), v] for v in rawTruth])
    print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
    return truth
def evaluateTruth(header, callVCF, truth, truthVCF):
    """Re-read the recalibrated calls, compare them against the truth set,
    rewrite callVCF with TP annotations, and optionally write FN truth
    records to --FNoutputVCF.

    NOTE(review): writeRecalibratedCalls rewrites callVCF in place after
    annotation -- confirm this overwrite is intended."""
    print 'Reading variants back in from', callVCF
    header, calls = readVariants(callVCF)
    calls = list(calls)
    print '--------------------------------------------------------------------------------'
    print 'Comparing calls to truth', truthVCF
    print ''
    compareCalls(calls, truth)
    writeRecalibratedCalls(callVCF, header, calls)
    def isFN(v):
        # a sample variant in the truth that was never matched (no FN field)
        return isVariantInSample(v, OPTIONS.useSample) and not v.hasField("FN")
    if truth <> None and OPTIONS.FNoutputVCF:
        f = open(OPTIONS.FNoutputVCF, 'w')
        for line in formatVCF(header, filter( isFN, truth.itervalues())):
            print >> f, line
        f.close()
# Module-level state shared with the table-printing helpers.
TRUTH_CALLS = None
RECAL_LOG = None

def main():
    """Entry point: read calls, derive covariates, recalibrate the full
    call set, and optionally compare against a truth VCF.
    (Python 2 only: print statements, <>, iteritems.)"""
    global TRUTH_CALLS, RECAL_LOG
    args = setup()
    fields = OPTIONS.fields.split(',')
    truthVCF = None
    if OPTIONS.truth <> None:
        truthVCF = OPTIONS.truth
        TRUTH_CALLS = readTruth(truthVCF)
    if OPTIONS.recalLog <> None:
        # log the full option set as comments at the top of recal.log
        RECAL_LOG = open(OPTIONS.recalLog, "w")
        print >> RECAL_LOG, "# optimized vcf", args[0]
        print >> RECAL_LOG, "# truth vcf", truthVCF
        for key, value in OPTIONS.__dict__.iteritems():
            print >> RECAL_LOG, '#', key, value
    header, allCalls, titvTarget = assessCalls(args[0])
    if not OPTIONS.dontRecalibrate:
        covariates = determineCovariates(allCalls, titvTarget, fields)
        print OPTIONS
        # re-read the input without downsampling so every call is recalibrated
        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
        RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
        writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
    else:
        # assessment-only mode: just print calibration tables
        printFieldQualHeader()
        printCallQuals("QUAL", allCalls, titvTarget)
        OPTIONS.outputVCF = args[0]
    if truthVCF <> None:
        evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
# Set PROFILE = True to run main() under cProfile and print the hot spots.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()

View File

@ -1,49 +0,0 @@
import sys
from optparse import OptionParser
def getRsIDSet(dbSNPIds, idIndex):
    """Read the file at dbSNPIds and return the frozenset of rsIDs found in
    whitespace-delimited column idIndex of each line.

    FIX: the original opened the file and never closed it (handle leak);
    the handle is now closed deterministically via try/finally."""
    ids = set()
    fh = open(dbSNPIds)
    try:
        for line in fh:
            ids.add(line.split()[idIndex])
    finally:
        fh.close()
    return frozenset(ids)
def main():
    """Filter dbSNP.to.match down to rows whose rsID (5th whitespace
    column) also appears in dbSNP.in.rsids, writing matches to dbSNP.out.
    (Python 2 only: print statements.)"""
    global OPTIONS
    usage = "usage: %prog [options] dbSNP.in.rsids dbSNP.to.match dbSNP.out"
    parser = OptionParser(usage=usage)
    parser.add_option("-v", "--verbose", dest="verbose",
                        action='store_true', default=False,
                        help="")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("incorrect number of arguments")
    dbSNPIds = args[0]
    dbSNPMatch = args[1]
    dbSNPOut = args[2]
    idIndex = 4  # 0-based whitespace column holding the rsID
    rsSet = getRsIDSet(dbSNPIds, idIndex)
    print 'rsID set has %d elements' % len(rsSet)
    count = 0
    matched = 0
    out = open(dbSNPOut, 'w')
    for line in open(dbSNPMatch):
        count += 1
        rsID = line.split()[idIndex]
        if rsID in rsSet:
            matched += 1
            out.write(line)
    print 'Processed %d lines, matching %d elements, excluding %d' % ( count, matched, count - matched )
    out.close()

if __name__ == "__main__":
    main()

View File

@ -1,287 +0,0 @@
#!/usr/bin/env python
"""Starts from TGTC list of fastb, qualb, and qltout files and creates merged SAM/BAM files"""
import os, commands, sys
from farm_commands import cmd
# Global list of failed conversions to be reported at end of execution
failed_list = []

class FailedConversion:
    """Record of one lane that could not be converted; announces itself
    on stdout at construction time."""

    def __init__(self, lane, error_str):
        self.lane = lane
        self.error_str = error_str
        # single-argument parenthesized print: identical output on Python 2
        print(self)

    def __str__(self):
        return self.error_str + " while attempting conversion of " + str(self.lane)
class lane:
    """One lane of sequencing data described by a TGTC line (fastb, qualb,
    qltout paths), plus logic to locate the reference dictionary and
    metrics files needed to convert it to SAM/BAM.

    NOTE: the class body scans /slxa/pipelineOutput/farm at import time to
    build the flowcell -> pipeline-directory map."""
    bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list";
    pipe_dirs_head = "/slxa/pipelineOutput/farm"
    pipe_dirs = os.listdir(pipe_dirs_head)
    # flowcell barcode (token after the first '_') -> pipeline directory
    pipe_dirs_dict = dict( [(dir.split("_")[1], dir) for dir in pipe_dirs if "_" in dir] )

    def __init__(self, line):
        # line: "flowcell.date.lane fastb qualb qltout [group]"
        fields = line.split(" ")
        if len(fields) == 5:
            self.fdl, self.fastb, self.qualb, self.qltout, group = fields
        else:
            self.fdl, self.fastb, self.qualb, self.qltout = fields
        self.qltout = self.qltout.rstrip()
        self.fqltout = os.path.splitext(self.qltout)[0]+".fais_filtered.qltout"
        self.flowcell, self.date, self.lane = self.fdl.split(".")
        self.flowlane = self.flowcell+"."+self.lane
        self.path_head = os.path.splitext(self.qltout)[0]
        self.sam = self.path_head+".sam"
        self.bam = self.path_head+".bam"
        self.head = ".".join(os.path.basename(self.path_head).split(".")[:2]) # should amount to FLOWCELL.LANE

    def pipe_dir(self):
        """Pipeline output dir for this flowcell (KeyError when unknown)."""
        dir = lane.pipe_dirs_dict[self.flowcell]
        return lane.pipe_dirs_head+"/"+dir

    def set_default_refdict(self):
        # fall back to the standard Hg18 reference dictionary
        self.refdict = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict";

    def set_default_metrics(self):
        # fall back to a generic "unknown" metrics file
        self.metrics = "/broad/1KG/legacy_data/tgtc2sam/unknown.metrics"

    def refdict_metrics(self):
        "Ensures that a reference dictionary and metrics are available"
        try:
            pipe_lane_dir = self.pipe_dir()
        except KeyError:
            # unknown flowcell: use defaults for both files
            self.set_default_refdict()
            self.set_default_metrics()
            return True
        self.metrics = pipe_lane_dir+"/"+str(self.flowlane)+".metrics"
        if os.path.exists(self.metrics):
            # derive the refdict from the reference= line in the metrics file
            line = commands.getoutput("grep '^reference=' "+pipe_lane_dir+"/"+self.flowlane+".metrics")
            if line != "":
                reference_fasta = line.split("=")[1]
                self.refdict = os.path.splitext(reference_fasta)[0]+'.dict'
                if "PhiX" in self.refdict:
                    # PhiX control lanes cannot be converted; record failure
                    failed_list.append(FailedConversion(self, "PhiX dictionary missing"))
                    return False
            else:
                self.set_default_refdict()
            return True # We were able to generate refdict and metrics files
        else:
            self.set_default_refdict()
            self.set_default_metrics()
            return True # We were able to generate refdict file and substituted unknown.metrics as the metrics file

    def __str__(self):
        return self.fdl
def usage():
    """Print command-line help for tgtc_sam.py and exit with status 1.
    (Python 2 only: bare print statements.)"""
    print "Usage: tgtc_sam.py <command> <command_args>..."
    print
    print " Works from tgtc files which contain fastb, qualb, qltout filenames corresponding to one lane of data"
    print " Commands and command arguments:"
    print " convert tgtc_list - given a tgtc file converts files to SAM and BAM"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " [clean] - optional 3rd argument that will create SAM/BAM files"
    print " regardless of whether they already exist"
    print
    print " merge tgtc output_file - merges converted BAM files"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " output_file - path and name of merged BAM file"
    print
    print " lists list_of_tgtc_lists - run over a list of tgtc lists given in a file"
    print " list_of_tgtc_files - file with tgtc filenames in it"
    print
    print " Global options:"
    print " print_only: do not run commands, just print what they would be "
    sys.exit(1)
def make_lane_list(tgtc_file):
all_lanes = [lane(l) for l in open(tgtc_file) if not l.startswith("#")]
lanes = []
for ln in all_lanes:
if ln.refdict_metrics(): # if we were able to generate refdict and metrics
# append this lane to the lanes list that we will work with
lanes.append(ln)
unpaired = [l for l in lanes if ".end1." not in l.qualb and ".end2." not in l.qualb]
paired1 = [l for l in lanes if ".end1." in l.qualb]
paired2 = [l for l in lanes if ".end2." in l.qualb]
print "Unpaired lanes:",len(unpaired)
print "Paired 1 lanes:",len(paired1)
print "Paired 2 lanes:",len(paired2)
assert len(paired1) == len(paired2)
return [(u, None) for u in unpaired] + zip(paired1, paired2)
def convert_lanes(tgtc_file, recreate_all=False, just_print_commands=False):
lanes_done = 0
lane_list = make_lane_list(tgtc_file)
print "Beginning processing of these lanes (unpaired and paired):"
print "\n".join(["%s %s" % (p1,p2) for p1,p2 in lane_list])
for p1, p2 in lane_list:
if recreate_all or not os.path.exists(p1.sam) or not os.path.getsize(p1.sam):
# Filer reads with > 4 errors
cmd_str = "\"FilterAlignsILTStyle QLTIN="+p1.qltout+" QLTOUT="+p1.fqltout
if p2 == None:
# This is an UNPAIRED LANE
print "Processing unpaired lane",p1
cmd_str += (" && Qltout2SAM"+
" HEAD="+p1.head+
" SAM="+p1.sam+
" QLTOUT="+p1.fqltout+
" FASTB="+p1.fastb+
" QUALB="+p1.qualb+
" REFDICT="+p1.refdict+
" METRICS="+p1.metrics+
" ALIGNER=PairFinder"+
" TMPDIR=/broad/hptmp/andrewk")
else:
# This is a PAIRED LANE
print "Processing paired lanes",p1,p2
if p1.flowcell != p2.flowcell or p1.lane != p2.lane:
print "### WARNING ### paired lanes ",p1,p2,"do not match! Skipping."
continue
# Filter 2nd Qltout file too
cmd_str += " && FilterAlignsILTStyle QLTIN="+p2.qltout+" QLTOUT="+p2.fqltout
cmd_str += (" && QltoutPair2SAM"+
" HEAD="+p1.head+
" SAM="+p1.sam+
" QLTOUT1="+p1.fqltout+" QLTOUT2="+p2.fqltout+
" FASTB1="+p1.fastb+" FASTB2="+p2.fastb+
" QUALB1="+p1.qualb+" QUALB2="+p2.qualb+
" REFDICT="+p1.refdict+
" METRICS="+p1.metrics+
" ALIGNER=PairFinder"+
" TMPDIR=/broad/hptmp/andrewk")
# Make the BAM file
cmd_str += " && /seq/dirseq/samtools/current/samtools import "+lane.bam_ref_list+" "+p1.sam+" "+p1.bam+"\""
# Now remove one or both SAM files if all previous commands returned 0 for success
cmd_str += " && rm -f "+p1.sam
if p2 != None: cmd_str += " && rm -f "+p2.sam
status = cmd(cmd_str, "long", output_head=p1.path_head, just_print_commands=just_print_commands)
if status == 0:
lanes_done += 1
else:
failed_list.append("Unkown failure")
#if lanes_done + len(failed_list) > 0:
#break
failed = len(failed_list)
lanes_total = lanes_done+failed
percent_done = lanes_done / lanes_total if lanes_done != 0 else 0
print "Lane conversion success: %d/%d (%.1f%%)" % (lanes_done, lanes_total, percent_done)
def merge_bams(tgtc_file, outfile, just_print_commands=False):
input_lane_list = make_lane_list(tgtc_file)
#lane_list = [lp for lp in lane_list if lp[0].flowcell =="30LLT"]
missing_files = False
lane_list = []
for p1,p2 in input_lane_list:
if "adapted" not in p1.qualb or (p2 != None and "adapted" not in p2.qualb):
print "NOT ADAPTED", p1.qualb, p2.qualb
if not os.path.exists(p1.bam):
#print "Missing file:",p1.bam
missing_files = True
else:
lane_list.append( (p1,p2) )
#return
if not missing_files:
if not os.path.exists(outfile):
print len(lane_list),"lanes / lane pairs to merge"
if not os.path.exists(os.path.dirname(outfile)):
os.makedirs(os.path.dirname(outfile))
cmd("java -cp /home/radon01/depristo/dev/workspace/Libraries/bin"+
" edu.mit.broad.picard.sam.MergeSamFiles"+
" "+" ".join(["I="+p1.bam for p1,p2 in lane_list])+
" O="+outfile+
" TMP_DIR=/broad/hptmp/andrewk/"+
" && /seq/dirseq/samtools/current/samtools index "+outfile+
" && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
"long", outfile, just_print_commands=just_print_commands)
# Original samtools merge command below that was used for rev. 1 merging
#cmd("/seq/dirseq/samtools/current/samtools merge "+outfile+
# " "+" ".join([p1.bam for p1,p2 in lane_list])+
# " && /seq/dirseq/samtools/current/samtools index "+outfile+
# " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
# "long", outfile)
else:
# Convert the lanes that had missing outfiles
#convert_lanes(tgtc_file)
pass
def process_tgtc_lists(tgtc_list_file):
"Works through a list of tgtc files and merges them to produce the output"
lists_done = 0
for line in open(tgtc_list_file):
if line[0] == '#':
continue
fields = line.split()
if len(fields) != 2:
continue
head, tgtc_list = fields
bam = head+".bam"
if True: #not os.path.exists(bam):
print "Processing tgtc_list_file",tgtc_list,"with goal of producing",bam
merge_bams(tgtc_list, bam)
print
else:
print bam,"already exists\n"
lists_done += 1
#if lists_done > 0:
# break
if __name__ == "__main__":
if len(sys.argv) < 2:
usage()
operation = sys.argv[1]
# Check for print commands only option
if "print_only" in sys.argv:
just_print_commands = True
sys.argv.remove("print_only")
else:
just_print_commands = False
if operation == "convert":
if len(sys.argv) < 3:
usage()
tgtc_file = sys.argv[2]
# Run convert_lanes and recreate_all == True if clean option was given as 3rd arg
convert_lanes(tgtc_file, len(sys.argv) > 3 and sys.argv[3] == "clean", just_print_commands=just_print_commands)
elif operation == "merge":
if len(sys.argv) < 4:
usage()
tgtc_file = sys.argv[2]
outfile = sys.argv[3]
merge_bams(tgtc_file, outfile, just_print_commands=just_print_commands)
elif operation == "lists":
tgtc_list_file = "/broad/1KG/legacy_data/tgtc2sam/tgtc_lists_2_legacy_bams"
if len(sys.argv) >= 3:
tgtc_list_file = sys.argv[2]
process_tgtc_lists(tgtc_list_file)
else:
print "Didn't understand operation value: "+operation
usage()