Deleting old python code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4961 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3362f0c280
commit
e64a300642
|
|
@ -1,3 +0,0 @@
|
|||
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation/ -q gsa /broad/1KG/DCC_merged/lists/low_coverage_freeze5.list -n /broad/1KG/DCC_merged/lists/naids_and_pop.txt
|
||||
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s /broad/1KG/DCC_merged/lists/trios_freeze5.list
|
||||
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import os,sys
|
||||
from farm_commands import cmd
|
||||
|
||||
def run_paired_BWA(bam):
|
||||
"Takes a paired end BAM file and outputs an aligned '.aligned.bam' file "
|
||||
lane = 0
|
||||
root = os.path.splitext(bam)[0]
|
||||
|
||||
# BAM to BFQ
|
||||
cmd("/seq/software/picard/1.21/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.")
|
||||
|
||||
# BFQ to FQ
|
||||
root1 = root+"."+str(lane)+".0.1"
|
||||
root2 = root+"."+str(lane)+".0.2"
|
||||
bfq1 = root1+".bfq"
|
||||
bfq2 = root2+".bfq"
|
||||
fq1 = root1+".fq"
|
||||
fq2 = root2+".fq"
|
||||
|
||||
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1)
|
||||
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2)
|
||||
|
||||
# Run BWA
|
||||
sai1 = root1+".sai"
|
||||
sai2 = root2+".sai"
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1)
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2)
|
||||
|
||||
# Pairing
|
||||
alignedsam = root+".aligned.sam"
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam)
|
||||
|
||||
# SAM to BAM
|
||||
bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
|
||||
alignedbam = root+".aligned.bam"
|
||||
cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print "Usage: AlignBam.py BAM_FILE"
|
||||
sys.exit()
|
||||
bam = sys.argv[1]
|
||||
run_paired_BWA(bam)
|
||||
|
||||
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
import os,sys
|
||||
from farm_commands import cmd
|
||||
|
||||
def run_BWA(bam, paired=True):
|
||||
"Takes a paired end BAM file and outputs an aligned '.aligned.bam' file "
|
||||
lane = 0
|
||||
root = os.path.splitext(bam)[0]
|
||||
print root
|
||||
|
||||
# BAM to BFQ
|
||||
cmd("/seq/software/picard/current/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.")
|
||||
|
||||
# BFQ to FQ
|
||||
root1 = root+"."+str(lane)+".0.1"
|
||||
root2 = root+"."+str(lane)+".0.2"
|
||||
bfq1 = root1+".bfq"
|
||||
bfq2 = root2+".bfq"
|
||||
fq1 = root1+".fq"
|
||||
fq2 = root2+".fq"
|
||||
|
||||
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1)
|
||||
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2)
|
||||
|
||||
# Run BWA
|
||||
sai1 = root1+".sai"
|
||||
sai2 = root2+".sai"
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1)
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2)
|
||||
|
||||
# Pairing
|
||||
alignedsam = root+".aligned.sam"
|
||||
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam)
|
||||
|
||||
# SAM to BAM
|
||||
bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
|
||||
alignedbam = root+".aligned.bam"
|
||||
cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam)
|
||||
|
||||
if __name__ == "__main__":
|
||||
bamfiles = sys.argv[1:]
|
||||
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import os,sys
|
||||
from farm_commands import cmd
|
||||
|
||||
def get_line_number_from_file(filename, line_num):
|
||||
for index, line in enumerate(open(filename)):
|
||||
if index == line_num:
|
||||
return line
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
from optparse import OptionParser
|
||||
class OptionParser (OptionParser):
|
||||
def check_required (self, opts):
|
||||
for opt in opts:
|
||||
option = self.get_option(opt)
|
||||
|
||||
# Assumes the option's 'default' is set to None!
|
||||
if getattr(self.values, option.dest) is None:
|
||||
self.error("%s option not supplied" % option)
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("-q", "--queue", help="Queue to submit jobs to (default: long)", dest="queue", default="long")
|
||||
parser.add_option("-l", "--lane", help="Lane number to process", dest="lane", default="")
|
||||
parser.add_option("-t", "--tmpdir", help="Temp directory to use", dest="tmpdir")
|
||||
parser.add_option("-b", "--barcodes", help="File with barcodes as \"Id[TABa]barcode_sequence\" with one header line", dest="barcodes")
|
||||
parser.add_option("-a", "--baits", help="Bait interval list (default: 1KG bait list)", dest="ti", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.baits.interval_list")
|
||||
parser.add_option("-r", "--targets", help="Target interval list (default: 1KG target list)", dest="ti", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list")
|
||||
(options, args) = parser.parse_args()
|
||||
parser.check_required(("-l","-t","-b"))
|
||||
|
||||
solexa_dir = "."
|
||||
|
||||
barcodes = []
|
||||
barcode_file = open(options.barcodes) #"../barcode_sequences.txt")
|
||||
barcode_file.readline()
|
||||
for line in barcode_file:
|
||||
fields = line.split()
|
||||
barcodes.append(fields[1])
|
||||
|
||||
farm = options.queue
|
||||
lane = options.lane
|
||||
alias = "lane"+str(lane)
|
||||
tmp_dir = options.tmpdir #"/humgen/gsa-scr1/andrewk/tmp/"
|
||||
|
||||
cmd("/seq/software/picard/current/bin/ExtractIlluminaBarcodes.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 METRICS_FILE=metrics.out BARCODE=" + " BARCODE=".join(barcodes))
|
||||
|
||||
cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" O="+alias+".bam SAMPLE_ALIAS="+alias+" RUN_BARCODE="+alias+" TMP_DIR="+tmp_dir)#, farm_queue=farm) #PAIRED_RUN=True
|
||||
|
||||
for barcode in barcodes:
|
||||
id1 = cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 BARCODE="+barcode+" O="+barcode+".bam RUN_BARCODE="+barcode+" SAMPLE_ALIAS=lane5 TMP_DIR="+tmp_dir, farm_queue=farm) #PAIRED_RUN=True
|
||||
|
||||
id2 = cmd( "/home/radon01/andrewk/dev/Sting/trunk/python/AlignBam.py "+barcode+".bam", farm_queue=farm, waitID = id1)
|
||||
id3 = cmd("/seq/software/picard/1.21/bin/MergeSamFiles.jar I="+barcode+".aligned.bam O="+barcode+".aligned.merged.bam VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id2)
|
||||
id4 = cmd("java -Xmx4096m -jar /seq/software/picard/1.21/bin/MarkDuplicates.jar VALIDATION_STRINGENCY=SILENT I= "+barcode+".aligned.merged.bam O="+barcode+".aligned.duplicates_marked.bam M="+barcode+".aligned.bam.duplicates_marked.bam.duplicate_metrics TMP_DIR="+tmp_dir, farm_queue=farm, waitID=id3)
|
||||
cmd("java -jar /seq/software/picard/current/bin/CalculateHsMetrics.jar BI="+options.bait_list+" TI="+options.target_list+" I="+barcode+".aligned.duplicates_marked.bam M="+barcode+".hs_metrics VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id4)
|
||||
|
||||
|
||||
fout = open("hybrid_selection.metrics", "w")
|
||||
print >> fout, "BARCODE\t"+get_line_number_from_file(barcodes[0]+".hs_metrics", 6).rstrip()
|
||||
for barcode in barcodes:
|
||||
print >> fout, barcode+"\t"+get_line_number_from_file(barcode+".hs_metrics", 7).rstrip()
|
||||
|
||||
|
|
@ -1,357 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys, itertools, FlatFileTable
|
||||
from enum import Enum
|
||||
|
||||
def subset_list_by_indices(indices, list):
|
||||
subset = []
|
||||
for index in indices:
|
||||
subset.append(list[index])
|
||||
return subset
|
||||
|
||||
def chunk_generator(record_gen, key_fields):
|
||||
"""Input:
|
||||
record_gen: generator that produces dictionaries (records in database speak)
|
||||
key_fields: keys in each dictionary used to determine chunk membership
|
||||
Output:
|
||||
locus_chunk: list of consecutive lines that have the same key_fields"""
|
||||
|
||||
locus_chunk = []
|
||||
last_key = ""
|
||||
first_record = True
|
||||
for record in record_gen:
|
||||
key = [record[f] for f in key_fields]
|
||||
if key == last_key or first_record:
|
||||
locus_chunk.append(record)
|
||||
first_record = False
|
||||
else:
|
||||
if locus_chunk != []:
|
||||
yield locus_chunk
|
||||
locus_chunk = [record]
|
||||
last_key = key
|
||||
yield locus_chunk
|
||||
|
||||
|
||||
class call_stats:
|
||||
def __init__(self, call_type, coverage): #, acc_conf_calls, conf_call_rate, cum_corr_calls, cum_calls, coverage):
|
||||
self.call_type = call_type
|
||||
self.coverage = coverage
|
||||
self.calls = 0
|
||||
self.conf_ref_calls = 0
|
||||
self.conf_het_calls = 0
|
||||
self.conf_hom_calls = 0
|
||||
self.conf_var_calls = 0
|
||||
self.conf_genotype_calls = 0
|
||||
self.conf_refvar_calls = 0
|
||||
self.correct_genotype = 0
|
||||
self.correct_refvar = 0
|
||||
|
||||
|
||||
def add_stat(self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar):
|
||||
#print self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar
|
||||
self.calls += calls
|
||||
self.conf_ref_calls += conf_ref_calls
|
||||
self.conf_het_calls += conf_het_calls
|
||||
self.conf_hom_calls += conf_hom_calls
|
||||
self.conf_var_calls += conf_var_calls
|
||||
self.conf_genotype_calls += conf_genotype_calls
|
||||
self.conf_refvar_calls += conf_refvar_calls
|
||||
self.correct_genotype += correct_genotype
|
||||
self.correct_refvar += correct_refvar
|
||||
|
||||
@staticmethod
|
||||
def calc_discovery_stats(chunk):
|
||||
conf_calls = 0
|
||||
correct_genotype = 0
|
||||
calls = 0
|
||||
for record in chunk:
|
||||
calls += 1
|
||||
if float(record["BtrLod"]) >= 5 or call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"]) == call_type.call_types_2_state.Reference and float(record["BtnbLod"]) >= 5:
|
||||
conf_calls += 1
|
||||
if call_type.discovery_call_correct(record):
|
||||
correct_genotype += 1
|
||||
|
||||
return correct_genotype, conf_calls, calls
|
||||
|
||||
@staticmethod
|
||||
def calc_genotyping_stats(chunk):
|
||||
conf_calls = 0
|
||||
correct_genotype = 0
|
||||
calls = 0
|
||||
for record in chunk:
|
||||
calls += 1
|
||||
if float(record["BtnbLod"]) >= 5:
|
||||
conf_calls += 1
|
||||
if call_type.genotyping_call_correct(record):
|
||||
correct_genotype += 1
|
||||
|
||||
return correct_genotype, conf_calls, calls
|
||||
|
||||
@staticmethod
|
||||
def stats_header():
|
||||
return "TrueGenotype,Coverage,AccuracyConfidentGenotypingCalls,ConfidentGenotypingCallRate,AccuracyConfidentDiscoveryCalls,ConfidentDiscoveryCallRate,Calls,ConfRefCalls,ConfHetCalls,ConfHomCalls,ConfGenotypeCalls,CorrectGenotypes,ConfVarCalls,ConfDiscoveryCalls,CorrectDiscovery"
|
||||
|
||||
def __str__(self):
|
||||
return ",".join(map(str, (self.calls, self.conf_ref_calls, self.conf_het_calls, self.conf_hom_calls, self.conf_genotype_calls, self.correct_genotype, self.conf_var_calls, self.conf_refvar_calls, self.correct_refvar)))
|
||||
|
||||
def stats_str(self):
|
||||
return "%s,%d,%.5f,%.5f,%.5f,%.5f,%s" % (self.call_type, self.coverage, float(self.correct_genotype) / max(self.conf_genotype_calls,1), float(self.conf_genotype_calls) / max(self.calls,1), float(self.correct_refvar) / max(self.conf_refvar_calls,1), float(self.conf_refvar_calls) / max(self.calls,1), self.__str__())
|
||||
|
||||
class call_type:
|
||||
"""Class that returns an Enum with the type of call provided by a record"""
|
||||
call_types_3_state = Enum("HomozygousSNP","HeterozygousSNP","HomozygousReference")
|
||||
call_types_3_state_short = Enum("Hom","Het","Ref")
|
||||
call_types_2_state = Enum("Variant","Reference")
|
||||
|
||||
@staticmethod
|
||||
def from_record_3_state(ref, genotype): # record):
|
||||
"""Given reference base as string, determine whether called genotype is homref, het, homvar"""
|
||||
#ref = record["ReferenceBase"][0]
|
||||
#genotype = record["HapmapChipGenotype"]
|
||||
return call_type.call_types_3_state[genotype.count(ref)]
|
||||
|
||||
@staticmethod
|
||||
def from_record_2_state(ref, genotype):
|
||||
"""Given reference base as string, determine whether called genotype is ref or var"""
|
||||
#ref = record["ReferenceBase"][0]
|
||||
#genotype = record["HapmapChipGenotype"]
|
||||
return call_type.call_types_2_state[0] if genotype.count(ref) < 2 else call_type.call_types_2_state[1]
|
||||
|
||||
@staticmethod
|
||||
def genotyping_call_correct(record):
|
||||
return record["HapmapChipGenotype"] == record["BestGenotype"]
|
||||
|
||||
|
||||
@staticmethod
|
||||
def discovery_call_correct(record):
|
||||
return call_type.from_record_2_state(record["ReferenceBase"][0], record["HapmapChipGenotype"]) == call_type.from_record_2_state(record["ReferenceBase"][0], record["BestGenotype"])
|
||||
|
||||
def print_record_debug(rec):
|
||||
print "TEST Genotyping:", call_type.genotyping_call_correct(rec)
|
||||
print "TEST Discovery:", call_type.discovery_call_correct(rec)
|
||||
print "DIFF:", call_type.genotyping_call_correct(rec) != call_type.discovery_call_correct(rec)
|
||||
print "\n".join([" %20s => '%s'" % (k,v) for k,v in sorted(rec.items())])
|
||||
print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["HapmapChipGenotype"])
|
||||
print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["BestGenotype"])
|
||||
print
|
||||
|
||||
def aggregate_stats(filename, max_loci, table_filename, debug):
|
||||
aggregate = dict()
|
||||
fout = open(table_filename,"w")
|
||||
fout.write(call_stats.stats_header()+"\n")
|
||||
|
||||
locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
|
||||
for index, locus_chunk in enumerate(locus_gen):
|
||||
if index >= max_loci:
|
||||
break
|
||||
if (index % 1000) == 0:
|
||||
sys.stderr.write( str(index)+" loci processed, at: "+locus_chunk[0]["Sequence"]+":"+locus_chunk[0]["Position"]+"\n")
|
||||
|
||||
covs = dict()
|
||||
coverage_chunk_gen = chunk_generator(locus_chunk, ("DownsampledCoverage", "Sequence", "Position"))
|
||||
for cov_chunk in coverage_chunk_gen:
|
||||
first_record = cov_chunk[0]
|
||||
#stat = call_stats.calc_stats(cov_chunk)
|
||||
|
||||
for record in cov_chunk:
|
||||
hapmap_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["HapmapChipGenotype"])
|
||||
key = hapmap_genotyping_call_type, int(record["DownsampledCoverage"])
|
||||
#key = call_type.from_record_3_state(record)#, int(record["DownsampledCoverage"])
|
||||
record["DownsampledCoverage"] = int(record["DownsampledCoverage"])
|
||||
record["HapmapChipCallType"] = hapmap_genotyping_call_type
|
||||
value = record
|
||||
|
||||
correct_genotype, conf_genotype_calls, genotype_calls = call_stats.calc_genotyping_stats([record])
|
||||
correct_refvar, conf_refvar_calls, refvar_calls = call_stats.calc_discovery_stats([record])
|
||||
assert(genotype_calls == refvar_calls)
|
||||
|
||||
conf_ref_calls = 0
|
||||
conf_het_calls = 0
|
||||
conf_hom_calls = 0
|
||||
best_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["BestGenotype"])
|
||||
if conf_genotype_calls:
|
||||
if best_genotyping_call_type.index == 0: conf_hom_calls = 1
|
||||
if best_genotyping_call_type.index == 1: conf_het_calls = 1
|
||||
if best_genotyping_call_type.index == 2: conf_ref_calls = 1
|
||||
|
||||
conf_var_calls = 0
|
||||
if conf_refvar_calls:
|
||||
this_variant_call_type = call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"])
|
||||
conf_var_calls = 1 if this_variant_call_type.index == 0 else 0
|
||||
|
||||
aggregate.setdefault(key, call_stats(*key))
|
||||
#print ",".join(map(str,(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar)))
|
||||
aggregate[key].add_stat(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar)
|
||||
|
||||
if debug:# and conf_refvar_calls:
|
||||
print "KEYS:",key
|
||||
print_record_debug(record)
|
||||
|
||||
|
||||
for key, records in sorted(aggregate.items()):
|
||||
fout.write(records.stats_str()+"\n")
|
||||
|
||||
fout.close()
|
||||
#return aggregate
|
||||
|
||||
class weighted_avg:
|
||||
|
||||
def __init__(self):
|
||||
self.sum = 0.0
|
||||
self.count = 0
|
||||
|
||||
def add(self, value, counts):
|
||||
self.sum += value*counts
|
||||
self.count += counts
|
||||
|
||||
def return_avg(self):
|
||||
return float(self.sum) / max(self.count,1)
|
||||
|
||||
def stats_from_hist(options, depth_hist_filename, stats_filename, variant_eval_dir, depth_multiplier=1.0):
|
||||
|
||||
#hist_zero = {"CallType" : ,"Coverage","AccuracyConfidentCalls","ConfidentCallRate","CumCorrectCalls","CumCalls","CumgCorrectFraction"}
|
||||
#prob_genotype = [1e-5, 1e-3, 1-1e-3]
|
||||
#prob_genotype = [0.37, 0.62, .0]
|
||||
#prob_genotype = [0.203, 0.304, .491]
|
||||
#prob_genotype = [0.216, 0.302, .481]
|
||||
prob_genotype = [0.205, 0.306, 0.491] # Based on CEU NA12878 actual hapmap chip calls
|
||||
#prob_genotype = [0.213, 0.313, 0.474] # Based on YRB NA19240 actual hapmap chip calls
|
||||
theta = 1.0/1850 # expected heterozygosity
|
||||
prob_genotype = [0.5 * theta, 1.0 * theta, 1 - 1.5*theta] # Based on CEU NA12878 actual hapmap chip calls
|
||||
|
||||
|
||||
hist = []
|
||||
hist_gen = FlatFileTable.record_generator(depth_hist_filename, sep=" ", skip_until_regex_line = "^depth count freq")
|
||||
for index, record in enumerate(hist_gen):
|
||||
assert int(record["depth"]) == index
|
||||
hist.append(int(record["count"]))
|
||||
|
||||
# If upsampling is not done in the CoverageEval GATK module, the number of observations of reads
|
||||
# with high depth of coverage can be very low giving
|
||||
|
||||
stats_dict = dict()
|
||||
stats_gen = FlatFileTable.record_generator(stats_filename, sep=",")
|
||||
for record in stats_gen:
|
||||
key1 = int(record["Coverage"])
|
||||
key2 = record["TrueGenotype"]
|
||||
#print key2
|
||||
stats_dict.setdefault(key1, dict()) # create a nested dict if it doesn't exist
|
||||
stats_dict[key1][key2] = record # create an entry for these keys
|
||||
|
||||
#highest_homref_calls = 0
|
||||
#if record["TrueGenotype"] == "HomozygousReference":
|
||||
# calls = int(record["Calls"])
|
||||
# if calls > highest_homref_calls:
|
||||
# highest_homref_calls = calls
|
||||
|
||||
acc = dict()
|
||||
call_rate = dict()
|
||||
conf_calls = dict()
|
||||
|
||||
start = 1
|
||||
end = 1000
|
||||
max_usable_depth = 40 # Depth of coverage beyond which stats are not sampled enough and we take the stat at this depth instead
|
||||
for depth, depth_count in enumerate(hist[start:end+1],start): # For Cd = depth count
|
||||
#print "DEPTH: "+str(depth)
|
||||
try:
|
||||
depth = max(int(float(depth*depth_multiplier)),1)
|
||||
if depth > max_usable_depth: # Ensure that high entries with bad stats use a good stat from a depth that we now is well sampled
|
||||
depth = max_usable_depth
|
||||
depth_entries = stats_dict[depth]
|
||||
except KeyError:
|
||||
print "Stopped on depth",depth
|
||||
break
|
||||
if True:
|
||||
for true_genotype, stat in depth_entries.items(): # For t (SNP type) = true_genotype
|
||||
#print "TRUE_GENOTYPE: "+str(true_genotype)
|
||||
for genotype in call_type.call_types_3_state:
|
||||
conf_calls.setdefault(genotype, 0.0)
|
||||
prob_conf_x_call = float(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"])/float(stat["Calls"])
|
||||
conf_calls[genotype] += depth_count * prob_conf_x_call * prob_genotype[genotype.index]
|
||||
#if genotype.index == 1:
|
||||
# print "%.5f " % prob_conf_x_call, depth, depth_count, conf_calls[genotype], int(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"]), int(stat["Calls"])
|
||||
|
||||
acc.setdefault(true_genotype,weighted_avg())
|
||||
call_rate.setdefault(true_genotype,weighted_avg())
|
||||
acc[true_genotype].add(float(stat["AccuracyConfidentGenotypingCalls"]), depth_count)
|
||||
call_rate[true_genotype].add(float(stat["ConfidentGenotypingCallRate"]), depth_count)
|
||||
|
||||
import numpy
|
||||
for genotype in call_type.call_types_3_state:
|
||||
print "%19s accuracy : %.3f" % (str(genotype), acc[str(genotype)].return_avg())
|
||||
print "%19s call rate: %.3f" % (str(genotype), call_rate[str(genotype)].return_avg())
|
||||
|
||||
print "\nExpected performance given perfect accuracy and call rate:"
|
||||
print "%19s %7s %7s %7s" % ("", "Actual", "Perfect", "Diff.")
|
||||
total_hist_sites = numpy.sum(hist)
|
||||
total_predicted = 0
|
||||
for genotype in call_type.call_types_3_state:
|
||||
predicted = conf_calls[genotype]
|
||||
total_predicted += predicted
|
||||
perfect = prob_genotype[genotype.index]*total_hist_sites
|
||||
diff = perfect - predicted
|
||||
percent_of_possible = predicted / perfect * 100
|
||||
print "%19s calls: %8.0f %8.0f %8.0f %8.1f" % (genotype, predicted, perfect, diff, percent_of_possible)
|
||||
#repl_string += "%s %.0f\n" % (genotype, predicted)
|
||||
print " Total calls: %7d" % total_predicted
|
||||
|
||||
#stats_gen = FlatFileTable.record_generator(stats_filename, sep=",")
|
||||
#for chunk in chunk_generator(stats_gen, key_fields=("True_Genotype")):
|
||||
|
||||
print "\nCoverage histogram mean: %.2f" % numpy.average(range(len(hist)), weights=hist)
|
||||
|
||||
# STEM AND LEAF PLOT
|
||||
|
||||
# If VariantEval directory given, compare with those results
|
||||
if options.variant_eval_file != None:
|
||||
vareval_file = open(options.variant_eval_file)
|
||||
num_hets = None; num_homs = None
|
||||
for line in vareval_file:
|
||||
if "UNKNOWN_CALLED_VAR_HET_NO_SITES" in line: num_hets = line.rstrip().split()[2]
|
||||
if "UNKNOWN_CALLED_VAR_HOM_NO_SITES" in line: num_homs = line.rstrip().split()[2]
|
||||
|
||||
if options.oneline_stats != None:
|
||||
oneline_stats = open(options.oneline_stats, "w")
|
||||
pred_hom = conf_calls[call_type.call_types_3_state[0]]
|
||||
pred_het = conf_calls[call_type.call_types_3_state[1]]
|
||||
print >>oneline_stats, "%s %.0f %s %.0f %s" % (options.oneline_stats, pred_hom, num_homs, pred_het, num_hets)
|
||||
|
||||
def usage(parser):
|
||||
parser.print_usage()
|
||||
sys.exit()
|
||||
|
||||
if __name__ == "__main__":
|
||||
from optparse import OptionParser
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("-c", "--genotype_call_file", help="GELI file to use in generating coverage stat table", dest="genotype_filename", )
|
||||
parser.add_option("-s", "--stats_file", help="file to containing empirical genotyping stats", dest="stats_filename", default=None)
|
||||
parser.add_option("-g", "--histogram_file", help="file containing counts of each depth of coverage", dest="hist_filename")
|
||||
parser.add_option("-m", "--max_loci", help="maximum number of loci to parse (for debugging)", default=sys.maxint, dest="max_loci", type="int")
|
||||
parser.add_option("-e", "--evaluate", help="evaluate genotypes; requires a stats file and a histogram file", default=False, dest="evaluate_genotypes", action="store_true")
|
||||
parser.add_option("-d", "--debug", help="provide debugging output", default=False, dest="debug", action="store_true")
|
||||
parser.add_option("-p", "--depth_multiplier", help="multiply all depths in histogram by this value; for \"downsampling\" depth", default=1.0, dest="depth_multiplier", type=float)
|
||||
parser.add_option("-v", "--variant_eval_file", help="file with output of VariantEval genotype concordance to compare this prediction to", default=None, dest="variant_eval_file")
|
||||
parser.add_option("-o", "--oneline_stats_file", help="output single, tabular line of stats to this file", default=None, dest="oneline_stats")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
if not options.evaluate_genotypes:
|
||||
print "Creating performance tables from genotypes file"
|
||||
#if options.stats_filename == None:
|
||||
# sys.exit("Must provide -s stats fliname option")
|
||||
if options.genotype_filename == None:
|
||||
sys.exit("Must provide -c genotype call filename option")
|
||||
stats_filename = options.stats_filename if options.stats_filename != None else options.genotype_filename+".stats"
|
||||
aggregate_stats(options.genotype_filename, options.max_loci, options.stats_filename, options.debug)
|
||||
else:
|
||||
print "Evaluating genotypes"
|
||||
print "Depth multiplier:",options.depth_multiplier
|
||||
if options.hist_filename == None:
|
||||
sys.exit("Must provide -g histogram filename option")
|
||||
if options.stats_filename == None:
|
||||
sys.exit("Must provide -s stats fliname option")
|
||||
stats_from_hist(options, options.hist_filename, options.stats_filename, options.variant_eval_file, options.depth_multiplier)
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,141 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys, os
|
||||
from farm_commands import cmd
|
||||
|
||||
#if __name__ == "__main__":
|
||||
from optparse import OptionParser
|
||||
class OptionParser (OptionParser):
|
||||
|
||||
def check_required (self, opts):
|
||||
for opt in opts:
|
||||
option = self.get_option(opt)
|
||||
|
||||
# Assumes the option's 'default' is set to None!
|
||||
if getattr(self.values, option.dest) is None:
|
||||
self.error("%s option not supplied" % option)
|
||||
|
||||
def exists_and_non_zero(file):
|
||||
return os.path.exists(file) and os.path.getsize(file) != 0
|
||||
|
||||
def process_bam(bam_file):
|
||||
gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar "
|
||||
output_basepath = os.path.splitext(bam_file)[0]
|
||||
output_basefile = os.path.basename(output_basepath)
|
||||
|
||||
print "Input BAM file:",bam_file
|
||||
|
||||
using_hapmapchip = None
|
||||
if False:
|
||||
# Config for chr1 of pilot 2 people
|
||||
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
|
||||
#interval_string = "1"
|
||||
|
||||
# Assure existance of intervals file
|
||||
intervals_file = output_basepath+".hapmap.intervals.chr1"
|
||||
if not exists_and_non_zero(intervals_file):
|
||||
Cmd = "awk -F' ' '{ if ($1 == \""+interval_string+"\") {print $1 \":\" $4} }' "+hapmapchip_file+" > "+intervals_file
|
||||
cmd(Cmd, die_on_fail = True)
|
||||
|
||||
# Setup hapmapchip GFFs; assure that hapmap chip file is here
|
||||
using_hapmapchip = True
|
||||
hapmapchip_file = output_basepath+".hapmapchip.gff"
|
||||
if not exists_and_non_zero(hapmapchip_file):
|
||||
sys.exit("Expected to find "+hapmapchip_file+" but it's not around")
|
||||
|
||||
else:
|
||||
# Config for pilot 3 Broad data
|
||||
#ref = " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
#intervals_file = "/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list"
|
||||
|
||||
# Config for pilot 3 DCC data (12/16/09 r. 2375)
|
||||
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
|
||||
intervals_file = "/humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list"
|
||||
|
||||
# Assure that BAM index exists
|
||||
bam_index_file = output_basepath+".bam.bai"
|
||||
if not exists_and_non_zero(bam_index_file):
|
||||
Cmd = "samtools index "+bam_file
|
||||
cmd(Cmd, die_on_fail = True)
|
||||
|
||||
# Run UnifiedGenotyper calls
|
||||
vcf_outfile = output_basepath+".calls.vcf"
|
||||
if not exists_and_non_zero(vcf_outfile):
|
||||
print "Running UnifiedGenotyper"
|
||||
Cmd = gatk_exe + ref + " -T UnifiedGenotyper"+\
|
||||
" -I "+bam_file+\
|
||||
" -varout "+vcf_outfile+\
|
||||
" -L "+intervals_file+\
|
||||
" -fmq0 -l INFO "
|
||||
#" --genotype -lod 5"
|
||||
|
||||
#+interval_string+\
|
||||
cmd(Cmd, die_on_fail=True)
|
||||
|
||||
# Run VariantEval on UnifiedGenotyper calls
|
||||
varianteval_outfile = output_basepath+".varianteval"
|
||||
if not exists_and_non_zero(varianteval_outfile):
|
||||
print "Running VariantEval with output to "+varianteval_outfile
|
||||
Cmd = gatk_exe + ref + " -T VariantEval"+\
|
||||
" -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\
|
||||
" -B eval,VCF,"+ssg_outfile+\
|
||||
" -o "+varianteval_outfile+\
|
||||
" -L "+intervals_file+\
|
||||
" -G -V"+\
|
||||
" --evalContainsGenotypes"+\
|
||||
" -minPhredConfidenceScore 150"+\
|
||||
" -fmq0 -l INFO"
|
||||
if using_hapmapchip:
|
||||
Cmd = Cmd + " -hc "+hapmapchip_file
|
||||
|
||||
cmd(Cmd, die_on_fail=True)
|
||||
|
||||
# Run CoverageHist on BAM file
|
||||
hist_outfile = output_basepath+".hapmap.coverage_hist"
|
||||
if not exists_and_non_zero(hist_outfile):
|
||||
print "Running CoverageHist on "+bam_file
|
||||
Cmd = gatk_exe + ref + " -T DepthOfCoverage"+\
|
||||
" -I "+bam_file+\
|
||||
" -o "+hist_outfile+\
|
||||
" -L "+intervals_file+\
|
||||
" -histogram --suppressLocusPrinting"+\
|
||||
" -fmq0 -l INFO"
|
||||
|
||||
cmd(Cmd, die_on_fail=True)#,"gsa", output_basepath)
|
||||
|
||||
# Now do CoverageEval predictions if they haven't been done
|
||||
oneline_stats_file = output_basepath+".oneline_stats"
|
||||
if not exists_and_non_zero(oneline_stats_file):
|
||||
Cmd = "CoverageEval.py -e"+\
|
||||
" -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/current/NA12878.chr1.full.stats"+\
|
||||
" -g "+hist_outfile+\
|
||||
" -v "+varianteval_outfile+\
|
||||
" -o "+oneline_stats_file
|
||||
cmd(Cmd, die_on_fail=True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = OptionParser()
|
||||
parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None)
|
||||
parser.add_option("-a", "--all_bams", help="Process all BAMs in current directory", default=False, dest="all_bams", action="store_true")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
if not options.all_bams:
|
||||
parser.check_required(("-b",))
|
||||
process_bam(options.bam_file)
|
||||
else:
|
||||
bams = [f for f in os.listdir(".") if f.endswith(".bam")]
|
||||
print bams
|
||||
for bam in bams:
|
||||
process_bam(bam)
|
||||
#print
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
#!/util/bin/python
|
||||
|
||||
import math, string, sys
|
||||
|
||||
for input_file in sys.argv[1:]:
|
||||
|
||||
FILENAME = input_file
|
||||
|
||||
# Read the depth of coverage distribution from file
|
||||
depth = []; count = []
|
||||
for line in open(FILENAME):
|
||||
try:
|
||||
int(line[0])
|
||||
split = string.split(line,' ')
|
||||
if int(split[1]) != 0:
|
||||
depth.append(int(split[0]))
|
||||
count.append(int(split[1]))
|
||||
except ValueError: pass
|
||||
|
||||
# Calculate the mode
|
||||
mode = depth[count.index(max(count))]
|
||||
|
||||
# Find the index of maximum extent of 'good' data
|
||||
idx = depth.index(mode); dist = 10**9
|
||||
while abs(count[idx] - count[0]) < dist:
|
||||
dist = abs(count[idx] - count[0])
|
||||
idx += 1
|
||||
model_count = count[:idx + 1]
|
||||
|
||||
# Calculate mean & variance of 'good' data
|
||||
model_tot = sum(model_count); adder = 0
|
||||
for i in range(len(model_count)): adder += depth[i]*model_count[i]
|
||||
mean = adder/float(model_tot)
|
||||
|
||||
adder = 0
|
||||
for i in range(len(model_count)): adder += model_count[i]*(mean - depth[i])**2
|
||||
var = adder/float(model_tot)
|
||||
stdev = var**0.5
|
||||
|
||||
# Output Thresholds
|
||||
print FILENAME
|
||||
print 'Mean Depth of Coverage: ' + str(mean)
|
||||
print 'Plus One Std Dev: ' + str(mean + stdev)
|
||||
print 'Plus Two Std Dev: ' + str(mean + 2*stdev)
|
||||
print 'Plus Three Std Dev: ' + str(mean + 3*stdev)
|
||||
print 'Plus Four Std Dev: ' + str(mean + 4*stdev)
|
||||
print 'Plus Five Std Dev: ' + str(mean + 5*stdev)
|
||||
|
|
@ -1,309 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Evaluates mapping results produced my RunMapper.py by comparig
|
||||
# the result to the original simulated reads in a sam file
|
||||
|
||||
import getopt, sys, os, re
|
||||
from operator import attrgetter, itemgetter
|
||||
from subprocess import Popen, STDOUT, PIPE
|
||||
from SAM import SAMIO
|
||||
from pushback_file import pushback_file
|
||||
from qltout import qltout_file
|
||||
from aln_file import aln_file
|
||||
|
||||
|
||||
#!/usr/bin/env python
|
||||
|
||||
import string, sys
|
||||
|
||||
class SAMRecordWrapper:
|
||||
def __init__(self, rec):
|
||||
#print 'Rec', rec
|
||||
self.rec = rec
|
||||
|
||||
def id(self): return self.rec.getQName()
|
||||
def contig(self): return self.rec.getRname()
|
||||
def pos(self): return self.rec.getPos()
|
||||
def offset(self): return self.pos()-1
|
||||
def map_qual(self): return self.rec.getMapq()
|
||||
|
||||
def __str__(self): return str(self.rec)
|
||||
|
||||
def SAMIOWrapper( file ):
|
||||
print 'SAMIOWrapper', file
|
||||
return SAMIO( file, debugging=True, func=SAMRecordWrapper)
|
||||
|
||||
class mapper_info:
|
||||
def __init__(self, name, ext, file_class):
|
||||
self.name = name.lower()
|
||||
self.ext = ext
|
||||
self.file_class = file_class
|
||||
|
||||
mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file),
|
||||
mapper_info("merlin", "qltout", qltout_file),
|
||||
mapper_info("swmerlin", "qltout", qltout_file),
|
||||
mapper_info("bwa", "sam", SAMIOWrapper),
|
||||
mapper_info("bwa32", "sam", SAMIOWrapper),
|
||||
mapper_info("maq", "aln.txt", aln_file) ]
|
||||
mapper_defs = dict( [(m.name, m) for m in mapper_defs] )
|
||||
|
||||
def Comp_sam_map(comp_head, mapper, file_output, sam_indel):
|
||||
|
||||
# sam_indel is a +/- number corresponding to the size of insertions / deletions
|
||||
# in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always
|
||||
# translate to the SAM's cigar, but the full value should be used
|
||||
|
||||
file_output_head = comp_head+"."+mapper.name
|
||||
if file_output:
|
||||
# if non-false, output to filename in file_output_head
|
||||
# if "" or False, leave output going to stdout
|
||||
#saveout = sys.stdout
|
||||
feval = open(file_output_head+".eval", "w")
|
||||
#sys.stdout = feval
|
||||
|
||||
# create a file with a one-line tabular output of stats
|
||||
# (for merging into Excel / R input)
|
||||
fevaltab = open(file_output_head+".tab", "w")
|
||||
|
||||
# if maq file, convert aln.map to aln.txt
|
||||
aln_txt_filename = file_output_head+"."+mapper.ext
|
||||
if mapper.name == "maq" and not os.path.exists(aln_txt_filename):
|
||||
maq_exe = "/seq/dirseq/maq-0.7.1/maq"
|
||||
|
||||
# SHOULD BE THIS
|
||||
#cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
|
||||
|
||||
cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
|
||||
|
||||
print >> sys.stderr, "Executing "+cmd_str
|
||||
fout = open(aln_txt_filename, "w")
|
||||
p = Popen( cmd_str , shell=True, stdout=fout)
|
||||
|
||||
map_filename = comp_head+"."+mapper.name+"."+mapper.ext
|
||||
print 'map_filename', map_filename
|
||||
map_file = mapper.file_class(map_filename)
|
||||
|
||||
SAM_filename = comp_head+".sam"
|
||||
sam_file = SAMIO(SAM_filename, debugging=True)
|
||||
|
||||
#sams = [s for s in sam_file]
|
||||
print "Reading map file..."
|
||||
maps = [q for q in map_file]
|
||||
|
||||
print "Reading SAM file..."
|
||||
sams = {}
|
||||
if mapper.name in ["maq", 'bwa', 'bwa32']:
|
||||
for s in sam_file:
|
||||
sams[s.getQName()] = s
|
||||
else:
|
||||
for i, s in enumerate(sam_file):
|
||||
sams[i] = s
|
||||
|
||||
#maps = {}
|
||||
#for m in map_file:
|
||||
# maps[m.id()] = m
|
||||
#sams = [s for s in sam_file]
|
||||
|
||||
class smq_t:
|
||||
def __init__(self, sam, map, qual):
|
||||
self.sam = sam # SAM record (exists for all reads)
|
||||
self.map = map # Mapping record, may be None
|
||||
self.qual = qual # Quality of mapping
|
||||
|
||||
print "Making SMQ records..."
|
||||
SMQ = []
|
||||
for maprec in maps:
|
||||
samrec = sams.get(maprec.id()) # get mapping otherwise None
|
||||
SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) )
|
||||
|
||||
print "Sorting..."
|
||||
SMQ.sort(key=attrgetter('qual'), reverse=True)
|
||||
|
||||
print "Evaluating placements..."
|
||||
placed = 0
|
||||
incorrectly_placed = 0
|
||||
correctly_placed = 0
|
||||
fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" )
|
||||
if len(SMQ):
|
||||
last_qual = SMQ[0].qual # grab 1st qual
|
||||
for smq in SMQ:
|
||||
if smq.sam == None:
|
||||
continue
|
||||
|
||||
if smq.qual != last_qual:
|
||||
total_reads = len(sams)#placed + not_placed
|
||||
not_placed = len(sams) - placed
|
||||
fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
|
||||
#print "Wrote all reads with qual >= "+str(last_qual)
|
||||
last_qual = smq.qual
|
||||
|
||||
placed += 1
|
||||
|
||||
# Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases
|
||||
# inserted / deleted - e.g. +1, -2
|
||||
#cigar = smq.sam.getCigar()
|
||||
#indelled_bases = 0
|
||||
#match = re.search(r'(\d+)I', cigar)
|
||||
#if match:
|
||||
# indelled_bases += int(match.groups()[0])
|
||||
#match = re.search(r'(\d+)D', cigar)
|
||||
#if match:
|
||||
# indelled_bases -= int(match.groups()[0])
|
||||
|
||||
# We consider a read properly aligned if
|
||||
# - both start positions are the same OR
|
||||
# - the end position of the mapping = end position of the samq
|
||||
#if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos():
|
||||
|
||||
try:
|
||||
map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq
|
||||
# are generally at the beginning of an alignment spanning the anchor base
|
||||
except AttributeError:
|
||||
map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care
|
||||
# if its sam.py interface doesn't have access to indel data since there are
|
||||
# no indels in its alignmnets
|
||||
|
||||
if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos():
|
||||
#print >> feval, "Indelled SAM bases: "+str(indelled_bases)
|
||||
try:
|
||||
print >> feval, "Indelled map bases: "+str(map_indel_bases)
|
||||
except AttributeError:
|
||||
pass
|
||||
print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos())
|
||||
print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos())
|
||||
print >> feval, "SAM record:\n"+str(smq.sam)
|
||||
print >> feval, "Mapping record:\n"+str(smq.map)+"\n"
|
||||
incorrectly_placed += 1
|
||||
else:
|
||||
correctly_placed +=1
|
||||
|
||||
|
||||
#indexed_maps = [None] * len(sams)
|
||||
#for q in maps:
|
||||
# indexed_maps[q.id()] = q
|
||||
|
||||
#def f(s,q):
|
||||
# if q == None:
|
||||
# return s
|
||||
# else:
|
||||
# return False
|
||||
|
||||
#missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) )
|
||||
|
||||
|
||||
#for miss in missing:
|
||||
# print miss
|
||||
#not_placed = len(missing)
|
||||
#not_placed = len(sams) - len(maps)
|
||||
total_reads = len(sams) #placed + not_placed
|
||||
not_placed = len(sams) - placed
|
||||
|
||||
print >> feval, "Total reads : "+str(total_reads)
|
||||
print >> feval, "+-Placed : "+str(placed)
|
||||
print >> feval, "| +-Correct : "+str(correctly_placed)
|
||||
print >> feval, "| +-Incorrect: "+str(incorrectly_placed)
|
||||
print >> feval, "+-Not placed : "+str(not_placed)
|
||||
|
||||
fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
|
||||
print "Wrote all reads with all quals"
|
||||
|
||||
#if file_output_head:
|
||||
# sys.stdout = saveout
|
||||
|
||||
def remove_escape_codes(s):
|
||||
import re
|
||||
return re.sub(r'\x1b\[(\d\d)?\w', "", s)
|
||||
|
||||
def usage():
|
||||
print "EvalMapping.py\n"
|
||||
print "Required arguments:\n"
|
||||
print " -h HEAD All input and output files start with HEAD"
|
||||
print " --OR--"
|
||||
print " <list of *.sam files on STDIN to use as filename templates>"
|
||||
print " i.e. ls *.sam | EvalMapping.py -m MAPPER"
|
||||
print
|
||||
print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)"
|
||||
print
|
||||
print "Optional arguments:\n"
|
||||
print
|
||||
print " -o Output to screen instead of HEAD.MAPPER.eval"
|
||||
print
|
||||
print "Input files:"
|
||||
print " HEAD.sam"
|
||||
print
|
||||
print "Result files expected for Merlin and ILT:"
|
||||
print " HEAD.MAPPER.qltout"
|
||||
print
|
||||
print "Result files expected for MAQ:"
|
||||
print
|
||||
sys.exit(2)
|
||||
|
||||
if __name__ == "__main__":
|
||||
opts = None
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"])
|
||||
except getopt.GetoptError:
|
||||
usage()
|
||||
|
||||
comp_heads = []
|
||||
mapper_str = ""
|
||||
file_output = True
|
||||
for opt, arg in opts:
|
||||
if opt in ("-h", "--head"):
|
||||
comp_heads = [arg]
|
||||
if opt in ("-m", "--mapper"):
|
||||
mapper_str = arg
|
||||
possible_mappers = mapper_defs.keys()
|
||||
# Check it later now...
|
||||
#if mapper_str not in possible_mappers:
|
||||
# print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers)
|
||||
# sys.exit(2)
|
||||
if opt in ("-o", "--file-output"):
|
||||
file_output = False
|
||||
|
||||
# Check for required args
|
||||
if (mapper_str == ""):
|
||||
usage()
|
||||
elif (mapper_str == "all"):
|
||||
mappers = mapper_defs.values()
|
||||
print mappers
|
||||
else:
|
||||
try:
|
||||
mapper_str_items = mapper_str.split(",")
|
||||
mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items]
|
||||
except KeyError:
|
||||
print "Don't know this mapper given in -m option: "+mapper
|
||||
print "Try: ilt, merlin, maq, bwa"
|
||||
sys.exit(1)
|
||||
|
||||
# if we didn't already get a single file from -h
|
||||
if len(comp_heads) == 0:
|
||||
# and there's a list on STDIN,
|
||||
if not sys.stdin.isatty(): # redirected from file or pipe
|
||||
comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ]
|
||||
comp_heads = [ f for f in comp_heads if len(f) > 0 ]
|
||||
else:
|
||||
# no single file AND no STDIN file list
|
||||
usage()
|
||||
|
||||
print "\nFile heads to evaluate:"
|
||||
print "\n".join(comp_heads)+"\n"
|
||||
|
||||
comp_heads.reverse()
|
||||
for comp_head in comp_heads: #[:1]
|
||||
for mapper in mappers:
|
||||
# Guess the original size of simulated indel from filename
|
||||
m = re.search(r'_den\.(\d+)_', comp_head)
|
||||
if m:
|
||||
if ".DELETION_" in comp_head:
|
||||
indel_size = -1 * int(m.groups()[0])
|
||||
elif ".INSERTION_" in comp_head:
|
||||
indel_size = int(m.groups()[0])
|
||||
else:
|
||||
indel_size = 0
|
||||
else:
|
||||
sys.exit("Couldn't guess simulated read indel size from filename")
|
||||
|
||||
print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head
|
||||
Comp_sam_map(comp_head, mapper, file_output, indel_size)
|
||||
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Utility script to convert FASTA + QUALS file into FASTQ required for MAQ
|
||||
# MAQ needs to convert fastq into bfq file using fastq2bfq in order to use
|
||||
# this data
|
||||
|
||||
import sys, fasta, os
|
||||
from itertools import *
|
||||
|
||||
def usage():
|
||||
print "Usage: fastq.py fasta_in quals_in fastq_out"
|
||||
print " fasta_in: *.fasta file"
|
||||
print " quals_in: *.quals.txt OR *.quala"
|
||||
sys.exit()
|
||||
|
||||
def output_fastq_record(file_handle, fasta, quals):
|
||||
#print 'fasta, quals', fasta, quals
|
||||
#for fasta, qual in zip(fastas, quals):
|
||||
|
||||
qualsVector = map(int, quals.seq.split())
|
||||
|
||||
print >> fqout, "@"+fasta.id
|
||||
print >> fqout, fasta.seq
|
||||
print >> fqout, "+"
|
||||
print >> fqout, "".join( map(lambda q: chr(q+33), qualsVector ) )
|
||||
|
||||
def qualStr2fasta( line ):
|
||||
elts = line.split()
|
||||
id = elts[0]
|
||||
quals = ' '.join(elts[1:])
|
||||
return fasta.fasta_record(id, quals)
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv[1:]) != 3:
|
||||
usage()
|
||||
else:
|
||||
fasta_in_file, quals_in_file, fastq_out_file = sys.argv[1:]
|
||||
|
||||
print "Quals input file: "+quals_in_file
|
||||
print "Fasta input file: "+fasta_in_file
|
||||
|
||||
if os.path.splitext(quals_in_file)[1] == ".txt":
|
||||
qual_factory = imap( qualStr2fasta, file(quals_in_file) )
|
||||
else:
|
||||
if os.path.splitext(quals_in_file)[1] == ".quala":
|
||||
# Create fasta factory from the quala file and treat it
|
||||
# as if it was a fasta file and pull the numbers
|
||||
qual_factory = fasta.fasta_file(quals_in_file, cleanup=False)
|
||||
else:
|
||||
print "Qual input file should be a *.quals.txt or *.quala file"
|
||||
|
||||
print "Fastq ouptut file: "+fastq_out_file
|
||||
fqout = file(fastq_out_file, "w")
|
||||
|
||||
for qual_record, fasta_record in izip_longest(qual_factory, fasta.fasta_file(fasta_in_file)):
|
||||
if qual_record == None or fasta_record == None:
|
||||
# We ran out of one record type, so warn the user!
|
||||
print "WARNING: Different number of quals and fastas in input files!"
|
||||
sys.exit(1)
|
||||
#print 'qual_record', qual_record
|
||||
#print 'fasta_record', fasta_record
|
||||
output_fastq_record(fqout, fasta_record, qual_record)
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys, itertools
|
||||
|
||||
def record_generator(filename, sep="\t", skip_n_lines=0, skip_until_regex_line=""):
|
||||
"""Given a file with field headers on the first line and records on subsequent lines,
|
||||
generates a dictionary for each line keyed by the header fields"""
|
||||
fin = open(filename)
|
||||
|
||||
if skip_n_lines > 0:
|
||||
for i in range(skip_n_lines): # Skip a number of lines
|
||||
fin.readline()
|
||||
|
||||
found_regex = False
|
||||
if skip_until_regex_line != "":
|
||||
import re
|
||||
regex_line = re.compile(skip_until_regex_line)
|
||||
for line in fin:
|
||||
match = regex_line.search(line)
|
||||
if match:
|
||||
found_regex = line
|
||||
break
|
||||
if not found_regex:
|
||||
print "Warning: Regex "+skip_until_regex_line+" not found in FlatFileTable:record_generator"
|
||||
|
||||
if found_regex:
|
||||
header = found_regex.rstrip().split(sep) # Parse header
|
||||
else:
|
||||
header = fin.readline().rstrip().split(sep) # Pull off header
|
||||
|
||||
for line in fin: #
|
||||
fields = line.rstrip().split(sep)
|
||||
record = dict(itertools.izip(header, fields))
|
||||
yield record
|
||||
|
||||
def record_matches_values(record, match_field_values):
|
||||
for match_field, match_values in match_field_values:
|
||||
if record[match_field] not in match_values:
|
||||
return False
|
||||
return True
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
|
||||
|
||||
import sys, os, subprocess
|
||||
|
||||
Index2AffyChr = []
|
||||
Index2AffyChr.append("chrM")
|
||||
for i in range(1,23):
|
||||
Index2AffyChr.append("chr"+str(i))
|
||||
Index2AffyChr.append("chrX")
|
||||
Index2AffyChr.append("chrY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 3:
|
||||
print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
|
||||
sys.exit()
|
||||
else:
|
||||
infile = sys.argv[1]
|
||||
outdir = sys.argv[2]
|
||||
|
||||
output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
|
||||
outfile = open(output_file, "w")
|
||||
|
||||
cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
|
||||
pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
|
||||
|
||||
pipe.readline()
|
||||
for line in pipe:
|
||||
if line[0] not in ('@', '#'):
|
||||
fields = line.split("\t")
|
||||
#print fields
|
||||
try:
|
||||
contig = fields[0] #Index2AffyChr[int(fields[0])]
|
||||
except KeyError, ValueError:
|
||||
print "skipping "+fields[0]
|
||||
continue # Skip entries not in chr 0 through 24
|
||||
|
||||
print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
import string
|
||||
import re
|
||||
import glob
|
||||
import picard_utils
|
||||
import itertools
|
||||
|
||||
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
||||
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
|
||||
def geli2dbsnpFile(geli):
|
||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||
if ext == ".calls":
|
||||
return None
|
||||
else:
|
||||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
|
||||
def SingleSampleGenotyperCmd(bam, geli, use2bp):
|
||||
naid = os.path.split(bam)[1].split(".")[0]
|
||||
metrics = geli + '.metrics'
|
||||
gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
|
||||
hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
|
||||
hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip])
|
||||
if not os.path.exists(hapmapChip):
|
||||
print '*** warning, no hapmap chip resuls for', naid
|
||||
hapmapChipStr = ''
|
||||
|
||||
targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
|
||||
cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr
|
||||
return cmd
|
||||
|
||||
def bams2geli(bams):
|
||||
def call1(bam):
|
||||
geli = os.path.splitext(bam)[0] + '.geli'
|
||||
jobid = 0
|
||||
if OPTIONS.useSSG:
|
||||
if not os.path.exists(geli + '.calls'):
|
||||
cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
|
||||
else:
|
||||
if not os.path.exists(geli):
|
||||
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
|
||||
return geli, jobid
|
||||
calls = map(call1, bams)
|
||||
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
||||
|
||||
def gelis2gelisText( gelis ):
|
||||
def geli2geliText( maybeGeli ):
|
||||
if os.path.splitext(maybeGeli)[1] == ".calls" :
|
||||
return maybeGeli
|
||||
else:
|
||||
return os.path.split(maybeGeli)[1] + '.calls'
|
||||
|
||||
return map( geli2geliText, gelis)
|
||||
|
||||
|
||||
def main():
|
||||
global OPTIONS, ROOT
|
||||
|
||||
usage = "usage: %prog lanes.list nIndividuals [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-q", "--farmQueue", dest="farmQueue",
|
||||
type="string", default=None,
|
||||
help="Farm queue to submit jobs to. Leave blank for local processing")
|
||||
parser.add_option("-l", "--lod", dest="lod",
|
||||
type="float", default=5,
|
||||
help="minimum lod for calling a variant")
|
||||
parser.add_option("-k", "--column", dest="column",
|
||||
type="int", default=1,
|
||||
help="Column in the file with the bam or geli file path")
|
||||
parser.add_option("", "--dry", dest="dry",
|
||||
action='store_true', default=False,
|
||||
help="If provided, nothing actually gets run, just a dry run")
|
||||
parser.add_option("", "--ssg", dest="useSSG",
|
||||
action='store_true', default=False,
|
||||
help="If provided, we'll use the GATK SSG for genotyping")
|
||||
parser.add_option("", "--ssg2b", dest="useSSG2b",
|
||||
action='store_true', default=False,
|
||||
help="If provided, we'll use 2bp enabled GATK SSG ")
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
type="string", default='/dev/stdout',
|
||||
help="x")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 2:
|
||||
parser.error("incorrect number of arguments: " + str(args))
|
||||
lines = [line.strip().split() for line in open(args[0])]
|
||||
nIndividuals = int(args[1])
|
||||
|
||||
outputFile = open(OPTIONS.output, 'w')
|
||||
print >> outputFile, '#', ' '.join(sys.argv)
|
||||
|
||||
data = map( lambda x: x[OPTIONS.column-1], lines )
|
||||
if os.path.splitext(data[0])[1] == '.bam':
|
||||
gelis, jobids = bams2geli(data)
|
||||
if filter(lambda x: x <> 0, jobids) <> []:
|
||||
# there's still work to do
|
||||
sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
|
||||
# TODO: Should add a wait here for all farm jobs to finish...
|
||||
print 'gelis', gelis
|
||||
print 'jobids', jobids
|
||||
else:
|
||||
gelis = map( lambda x: x[OPTIONS.column-1], lines )
|
||||
jobids = [None] * len(gelis)
|
||||
|
||||
print 'Geli files'
|
||||
print gelis
|
||||
|
||||
for geli, jobid in zip(gelis, jobids):
|
||||
dbsnpFile = geli2dbsnpFile(geli)
|
||||
if dbsnpFile == None and not os.path.exists(dbsnpFile):
|
||||
dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
|
||||
if jobid == 0: jobid = None
|
||||
farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
# TODO: Should add a wait here for all farm jobs to finish...
|
||||
|
||||
# read in the dbSNP tracks
|
||||
nTotalSnps = 0
|
||||
nNovelSnps = 0
|
||||
for geli in gelis:
|
||||
root, flowcellDotlane, ext = picard_utils.splitPath(geli)
|
||||
#dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
dbsnp_matches = geli2dbsnpFile(geli)
|
||||
print dbsnp_matches
|
||||
if os.path.exists(dbsnp_matches):
|
||||
TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
|
||||
nTotalSnps += int(TOTAL_SNPS)
|
||||
nNovelSnps += int(NOVEL_SNPS)
|
||||
print >> outputFile, '# DATA: ', flowcellDotlane, TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP, dbsnp_matches
|
||||
print >> outputFile, '# DATA: TOTAL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY', nTotalSnps
|
||||
print >> outputFile, '# DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
|
||||
print >> outputFile, '# DATA: AVERAGE DBSNP RATE ACROSS LANES %.2f' % (100.0 * float(nTotalSnps - nNovelSnps) / (max(nTotalSnps, 1)))
|
||||
|
||||
# convert the geli's to text
|
||||
jobid = None
|
||||
variantsOut = gelis2gelisText( gelis )
|
||||
for geli, variantOut in zip(gelis, variantsOut):
|
||||
name = os.path.split(geli)[1]
|
||||
if not os.path.exists(variantOut):
|
||||
cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
||||
|
||||
cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
sortedCallFile = 'all.sorted.calls'
|
||||
cmd = ("~/dev/GenomeAnalysisTK/trunk/perl/sortByRef.pl -k 1 tmp.calls ~/work/humanref/Homo_sapiens_assembly18.fasta.fai > %s" % ( sortedCallFile ) )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
|
||||
|
||||
if not OPTIONS.dry:
|
||||
sortedCalls = [line.split() for line in open(sortedCallFile)]
|
||||
aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
|
||||
|
||||
print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
|
||||
|
||||
for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
|
||||
if snp == None: continue # ignore bad calls
|
||||
#print snp
|
||||
#sharedCalls = list(sharedCallsGroup)
|
||||
#genotype = list(sharedCalls[0][5])
|
||||
print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
|
||||
outputFile.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import subprocess, time, sys
|
||||
|
||||
dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT")
|
||||
|
||||
#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work"
|
||||
#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables"
|
||||
fileroot = sys.argv[1]
|
||||
covar_counts_root = fileroot+".covariate_counts"
|
||||
parameter_root = fileroot+".log_reg"
|
||||
recal_table = fileroot+".recalibration_table"
|
||||
|
||||
# Run all R process to do regression and wait for completion
|
||||
kids = []
|
||||
for dinuc in dinucs:
|
||||
kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc]))
|
||||
#kids.append(subprocess.Popen(["sleep", "8"]))
|
||||
|
||||
while any(kid.poll() is None for kid in kids):
|
||||
time.sleep(0.25)
|
||||
|
||||
fout = file(recal_table, "w")
|
||||
|
||||
fout.write("dinuc_v.1\t")
|
||||
for p in range(5):
|
||||
for q in range(5):
|
||||
fout.write("%d,%d\t" % (p,q))
|
||||
fout.write("\n")
|
||||
|
||||
for dinuc in dinucs:
|
||||
dinin = open(parameter_root+"."+dinuc+".parameters")
|
||||
#dinin.readline()
|
||||
params = []
|
||||
for line in dinin:
|
||||
line.rstrip("\n")
|
||||
params.extend(map(float, line.split()))
|
||||
fout.write(dinuc+"\t")
|
||||
fout.write("\t".join(map(str, params)))
|
||||
fout.write("\n")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,99 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import os,sys
|
||||
|
||||
def exit(msg,errorlevel):
|
||||
"Exit the program with the specified message and error code."
|
||||
print msg
|
||||
sys.exit(errorlevel)
|
||||
|
||||
def open_source_file(source_filename):
|
||||
"Open the source file with the given name. Make sure it's readable."
|
||||
if not os.access(source_filename,os.R_OK):
|
||||
exit("Unable to read covariate counts file '" + sys.argv[1] + "'",1)
|
||||
if not source_filename.endswith('.csv'):
|
||||
exit("Source file is in incorrect format. Must be csv.")
|
||||
return open(source_filename,'r')
|
||||
|
||||
def read_header(source_file):
|
||||
"Read the header from the given source file. Do basic validation."
|
||||
header = source_file.readline().split(',')
|
||||
if header[0] != 'rg' or header[1] != 'dn':
|
||||
exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>",1)
|
||||
return header
|
||||
|
||||
def create_intermediate_file(source_file,header,read_group,dinuc):
|
||||
"Create an intermediate file for a particular read group / dinuc from a given source file"
|
||||
base = source_file.name[:source_file.name.rfind('.csv')]
|
||||
intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc)
|
||||
intermediate_file = open(intermediate_filename,"w")
|
||||
intermediate_file.write(','.join(header[2:]))
|
||||
return intermediate_file
|
||||
|
||||
def open_target_file(target_filename):
|
||||
"Open a target file and write out the header."
|
||||
target_file = open(target_filename,'w')
|
||||
target_file.write("rg\tdinuc\t")
|
||||
for p in range(5):
|
||||
for q in range(5):
|
||||
target_file.write("%d,%d\t" % (p,q))
|
||||
target_file.write("\n")
|
||||
return target_file
|
||||
|
||||
def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):
|
||||
"Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc."
|
||||
base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
|
||||
regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))
|
||||
result = os.system(regression_command)
|
||||
if result != 0:
|
||||
exit('Unable to run linear regression; command was %s, error code was %d' % (regression_command, result),1)
|
||||
parameters_filename = '.'.join((base,dinuc,'parameters'))
|
||||
if not os.access(parameters_filename,os.R_OK):
|
||||
exit("Unable to read output of R from file " + parameters_filename)
|
||||
parameters_file = open(parameters_filename,'r')
|
||||
parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
|
||||
target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
|
||||
# optionally remove the input and output files. For now, keep them around for debugging purposes.
|
||||
# os.remove('.'.join((base,dinuc,'csv')))
|
||||
# os.remove('.'.join((base,dinuc,'parameters')))
|
||||
|
||||
def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
|
||||
"Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results."
|
||||
source_file = open_source_file(source_filename)
|
||||
target_file = open_target_file(target_filename)
|
||||
|
||||
header = read_header(source_file)
|
||||
|
||||
intermediate_file = None
|
||||
read_group = None
|
||||
dinuc = None
|
||||
|
||||
for data_line in source_file:
|
||||
data_line = data_line.strip()
|
||||
if len(data_line) == 0:
|
||||
continue
|
||||
data = data_line.split(',')
|
||||
if read_group != data[0] or dinuc != data[1]:
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
|
||||
read_group,dinuc = data[0:2]
|
||||
intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
|
||||
intermediate_file.write(','.join(data[2:])+'\n')
|
||||
|
||||
if intermediate_file:
|
||||
intermediate_file.close()
|
||||
process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
|
||||
|
||||
source_file.close()
|
||||
target_file.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
# set up sensible defaults for the locations of R and the logistic regression script
|
||||
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
|
||||
logistic_regression_script="resources/logistic_regression.R"
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
|
||||
|
||||
compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from datetime import date
|
||||
import glob
|
||||
import operator
|
||||
import ValidateGATK
|
||||
import picard_utils
|
||||
from MergeBAMsUtils import *
|
||||
|
||||
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
    """Group BAM source files by population.

    Each source path doubles as its own superkey (the NA id is embedded in
    the file name), so every path is paired with itself and handed to
    groupSources together with the NA id -> population map.
    """
    paired = []
    for src in allSources:
        paired.append([src, src])
    return groupSources(paired, NAID2Pop, merged_filename_base)
|
||||
|
||||
if __name__ == "__main__":
|
||||
usage = "usage: %prog files.list [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-q", "--farm", dest="farmQueue",
|
||||
type="string", default=None,
|
||||
help="Farm queue to send processing jobs to")
|
||||
parser.add_option("-d", "--dir", dest="output_dir",
|
||||
type="string", default="./",
|
||||
help="Output directory")
|
||||
parser.add_option("", "--dry", dest="dry",
|
||||
action='store_true', default=False,
|
||||
help="If provided, nothing actually gets run, just a dry run")
|
||||
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||
action='store_true', default=False,
|
||||
help="Ignores already written files, if present")
|
||||
parser.add_option("-s", "--useSamtools", dest="useSamtools",
|
||||
action='store_true', default=False,
|
||||
help="If present, uses samtools to perform the merge")
|
||||
parser.add_option("-m", "--mergeBin", dest="mergeBin",
|
||||
type="string", default=None,
|
||||
help="Path to merge binary")
|
||||
parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
|
||||
type="string", default=None,
|
||||
help="Path to file contains NAID POP names. If provided, input files will be merged by population")
|
||||
parser.add_option("-p", "--pop", dest="onlyPop",
|
||||
type="string", default=None,
|
||||
help="If provided, only this population will be processed")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 1:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
directory = OPTIONS.output_dir
|
||||
|
||||
if not os.path.exists(directory):
|
||||
os.mkdir(directory)
|
||||
|
||||
NAID2Pop = None
|
||||
if OPTIONS.NAIDS2POP <> None:
|
||||
NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)
|
||||
|
||||
for line in open(args[0]):
|
||||
s = line.split()
|
||||
if ( s <> [] and s[0] <> '#' ):
|
||||
merged_filename_base = s[0]
|
||||
allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] )
|
||||
print 'Merging info:'
|
||||
for spec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
|
||||
if OPTIONS.onlyPop <> None and spec.group() <> OPTIONS.onlyPop:
|
||||
continue
|
||||
|
||||
spec.setPath(directory)
|
||||
spec.pprint()
|
||||
|
||||
jobid = None
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
|
||||
output = spec.getMergedBase()
|
||||
cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
|
||||
|
||||
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
|
||||
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)
|
||||
|
||||
|
|
@ -1,210 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from datetime import date
|
||||
import glob
|
||||
import operator
|
||||
import ValidateGATK
|
||||
import picard_utils
|
||||
import operator
|
||||
|
||||
# Default Picard merge tool, used when no mergeBin argument is supplied
# to MergeFilesSpec.mergeCmd().
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
# File extensions for merged BAM files and their samtools indexes.
bam_ext = 'bam'
bam_index_ext = 'bai'
|
||||
|
||||
def readNAIdMap(NAIdFile):
    """Read a whitespace-separated file of 'NAID POPULATION [...]' lines and
    return a dict mapping NA id -> population name.

    Duplicate NA ids are rejected with an assertion.  Progress and summary
    information is printed to stdout.
    """
    m = dict()
    # Only the first two columns of each line are used.
    for data in [line.split() for line in open(NAIdFile)]:
        naid, pop = data[0:2]
        print naid, ' => ', pop
        # Each NA id may appear at most once in the map file.
        assert naid not in m
        m[naid] = pop
    print 'Read NAID->population map'
    print 'Contains', len(m), 'id -> population mappings'
    print 'Distinct populations:', picard_utils.unique(m.values())
    return m
|
||||
|
||||
# (threshold, suffix) pairs, largest first; greek() picks the first
# threshold the size reaches.  (The Python 2-only 'L' literal suffixes were
# dropped -- plain ints auto-promote, and 'L' is a syntax error in Python 3.)
_abbrevs = [
    (1 << 50, 'P'),
    (1 << 40, 'T'),
    (1 << 30, 'G'),
    (1 << 20, 'M'),
    (1 << 10, 'k'),
    (1, '')
    ]

def greek(size):
    """Return a string representing the greek/metric suffix of a size.

    e.g. greek(2048) == '2.0k'.  Uses >= (the original used >) so that
    exact powers pick the larger unit: greek(1 << 20) is now '1.0M'
    instead of the misleading '1024.0k'.  Sizes below 1 fall through to
    the '' suffix.
    """
    for factor, suffix in _abbrevs:
        if size >= factor:
            break
    return '%.1f%s' % (float(size)/factor, suffix)
|
||||
|
||||
class MergeFilesSpec:
    """One planned merge: a named group of source BAMs plus the naming
    helpers that build the merged BAM path, its index path, and the
    commands that produce them."""

    def __init__(self, sources, group, merged_filename_base, path = '' ):
        self.sourceFiles = sources
        self.groupName = group
        self.merged_filename_base = merged_filename_base
        # BUG FIX: was hard-coded to '' so the path argument was silently
        # ignored; honor it, matching what setPath() does.
        self.path = path

    def __str__(self):
        return 'MergeFilesSpec: ' + self.group()

    def group(self):
        """Name of this merge group (e.g. a population or an NA id)."""
        return self.groupName

    def sources(self):
        """List of source BAM paths to be merged."""
        return self.sourceFiles

    def filename(self):
        """Merged file basename (no directory, no extension)."""
        if self.merged_filename_base is not None:
            # BUG FIX: was base + '.'.join([group]) which concatenates with
            # NO separator ('fooceu'); the unit tests in this module expect
            # '<base>.<group>' ('foo.ceu').
            return '.'.join([self.merged_filename_base, self.group()])
        else:
            return self.group()

    def pprint(self):
        """Print a human-readable summary of this merge to stdout."""
        print('--------------------------------------------------------------------------------')
        print('  Population:        %s' % self.group())
        print('  Merged filename:   %s' % self.getMergedBAM())
        print('  N sources:         %s' % len(self.sources()))
        for source in sorted(self.sources()):
            print('    Source:      %s' % source)
        print('    Sizes:       %s' % self.sourceSizes(humanReadable=True))
        print('  Est. merged size:  %s' % greek(sum(self.sourceSizes())))

    def setPath(self, path):
        """Set the output directory for the merged files."""
        self.path = path

    def getMergedBase(self):
        """Output path without the .bam/.bai extensions."""
        return os.path.join(self.path, self.filename())

    def getMergedBAM(self):
        return self.getMergedBase() + '.' + bam_ext

    def getMergedBAMIndex(self):
        return self.getMergedBase() + '.' + bam_ext + '.' + bam_index_ext

    def sourceSizes(self, humanReadable=False):
        """Sizes of the source files in bytes, or greek()-formatted strings
        when humanReadable is set.  Stats the filesystem on every call."""
        sizes = [os.path.getsize(source) for source in self.sources()]
        if humanReadable:
            sizes = [greek(size) for size in sizes]
        return sizes

    def mergeCmd(self, mergeBin = None, MSD = True, useSamtools = False):
        """Command line that merges sources() into getMergedBAM()."""
        if mergeBin is None:
            mergeBin = MERGE_BIN
        return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD, useSamtools = useSamtools)

    def getIndexCmd(self):
        """Command line that indexes the merged BAM with samtools."""
        return "samtools index " + self.getMergedBAM()
|
||||
|
||||
# Very general-purpose operation that returns merge specs given two lists of pairs:
|
||||
# The first list, sources, contains superkey / sourceFile pairs.
|
||||
# The second list, groups, contains key / group pairs.
|
||||
#
|
||||
# This function walks over all source pairs, and for each superkey, it
|
||||
# looks for any key within groups contained within superkey. If it finds it,
|
||||
# it associates sourceFile with the merge group in groups.
|
||||
#
|
||||
# The system requires that superkey match one (and only one) key in groups. It also
|
||||
# requires that each group string be unique. The system can handle groups provided
|
||||
# as a list of pairs of a dictionary.
|
||||
#
|
||||
# The function returns a list of MergeFileSpecs, one for each group with
|
||||
# at least one associated sourceFile.
|
||||
#
|
||||
def groupSources(sources, groups, merged_filename_base):
    """Associate each (superkey, sourceFile) pair with a merge group.

    sources -- list of [superkey, sourceFile] pairs.
    groups  -- None, a dict of key -> group name, or a list of [key, group]
               pairs.  When None, everything lands in a single unnamed spec.
    merged_filename_base -- basename forwarded to each MergeFilesSpec.

    A source joins a group when that group's key occurs as a substring of
    the source's superkey.  A superkey matching no key aborts the program.
    Returns the specs sorted by group name.

    (Fixes: '<>' (removed operator), dict.iteritems() and sorting a
    dict-values view -- both fatal under Python 3 -- and 'find(...) != -1'
    replaced with the 'in' operator.)
    """
    if groups is None:
        # No grouping requested: one spec holding every source file.
        return [MergeFilesSpec([pair[1] for pair in sources], '', merged_filename_base)]
    else:
        specs = dict()
        if isinstance(groups, list): groups = dict(groups)

        for superkey, sourceFile in sources:
            spec = None
            for key, group in groups.items():
                # Substring match: e.g. key 'NA12144' inside a lane superkey.
                if key in superkey:
                    if group in specs:
                        spec = specs[group]
                    else:
                        spec = MergeFilesSpec([], group, merged_filename_base)
                        specs[group] = spec
                    print('Mapping %s %s %s %s' % (group, key, superkey, sourceFile))
                    spec.sourceFiles.append(sourceFile)
            if spec is None:
                sys.exit('File contains an unknown superkey: ' + superkey)
            # NOTE(review): a superkey matching several keys is appended to
            # every matching group; the module comment requires exactly one
            # match but that is not enforced here.
        v = list(specs.values())
        v.sort(key = MergeFilesSpec.group)
        return v
|
||||
|
||||
import unittest
|
||||
class TestMergeBAMsUtils(unittest.TestCase):
    """Unit tests for groupSources() / MergeFilesSpec using small in-memory
    fixtures of NA id -> population lines and lane -> geli-file lines."""

    def setUp(self):
        # cStringIO is Python 2 only; these fixtures mimic real map files.
        import cStringIO
        # Columns: NAID POPULATION FAMILY
        groupsString = """NA10846 ceu CEPH1
NA10847 ceu CEPH1
NA12144 ceu CEPH1
NA12145 ceu CEPH1
NA12146 yri CEPH1
NA12239 yri CEPH1
NA07029 ceu CEPH1
NA07019 ceu CEPH1
NA06994 ceu CEPH1
NA07000 ceu CEPH1
NA07022 ceu CEPH1
NA07056 ceu CEPH1
NA07048 ceu CEPH1
NA06991 ceu CEPH1
NA07034 ceu CEPH1
"""
        # Columns: NAID FLOWCELL LANE PAIRING PROJECT GELI-FILE
        lanesString = """NA10846 30GA9AAXX 1 Paired CEPH 30GA9AAXX.1.observed_genotypes.geli
NA10846 30GA9AAXX 6 Paired CEPH 30GA9AAXX.6.observed_genotypes.geli
NA10847 30GA9AAXX 7 Paired CEPH 30GA9AAXX.7.observed_genotypes.geli
NA12146 30JLTAAXX 2 Paired CEPH 30JLTAAXX.2.observed_genotypes.geli
NA12239 30PNVAAXX 1 Paired CEPH 30PNVAAXX.1.observed_genotypes.geli
NA12144 30PYMAAXX 1 Paired CEPH 30PYMAAXX.1.observed_genotypes.geli
NA12146 30PYMAAXX 7 Paired CEPH 30PYMAAXX.7.observed_genotypes.geli
"""

        # Expected number of lanes per NA id, for testIDGroups below.
        self.laneIDCounts = dict([["NA10846", 2], ["NA10847", 1], ["NA12146", 2], ["NA12239", 1], ["NA12144", 1]])

        pops = [line.strip() for line in cStringIO.StringIO(groupsString)]
        lanes = [line.strip() for line in cStringIO.StringIO(lanesString)]

        print pops
        print lanes

        # [NAID, population] pairs -> groups argument for groupSources().
        self.ids2pop = [line.split()[0:2] for line in pops]
        # [NAID, geli filename] pairs -> sources argument for groupSources().
        self.ids2gelis = [[line.split()[0], line.split()[5]] for line in lanes]
        # Identity map NAID -> NAID, to group lanes by sample id.
        self.ids2ids = dict([[line.split()[0]] * 2 for line in lanes])

    def testPopGroups(self):
        # Grouping by population must yield exactly ceu and yri specs with
        # the expected source counts and '<base>.<group>' file names.
        specs = groupSources(self.ids2gelis, self.ids2pop, 'foo')
        print 'Specs', specs
        self.assertEqual(len(specs), 2)
        self.assertEqual(specs[0].group(), 'ceu')
        self.assertEqual(specs[1].group(), 'yri')

        ceu = specs[0]
        yri = specs[1]
        #print ceu.sources()
        self.assertEqual(len(ceu.sources()), 4)
        self.assertEqual(len(yri.sources()), 3)
        self.assertEqual(ceu.getMergedBAM(), 'foo.ceu.bam')
        self.assertEqual(ceu.getMergedBAMIndex(), 'foo.ceu.bam.bai')
        self.assertEqual(yri.getMergedBAM(), 'foo.yri.bam')
        self.assertEqual(yri.getMergedBAMIndex(), 'foo.yri.bam.bai')

    def testIDGroups(self):
        # Grouping by the identity map must yield one spec per NA id, each
        # with that id's lane count.
        specs = groupSources(self.ids2gelis, self.ids2ids, 'foo')
        self.assertEqual(len(specs), 5)
        for spec in specs:
            print 'Spec', spec
            self.assertEqual(len(spec.sources()), self.laneIDCounts[spec.group()])
            self.assertEqual(spec.getMergedBAM(), 'foo.' + spec.group() + '.bam')
|
||||
|
||||
# Run the MergeBAMsUtils unit tests when this module is executed directly.
if __name__ == '__main__':
    unittest.main()
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
import picard_utils
|
||||
from MergeBAMsUtils import *
|
||||
|
||||
def splitSourcesByKeys( bams, keys ):
    """Group BAM paths by their parallel keys.

    Builds the (superkey, source) pairs expected by groupSources, using
    each key both as superkey and as its own group name, with no merged
    filename base.
    """
    selfGroups = []
    for k in keys:
        selfGroups.append([k, k])
    pairedSources = zip(keys, bams)
    return groupSources(pairedSources, selfGroups, None)
|
||||
|
||||
if __name__ == "__main__":
    usage = """usage: %prog bams.list [options]
Merges BAM files by keys from a file of a list of bams.

bams.list is a whitespace separated file. One column (--keyCol arg) is the key, and another
column (--bamCol) is a path to a bam file. This program will group the bam files
by key and spawn merge and index jobs to merge all of the files sharing the same key together"""

    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                        type="string", default=None,
                        help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="output_dir",
                        type="string", default="./",
                        help="Output directory")
    parser.add_option("", "--dry", dest="dry",
                        action='store_true', default=False,
                        help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                        action='store_true', default=False,
                        help="Ignores already written files, if present")
    parser.add_option("-m", "--mergeBin", dest="mergeBin",
                        type="string", default=None,
                        help="Path to merge binary")
    parser.add_option("", "--MSD", dest="MSD",
                        action='store_true', default=False,
                        help="Merge sequence dictionaries?")
    parser.add_option("", "--keyCol", dest="keyCol",
                        type=int, default=1,
                        help="Column in the list file holding the key")
    parser.add_option("", "--bamCol", dest="bamCol",
                        type=int, default=2,
                        help="Column in the list file holding the bam file path")
    parser.add_option("-l", "--link", dest="link",
                        action='store_true', default=False,
                        help="If true, program will soft link single bam files that don't need merging")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    directory = OPTIONS.output_dir

    if not os.path.exists(directory):
        os.mkdir(directory)

    # Pull the key and bam columns out of the list file.  Column options
    # are 1-based on the command line, hence the -1 below.
    bamsList = [line.strip().split() for line in open(args[0])]
    keys = map( lambda x: x[OPTIONS.keyCol-1], bamsList )
    bams = map( lambda x: x[OPTIONS.bamCol-1], bamsList )

    print 'Merging info:'
    for info in bamsList: print info
    for spec in splitSourcesByKeys(bams, keys):
        spec.setPath(directory)
        spec.pprint()

        # Merge (or just symlink a singleton group when --link is set),
        # then index, chaining the index job on the merge job's farm id.
        jobid = None
        if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
            output = spec.getMergedBase() + '.stdout'
            if len(spec.sources()) == 1 and OPTIONS.link:
                cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
            else:
                cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
            print cmd
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)

        if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
            #pass
            jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)
|
||||
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# merges .tab files produced by EvalMapping.py into a tabular format
|
||||
|
||||
#from __future__ import print_function # python 2.6 / 3.0 print as a function
|
||||
import os,sys,re
|
||||
from rpy2 import *
|
||||
|
||||
def ordered_eval_files(aligner, mutation_type, file_ext):
    """Return (density, filename) pairs for this aligner's eval files.

    Scans the current directory for files named *.<aligner>.<file_ext>
    that mention mutation_type (or are the 'None_den.' baseline), pulls
    the one-or-two digit number after 'den.' out of the name, forces the
    baseline's number to 0, and returns the pairs sorted by number.
    """
    suffix = "." + aligner + "." + file_ext
    numbered = []
    for name in os.listdir("."):
        if not name.endswith(suffix):
            continue
        if mutation_type not in name and "None_den." not in name:
            continue
        value = int(re.search(r'den\.(\d\d?)_', name).groups()[0])
        if "None_den" in name:
            value = 0
        numbered.append((value, name))
    return sorted(numbered)
|
||||
|
||||
def anchor_hist(filename):
    """Parse an eval file and return an R histogram (via rpy2) of the read
    positions embedded in its read ids, or an empty IntVector when the file
    contains no read records."""
    fin = open(filename)
    offsets = []
    read_poses = []

    for line in fin:
        id = None
        fields = line.split()
        # Lines whose first token contains '.read.' carry a read id of the
        # form <something>:<offset>.<...>.read.<...>.<position>.
        if len(fields) > 0 and ".read." in fields[0]:
            id = line.split()[0]
            read_poses.append(id.split(".")[-1])
            offsets.append(id.split(":")[1].split(".")[0])

        #if "read name :" in line:
        #    id = line.split()[3]

    # NOTE(review): read_poses holds strings; they are handed to
    # FloatVector, which presumably coerces them to numeric -- confirm.
    # The offsets list is collected but never used.
    import rpy2.robjects as robj
    if len(read_poses):
        print len(read_poses)
        return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False)
    else:
        print 0
        return robj.IntVector([])
|
||||
|
||||
def estimateCPUTime(filename):
    """Return the largest 'CPU time : N sec' value found in filename.

    Farm job logs may report CPU time several times; the maximum is taken
    as the job's cost.  Returns 0.0 when the file is missing or contains no
    CPU-time lines -- the previous code raised ValueError from max([]) in
    exactly the case its own os.path.exists() guard was meant to handle.
    (Also: '<>' replaced with 'is not None', regex made a raw string.)
    """
    times = []
    if os.path.exists(filename):
        regex = re.compile(r'CPU time\s*:\s+([\d\.]+) sec')

        def extract_time(line):
            # Return the float seconds on a CPU-time line, else None.
            match = regex.search(line)
            if match is not None:
                return float(match.group(1))
            return None

        times = [t for t in map(extract_time, open(filename)) if t is not None]

    return max(times) if times else 0.0
|
||||
|
||||
def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0):
    """Print and return one summary row for an eval table file.

    Scans the tab-separated file for the last line whose leading quality
    cutoff stays above min_qual_cutoff, prints it together with the
    aligner, mutation type, variant count and estimated CPU time, and
    returns the row as a list.  Returns None for files with <= 1 line.
    """
    # CPU time is estimated from the matching .stdout log next to the file.
    filebase = os.path.splitext(filename)[0]
    cpuTime = estimateCPUTime(filebase + '.stdout')

    lines = open(filename).readlines()
    data = None
    if len(lines) > 1:
        da_line = lines[0]
        for line in lines:
            line = line.rstrip()
            fields = line.split("\t")
            # Non-numeric first fields (e.g. a header) are treated as an
            # effectively infinite cutoff so the line is kept.
            # NOTE(review): bare except -- hides everything, not just
            # ValueError; presumably intentional for header lines.
            try:
                qual_cutoff = int(fields[0])
            except:
                qual_cutoff = 1000
            # Stop at the first line at/below the requested cutoff; da_line
            # retains the last line above it.
            if qual_cutoff <= min_qual_cutoff:
                break
            else:
                da_line = line

        #print ("%2d\t" % num_var, file=sys.stderr),
        data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime]
        print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime)
    return data
|
||||
|
||||
def plot_anchor_hists():
    """Render anchor-position histograms for several aligners into
    lotto_histo.png via rpy2.

    NOTE(review): reads mut_type from module scope (set under __main__) --
    this function fails with NameError if called before that assignment.
    """
    import rpy2.robjects as robj
    #robj.r.par( mfrow = robj.RVector([2,2]) )
    #robj.r('X11(width=24, height=15)')
    robj.r('png("lotto_histo.png", width=1500, height=900)')
    robj.r('par(mfrow=c(4,6 ), cin=c(3,3))')

    for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)):
        print aligner, ">"+str(cutoff)

        num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
        #print (*num_file, sep="\n")
        for num, file in num_file:

            # One histogram panel per (aligner, insert size) eval file.
            h = anchor_hist( file )
            title = "{aligner} {num} bp insert".format(aligner=aligner, num=num)
            robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90]))

    robj.r("dev.off()")
    #raw_input("Press any key to exit...")
|
||||
|
||||
|
||||
def analyzeData( data, mut_type ):
    """Placeholder analysis over the merged eval rows.

    Currently defines two helpers that are never invoked and returns 0.
    data is the list of rows produced by print_eval_tables(); mut_type is
    the mutation type being analyzed.
    """
    import rpy2.robjects as robj
    # print name / data name
    aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')]

    def selectData( key ):
        # Keep only rows for this aligner key and mutation type.
        def keepDataP( row ):
            return row[0] == key and row[1] == mut_type
        # BUG FIX: was filter(keyDataP, data) -- 'keyDataP' is undefined,
        # so any call raised NameError.
        return filter( keepDataP, data )

    def placeRatePlot():
        # NOTE(review): mutDensity and placeRate are not defined anywhere
        # in this function or module -- calling this raises NameError.
        robj.r('png("x.png", width=1500, height=900)')
        x = robj.FloatVector(mutDensity)
        y = robj.FloatVector(placeRate)
        robj.r.plot(x, y, xlab="Mutation density / size", col="blue")
        robj.r("dev.off()")

    return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)"
|
||||
sys.exit(1)
|
||||
else:
|
||||
mut_type = sys.argv[1]
|
||||
|
||||
if False:
|
||||
plot_anchor_hists()
|
||||
else:
|
||||
data = []
|
||||
for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)):
|
||||
name = aligner + ">" + str(cutoff)
|
||||
#print (aligner, ">"+str(cutoff))
|
||||
|
||||
num_file = ordered_eval_files(aligner, mut_type, file_ext="tab")
|
||||
#num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
|
||||
for num, file in num_file:
|
||||
datum = print_eval_tables( name, mut_type, num, file, cutoff )
|
||||
if datum <> None:
|
||||
data.append(datum)
|
||||
#print
|
||||
|
||||
analyzeData( data, mut_type )
|
||||
|
||||
|
|
@ -1,221 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Executable files. Please update these to match the installed locations of your tools.
samtools_exe='/seq/dirseq/samtools/current/samtools'
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"

# Any special site-specific arguments to pass the JVM.
jvm_args='-ea'

# Which platforms should the calibration tool be run over?
platforms=['illumina']

# Where to put the output created as part of recalibration.
# If editing, please end this variable with a trailing slash.
output_root = './'

# Location of the resource files distributed with the recalibration tool.
# If editing, please end this variable with a trailing slash.
resources='recalibratorResources/'
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'

import getopt,glob,os,sys
import LogisticRegressionByReadGroup

# if the provided resources path is relative, convert it to a path
# relative to the calling script rather than the cwd.
if not os.path.isabs(resources):
    application_path = os.path.dirname(sys.argv[0])
    resources = os.path.abspath(application_path) + '/' + resources

# Where does the reference live?
reference_base = resources + 'Homo_sapiens_assembly18'
reference = reference_base + '.fasta'
reference_dict = reference_base + '.dict'
reference_fai = reference_base + '.fasta.fai'

# Where does DBSNP live?
dbsnp = resources + 'dbsnp_129_hg18.rod'

# Where are the application files required to run the recalibration?
# NOTE(review): the resource-relative jar below is commented out in favor
# of a hard-coded developer path -- likely a leftover debugging override.
#gatk = resources + 'gatk/GenomeAnalysisTK.jar'
gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar'

# R scripts shipped with the tool: the model fitter and the QC grapher.
logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'

# Assemble the platform list into command-line arguments.
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
|
||||
|
||||
def output_file_needs_update(output_file, input_files = []):
    """Decide whether output_file must be (re)generated.

    True when the module-level --reprocess flag is set or the file does not
    exist.  input_files is accepted for interface compatibility but is
    never consulted.
    """
    if reprocessFiles:
        # Global flag: regenerate everything regardless of what exists.
        return True
    return not os.path.exists(output_file)
|
||||
|
||||
def executeCommand(cmd):
|
||||
if not dryRun:
|
||||
return os.system(cmd)
|
||||
else:
|
||||
print ' => Would execute', cmd
|
||||
return 0
|
||||
|
||||
def exit(msg,errorcode):
    """Print msg to stdout, then terminate the process with errorcode.

    Deliberately shadows the builtin exit() so call sites read naturally.
    """
    print(msg)
    sys.exit(errorcode)
|
||||
|
||||
def check_input_file_available(filename,description):
    """Abort via the module-level exit() helper unless filename is readable.

    description is interpolated into the error message so the user knows
    which required input was missing.
    """
    if os.access(filename,os.R_OK):
        return
    exit('Unable to access %s %s' % (description,filename),1)
|
||||
|
||||
def graph_file(graph_script,graph_data):
    """Graph the given data file with the given R script, leaving the plot
    in the output directory.  Aborts the program if either input is missing
    or the R invocation fails."""
    check_input_file_available(graph_script,'%s R graphing script' % graph_script)
    check_input_file_available(graph_data,'%s graphing data' % graph_data)
    # Rscript <script> <data>, run via the shell (respects --dry).
    result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
    if result != 0:
        exit('Unable to graph data: %s' % graph_data,1)
|
||||
|
||||
def recalibrate():
    """Recalibrate the module-level input BAM into calibrated_bam.

    Three cached stages, each skipped when its output already exists (unless
    --reprocess): (1) run GATK CountCovariates and fit the logistic
    regression in R, (2) apply the fitted correction to the reads with GATK
    LogisticRecalibration, (3) index the calibrated BAM with samtools.
    Relies on the module-level configuration globals (gatk_base_cmdline,
    bam, calibrated_bam, output_dir, dryRun, ...).
    """
    # generate the covariates
    print 'generating covariates'
    linear_regression_results = output_dir + 'linear_regression_results.out'
    if output_file_needs_update(linear_regression_results):
        generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        returncode = executeCommand(generate_covariates)
        if returncode != 0:
            exit('Unable to generate covariates',1)

        # compute the logistic regression
        # (skipped on --dry: the covariate CSV was not actually produced)
        if not dryRun:
            print 'computing the logistic regression'
            LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
    else:
        print 'Logistic recalibration data files already generated', linear_regression_results

    if output_file_needs_update(calibrated_bam):
        # apply the logistic regression, writing the output data to calibrated_bam
        print 'applying the correction to the reads'
        apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
        returncode = executeCommand(apply_logistic_regression)
        if returncode != 0:
            exit('Unable to apply logistic regression',1)
    else:
        print 'Recalibrated BAM file already generated', calibrated_bam

    # index the calibrated bam
    if output_file_needs_update(calibrated_bam + '.bai'):
        print 'indexing the calibrated bam'
        index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
        returncode = executeCommand(index_calibrated_bamfile)
        if returncode != 0:
            exit('Unable to index calibrated bamfile',1)
    else:
        print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'

    print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
|
||||
|
||||
def evaluate():
    """Evaluate recalibration results: re-count covariates over the
    calibrated BAM, then graph the empirical-vs-reported quality CSVs for
    both the initial and recalibrated passes.  Skipped entirely when the
    evaluation output already exists (unless --reprocess)."""
    print 'Evaluating recalibration results'

    # NOTE(review): 'recalibrated' + 'linear_regression_results.out' has no
    # separator ('recalibratedlinear_...'), unlike the 'initial' naming in
    # recalibrate() -- confirm whether this mismatch is intentional.
    recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
    if output_file_needs_update(recalibrated_regression_results):
        # regenerate the covariates
        regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        print 'regenerating covariates'
        returncode = executeCommand(regenerate_covariates)
        if returncode != 0:
            exit('Unable to regenerate covariates',1)

        # Graph the pre-recalibration quality CSVs...
        print 'graphing initial results'
        for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
        # ...and the post-recalibration ones for comparison.
        print 'graphing final results'
        for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
    else:
        print 'Evaluated files already generated', recalibrated_regression_results
|
||||
|
||||
def usage():
    """Print the command-line synopsis and terminate with status 1."""
    message = 'Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>'
    exit(message, 1)
|
||||
|
||||
# Try to parse the given command-line arguments.
try:
    opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot='])
except getopt.GetoptError, err:
    usage()

# An input and an output file is required. Fail if left unspecified.
if len(args) < 2:
    usage()

# Determine whether to evaluate / recalibrate.
recalibrate_requested = False
evaluate_requested = False
reprocessFiles = False
dryRun = False
for opt,arg in opts:
    if opt == '--recalibrate':
        recalibrate_requested = True
    if opt == '--evaluate':
        evaluate_requested = True
    if opt == '--reprocess':
        reprocessFiles = True
    if opt == '--dry':
        dryRun = True
    if opt == '--outputRoot':
        output_root = arg

# Default to 'recalibrate' unless the user specified only evaluate.
do_recalibration = not (evaluate_requested and not recalibrate_requested)
# Only evaluate if the user specifically requested evaluation.
do_evaluation = evaluate_requested

print 'Looking for supplemental files in ' + resources

# check that the input bam file exists, and that the bam is indexed.
bam = args[0]
bam_index = bam + '.bai'

check_input_file_available(bam,'reads file')
check_input_file_available(bam_index,'reads index file')

# parse the user's calibration output file requirements
calibrated_bam = args[1]
calibrated_bam_index = calibrated_bam + '.bai'

# check that the fasta and supporting files are available
check_input_file_available(reference,'reference file')
check_input_file_available(reference_dict,'reference dictionary')
check_input_file_available(reference_fai,'reference index file')

# check that the dbsnp is available
check_input_file_available(dbsnp,'dbsnp file')

# sanity check that the software is available
check_input_file_available(samtools_exe,'samtools')
check_input_file_available(java_exe,'java')
check_input_file_available(R_exe,'R')
check_input_file_available(gatk,'Genome Analysis Toolkit')
check_input_file_available(logistic_regression_script,'logistic regression script')

# make an output directory for temporary files, named after the input bam.
output_dir=output_root+'output.' + bam[bam.rfind('/')+1:bam.rfind('.bam')] + '/'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
# Re-check: mkdir may have failed (e.g. missing parent, permissions).
if not os.path.isdir(output_dir):
    exit('Unable to create output directory ' + output_dir,1)

# assemble the required program arguments shared by every GATK invocation.
gatk_base_cmdline = ' '.join((java_exe,jvm_args,'-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))

if do_recalibration:
    recalibrate()

if do_evaluation:
    evaluate()
|
||||
237
python/SAM.py
237
python/SAM.py
|
|
@ -1,237 +0,0 @@
|
|||
import string
|
||||
import operator
|
||||
from pushback_file import pushback_file
|
||||
|
||||
# Table of recognized SAM header fields, one row per field:
#   [record type, TAG, required?, default, description]
# (Line-continuation backslashes removed -- the open bracket already
# continues the statement.)
SAMHeaderFields = [
    ['HD', 'VN', True,  '0.1.0',      'File format version'],
    ['HD', 'SO', False, None,         'Sort order. Valid values are: unsorted, queryname or coordinate'],
    ['SQ', 'SN', True,  None,         'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'],
    ['SQ', 'LN', True,  None,         'Sequence length'],
    ['SQ', 'AS', False, None,         'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'],
    ['SQ', 'M5', False, None,         'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'],
    ['SQ', 'UR', False, None,         'URI of the sequence'],
    ['SQ', 'SP', False, None,         'Species'],
    ['RG', 'ID', True,  'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'],
    ['RG', 'SM', True,  'SampleA',    'Sample (use pool name where a pool is being sequenced)'],
    ['RG', 'LB', False, None,         'Library'],
    ['RG', 'DS', False, None,         'Description'],
    ['RG', 'PU', False, None,         'Platform unit (e.g. lane for Illumina or slide for SOLiD)'],
    ['RG', 'PI', False, None,         'Predicted median insert size (maybe different from the actual median insert size)'],
    ['RG', 'CN', False, None,         'Name of sequencing center producing the read'],
    ['RG', 'DT', False, None,         'Date the run was produced (ISO 8601 date or date/time)'],
    ['RG', 'PL', False, None,         'Platform/technology used to produce the read'],
    ['PG', 'ID', True,  'ProgramA',   'Program name'],
    ['PG', 'VN', False, None,         'Program version'],
    ['PG', 'CL', False, None,         'Command line'],
]
|
||||
|
||||
def SAMHeaderEncode( type, tag ):
|
||||
return type + '.' + tag
|
||||
|
||||
SAMHeaderDict = dict()
|
||||
for record in SAMHeaderFields:
|
||||
type, tag, req, default, desc = record
|
||||
SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
#
|
||||
# A SAM header is a potentially complex object, so we just punt on it. The only really required
|
||||
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.
|
||||
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
|
||||
# data structure (defined above). All required fields are initialized to their default values
|
||||
#
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
class SAMHeader(dict):
    """Simple container for SAM header fields.

    Only the SQ.SN (sequence name) and SQ.LN (sequence length) fields must be
    user supplied; every other field is initialized to its default from
    SAMHeaderFields and may be overridden through keyword arguments whose keys
    are dotted 'TYPE.TAG' strings, e.g. SAMHeader('chr1', 100, **{'RG.SM': 'NA12878'}).
    """

    def __init__(self, seqName, seqLen, **keys):
        # initialize every known field to its default (None for optional ones)
        self.fields = dict()
        for recType, tag, req, default, desc in SAMHeaderFields:
            self.setField(recType, tag, default)

        # the two fields that have no sensible default and must come from the caller
        self.setField('SQ', 'SN', seqName)
        self.setField('SQ', 'LN', seqLen)

        # keyword overrides; each key must be a dotted 'TYPE.TAG' string
        # (was keys.iteritems(), py2-only)
        for key, value in keys.items():
            recType, tag = key.split('.')
            self.setField(recType, tag, value)

    def isReq(self, type, tag):
        """True if TYPE.TAG is a required header field."""
        return SAMHeaderDict[SAMHeaderEncode(type, tag)][0]

    def setField(self, type, tag, value):
        """Store `value` under the TYPE.TAG key."""
        self.fields[SAMHeaderEncode(type, tag)] = value

    def getField(self, type, tag):
        """Return the stored value for TYPE.TAG (raises KeyError if unknown)."""
        return self.fields[SAMHeaderEncode(type, tag)]

    def __str__(self):
        # NOTE: PG records are intentionally not emitted, matching the
        # original behavior (only HD, SQ and RG lines are written).
        types = ['HD', 'SQ', 'RG']

        def formatType(recType):
            # one '@TYPE<TAB>TAG:value...' line per record type; only
            # required fields are written
            s = '@' + recType + '\t'
            for qtype, tag, req, default, desc in SAMHeaderFields:
                if recType == qtype and self.isReq(recType, tag):
                    value = self.getField(recType, tag)
                    if value is None:
                        # the original raised `Error(...)`, a name that was
                        # never defined (NameError); raise a real exception
                        raise ValueError('Unset required SAM header ' + recType + ' ' + tag)
                    s += tag + ':' + str(value) + '\t'
            return s

        return '\n'.join(map(formatType, types))
|
||||
|
||||
# Bit masks for the SAM FLAG field (bitwise-OR'd together per record).
SAM_SEQPAIRED    = 0x0001   # the read is paired in sequencing, no matter whether it is mapped in a pair
SAM_MAPPAIRED    = 0x0002   # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
SAM_UNMAPPED     = 0x0004   # the query sequence itself is unmapped
SAM_MATEUNMAPPED = 0x0008   # the mate is unmapped 1
SAM_QUERYSTRAND  = 0x0010   # strand of the query (0 for forward; 1 for reverse strand)
SAM_MATESTRAND   = 0x0020   # strand of the mate 1
SAM_ISFIRSTREAD  = 0x0040   # the read is the first read in a pair 1,2
SAM_ISSECONDREAD = 0x0080   # the read is the second read in a pair 1,2
SAM_NOTPRIMARY   = 0x0100   # the alignment is not primary (a read having split hits may have multiple primary alignment records)

# flag bit -> human-readable description (same text as the comments above);
# used by SAMFlagsDescs to pretty-print a FLAG value
SAM_FLAGS = {
    SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair',
    SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1',
    SAM_UNMAPPED : 'the query sequence itself is unmapped',
    SAM_MATEUNMAPPED : 'the mate is unmapped 1',
    SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' ,
    SAM_MATESTRAND : 'strand of the mate 1' ,
    SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' ,
    SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2',
    SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'
    }
|
||||
|
||||
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
    """Factory: build a SAMRecord populated from explicit field values."""
    record = SAMRecord()
    record.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals,
                              pairContig, pairPos, insertSize )
    return record
|
||||
|
||||
def SAMRecordFromString( str ):
    """Factory: parse one tab-separated SAM line into a SAMRecord."""
    # NOTE: the parameter name shadows the builtin str; kept for interface
    # compatibility with existing callers.
    record = SAMRecord()
    record.setValuesFromString( str )
    return record
|
||||
|
||||
def SAMFlagValue( flags, testFlag ):
    """Mask `flags` down to `testFlag`: nonzero iff that bit is set."""
    return flags & testFlag
|
||||
|
||||
def SAMFlagIsSet( flags, testFlag ):
    """Return True iff the `testFlag` bit is set in the `flags` bitmask.

    Fixed: the original used the py2-only `<>` operator.  The mask test is
    inlined (equivalent to SAMFlagValue(flags, testFlag) != 0) so the
    predicate stands alone.
    """
    return (testFlag & flags) != 0
|
||||
|
||||
def SAMFlagsDescs( flags ):
    """Return [flag_bit, is_set, description] triples for every known SAM
    flag, sorted by flag value.

    Fixed: SAM_FLAGS.iteritems() is py2-only; uses items() / a generator
    expression instead of map + a local unpacking helper.
    """
    return sorted([flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]
                  for flagKey, flagDesc in SAM_FLAGS.items())
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
#
|
||||
# This is really the meat of the SAM I/O system.
|
||||
#
|
||||
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
|
||||
# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion
|
||||
# of int quals to ASCII encoded quals.
|
||||
#
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
class SAMRecord:
    """One SAM alignment record.

    Field values are held in self.vals in SAM column order:
    [QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, MRNM, MPOS, ISIZE, SEQ, QUAL],
    so rendering the record as a SAM line is a trivial tab-join.
    Integer quality scores are stored ASCII-33 ("Phred+33") encoded.
    """

    def __init__(self, vals=None):
        # FIX: the original took `vals=[]` (mutable default) and then
        # silently ignored it; now an explicit list of field values is
        # honored, and the default is still an empty record.
        self.vals = [] if vals is None else list(vals)

    def setValuesFromArgs(self, qname, flags, rname, pos, mapq, cigar, seq, quals,
                          pairContig='*', pairPos=0, insertSize=0):
        """Populate the record from explicit field values.

        `flags` is an iterable of SAM_* flag bits; `quals` is a list of
        integer Phred scores.  The sequence is lower-cased on storage.
        """
        self.vals = [qname, self.formatFlags(flags), rname, pos, mapq,
                     cigar, pairContig, pairPos, insertSize, seq.lower(),
                     self.toASCII33(quals)]

    def setValuesFromString(self, line):
        """Populate the record by parsing one whitespace-separated SAM line."""
        formats = [str, int, str, int, int, str, str, int, int, str, str]
        # FIX: materialize as a list (map() is lazy in py3); zip pairs each
        # column with its converter and truncates trailing optional tags
        self.vals = [convert(raw) for convert, raw in zip(formats, line.split())]

    def formatFlags(self, flags):
        """OR an iterable of flag bits into a single FLAG integer."""
        # FIX: was reduce(operator.__or__, ...) -- reduce is no longer a
        # builtin in py3; a plain loop is equivalent and clearer
        b = 0
        for flag in flags:
            b |= flag
        return b

    # --- column accessors ------------------------------------------------
    def getQName(self): return self.vals[0]
    def getFlags(self): return self.vals[1]
    def getRname(self): return self.vals[2]   # Reference sequence name (can be chr1, chr2, etc...)
    def getPos(self):   return self.vals[3]
    def getMapq(self):  return self.vals[4]
    def getCigar(self): return self.vals[5]   # CIGAR string, e.g. "75M2I"
    def getSeq(self):   return self.vals[9]
    def getQuals(self): return self.fromASCII33(self.vals[10])

    def toASCII33(self, quals):
        """Encode a list of integer Phred scores as a Phred+33 string."""
        return ''.join(chr(q + 33) for q in quals)

    def fromASCII33(self, qualStr):
        """Decode a Phred+33 string back to a list of integer scores."""
        return [ord(q) - 33 for q in qualStr]

    def __str__(self):
        # tab-joined SAM line in column order
        return '\t'.join(map(str, self.vals))
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
#
|
||||
# Wrapper class for reading and writing records to a SAM file
|
||||
#
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
class SAMIO:
    """Thin reader/writer wrapper around a SAM text file.

    Mode is selected by `header`: header=None opens fileName for reading (the
    '@' header lines are consumed off the front of the file into self.header);
    a non-None header opens fileName for writing and writes the header
    immediately.  `func`, if given, post-processes every record read.
    """

    def __init__(self, fileName, header=None, debugging=False, func=None):
        self.header = header
        self.debugging = debugging

        # identity transform by default
        if func is None:
            func = lambda x: x
        self.func = func

        if self.header is None:
            # read mode: pushback_file lets readHeader un-read the first
            # non-header line it encounters
            self.fileHandle = pushback_file(fileName, "r")
            self.readHeader()
        else:
            # write mode -- FIX: was the py2-only file() builtin
            self.fileHandle = open(fileName, "w")
            self.writeHeader()

    def close(self):
        self.fileHandle.close()

    def readHeader(self):
        """Accumulate leading '@' lines into self.header, push back the first
        record line, and return True."""
        self.header = ""
        while True:
            line = self.fileHandle.readline()
            if line.startswith("@"):
                self.header += line
            else:
                self.fileHandle.pushback(line)
                return True
        # (the original had an unreachable `return self.header` here)

    def RecordGenerator(self):
        """Yield one func-transformed SAMRecord per remaining line."""
        for line in self.fileHandle:
            line = line.rstrip("\n")
            yield self.func(SAMRecordFromString(line))
        # FIX: the original ended with `raise StopIteration`, which is a
        # RuntimeError inside a generator under PEP 479 (py3.7+); simply
        # falling off the end is the correct way to finish

    def __iter__(self):
        return self.RecordGenerator()

    def writeRecords(self, recordIterator):
        """Write every record produced by an iterator.

        FIX: the original defined writeRecord twice; this iterator-taking
        variant was shadowed (dead code) by the single-record one below, so
        it is exposed here under a new name.
        """
        for record in recordIterator:
            self.writeRecord(record)

    def writeRecord(self, record):
        """Write a single record as one SAM line; returns True."""
        if self.debugging:
            print(record)
        # was `print >> self.fileHandle, record` (py2-only syntax)
        self.fileHandle.write(str(record) + "\n")
        return True

    def writeHeader(self):
        """Write self.header followed by a newline; returns True."""
        if self.debugging:
            print(str(self.header))
        self.fileHandle.write(str(self.header) + "\n")
        return True
|
||||
|
|
@ -1,304 +0,0 @@
|
|||
#!/util/bin/python
|
||||
#
|
||||
#
|
||||
# This file really needs to be cleaned up and functions need to be improved
|
||||
#
|
||||
# (1) utility operations should be moved to a utility file
|
||||
# (2) Need to add support for windows paired reads, information about where
|
||||
# reads come from (which file), and n-base pair loci
|
||||
# (3) Move to C++ based SAM/BAM i/o system
|
||||
# (4) Figure out a way to get the reference to stream, and use a automatically pickled version
|
||||
# (5) Add capability to walk over all reference bases, regardless of coverage. Currently
|
||||
# we only walk over covered bases.
|
||||
#
|
||||
#
|
||||
from optparse import OptionParser
|
||||
import os
|
||||
import sys
|
||||
import random
|
||||
import string
|
||||
import os.path
|
||||
import heapq
|
||||
from itertools import *
|
||||
|
||||
from SAM import *
|
||||
import SimpleSAM
|
||||
#import SimulateReads # utility functions should be imported
|
||||
|
||||
# Global variable containing command line arguments
# (an optparse Values object; populated in main() by parser.parse_args()
# and read throughout this module, e.g. OPTIONS.debug / OPTIONS.maxRecords)
OPTIONS = None
|
||||
|
||||
#
|
||||
# Trivial class that added pushback support to a stream
|
||||
#
|
||||
class peakStream(object):
    """A simple wrapper around a stream that adds an unget operation.

    Elements pushed back via unget() are replayed in FIFO order before the
    underlying stream is consulted again.
    """

    def __init__(self, stream):
        self.ungot = []     # FIFO queue of pushed-back elements
        self.s = stream

    def unget(self, elt):
        """Put elt back so a subsequent next() call will return it."""
        self.ungot.append(elt)

    # we hold the next() operation
    def __iter__(self):
        return self

    def __next__(self):
        """Return the oldest pushed-back element, else pull from the stream.

        Fixes vs the original: the py2-only `<>` operator is gone, the
        py2-only stream.next() call is replaced by next(stream), and the
        py3 iterator protocol (__next__) is implemented.
        """
        if self.ungot:
            return self.ungot.pop(0)
        return next(self.s)

    # keep the py2 iterator protocol working too
    next = __next__
|
||||
|
||||
#
|
||||
# Returns a single, combined stream of SAM records in chromosome order from a list of
|
||||
# input streams, each in reference order
|
||||
#
|
||||
def rawReadStream( ref, inputs ):
    """Return a single iterable yielding SimpleSAM.MappedRead objects in
    reference-position order, merged from several position-sorted input
    streams of SAM records.

    Fixed: itertools.imap does not exist in py3; generator expressions are
    used instead (same lazy semantics).
    """
    # decorate each record with its position so heapq.merge can order them;
    # note the outer iterable of each genexp (`stream`) is bound eagerly
    keyed = [((rec.getPos(), rec) for rec in stream) for stream in inputs]
    # merge lazily by position, then strip the decoration and lift each raw
    # SAM record into a MappedRead
    return (SimpleSAM.MappedReadFromSamRecord(ref, pair[1])
            for pair in heapq.merge(*keyed))
|
||||
|
||||
#
|
||||
# Just wraps the raw read stream objects in a list as they come out
|
||||
#
|
||||
def readStream( ref, inputs ):
    """Like rawReadStream, but each read is wrapped in a one-element list so
    record walkers and loci walkers share a single calling convention
    (walker funcs are applied with *elt).

    Fixed: itertools.imap does not exist in py3; a generator expression has
    the same lazy semantics.
    """
    return ([read] for read in rawReadStream(ref, inputs))
|
||||
|
||||
#
|
||||
# More complex stream object. Takes a list of input streams and creates a stream
|
||||
# returning successive loci covered by the reads in the combined input stream.
|
||||
#
|
||||
class lociStream():
    """A streaming data structure that returns reads spanning each covered loci in
    the input reads, offsets into them
    where the bases are equivalent, and the position of the locus.

    Iteration yields (ref, chr, pos, offsets, window) tuples, one per covered
    position, where `window` is the list of reads covering pos and
    `offsets[i]` is the index of pos inside window[i].  Only covered bases
    are visited (see the file-header TODO (5)).
    """
    def __init__( self, ref, inputs ):
        self.ref = ref
        # merged, position-sorted read stream with one-element pushback so
        # expandWindow can stop exactly at the first non-covering read
        self.rs = peakStream(rawReadStream( ref, inputs ))
        self.window = []          # reads covering the current position

        # current locus; None until the first read is pulled
        self.chr = None
        self.pos = None

    def __iter__(self):
        return self

    def pushRead( self, read ):
        # append a read to the current coverage window
        self.window.append(read)

    def cleanWindow( self, chr, pos ):
        """Walk over the window of reads, deleting those who no longer cover the current pos"""
        if OPTIONS.debug:
            print 'cleanWindow start:', chr, pos, self.window

        def keepReadP( read ):
            # keep only reads on this contig whose span still includes pos
            return read.chr == chr and pos >= read.start and pos <= read.end
        self.window = filter( keepReadP, self.window )

        if OPTIONS.debug:
            print 'cleanWindow stop:', chr, pos, self.window

    def expandWindow( self, chr, pos ):
        """Keep reading from the read stream until we've added all reads from the stream covering pos"""
        if OPTIONS.debug:
            print 'expandWindow start:', chr, pos, self.window
        for read in self.rs:
            #print 'read', read, pos
            if read.chr == chr and read.start <= pos and read.end >= pos:
                self.pushRead(read)
            else:
                # first read past pos: push it back for the next locus
                # (relies on the input stream being position-sorted)
                self.rs.unget( read )
                #self.rs = chain( [read], self.rs )
                break
        if OPTIONS.debug:
            print 'expandWindow stop:', chr, pos, self.window


    #
    # This is the workhorse routine.  It constructs a window of reads covering the
    # next locus in the genome, and returns the reference, the contig, its position
    # in the genome, along with a vector of offsets into the list of reads that denote
    # the equivalent base among all the reads and reference.
    #
    def next(self):
        if self.pos <> None:
            self.cleanWindow(self.chr, self.pos)    # consume reads not covering pos
            self.expandWindow(self.chr, self.pos)   # add reads to window covering pos

        if self.window == []:
            # the window is empty, we need to jump to the first pos of the first read in the stream:
            # (StopIteration from the underlying stream propagates out and
            # ends the iteration)
            nextRead = self.rs.next()
            self.pushRead( nextRead )
            self.chr = nextRead.chr
            self.pos = nextRead.start
            # recurse once to emit the locus at the new position
            return self.next()
        else:
            # at this point, window contains all reads covering the pos, we need to return them
            # and the offsets into each read for this loci
            def calcOffset( read ):
                # index of the current locus within this read
                offset = self.pos - read.start
                return offset

            offsets = map(calcOffset, self.window)
            currPos = self.pos
            self.pos += 1    # we are going to try to get the next position
            return self.ref, self.chr, currPos, offsets, self.window
|
||||
|
||||
#
|
||||
# Reference reader
|
||||
#
|
||||
def readRef(referenceFasta):
    """Read a FASTA file into a dict mapping sequence name -> lowercase sequence.

    The name is everything after '>' on a header line, stripped; sequence
    lines are concatenated with whitespace removed.

    Fixes vs the original: py2-only `<>` operator; the file is now closed
    (with-statement); an empty input no longer produces a bogus
    ref[None] = '' entry; sequence chunks are joined once instead of being
    accumulated with quadratic `seq += line`.
    """
    ref = {}
    header = None
    chunks = []
    with open(referenceFasta) as fh:
        for line in fh:
            if line.startswith('>'):
                # flush the previous record, if any
                if header is not None:
                    ref[header] = ''.join(chunks).lower()
                chunks = []
                header = line[1:].strip()
            else:
                chunks.append(line.strip())
    # flush the final record (skipped entirely for an empty file)
    if header is not None:
        ref[header] = ''.join(chunks).lower()
    return ref
|
||||
|
||||
#
|
||||
# Main() procedure
|
||||
#
|
||||
def main():
    """Entry point: parse options, dynamically load the named walker class,
    build a record or loci stream over the input SAM files, and run the
    walker's filter/map/reduce over the stream."""
    # ROOT is declared but never assigned or read in this function --
    # presumably a leftover; confirm before removing
    global OPTIONS, ROOT

    # ------------------------------------------------------------
    #
    # Setup command line options
    #
    # ------------------------------------------------------------
    usage = "usage: %prog [options] Walker [sam or bam file or file list]+"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                      type="string", default=None,
                      help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-m", "--maxRecords", dest="maxRecords",
                      type=int, default=None,
                      help="Max number of SAM records to process")
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                      action='store_true', default=False,
                      help="Verbose debugging output?")

    (OPTIONS, args) = parser.parse_args()
    print 'args', args
    if len(args) < 2:
        parser.error("Incorrect number of arguments")


    # ------------------------------------------------------------
    #
    # Dynamically load the walker class (the first cmd line arg)
    # and initialize a walker class object
    #
    #
    # load walkers from standard location
    #
    # ------------------------------------------------------------
    # convention: module args[0] defines a class of the same name
    walkerName = args[0]
    walkerModule = __import__(walkerName, globals(), locals(), [], -1)
    walkerClass = walkerModule.__dict__[walkerName]
    walker = walkerClass()

    print walkerName
    print walkerModule
    print walkerClass
    print walker

    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print ' ref: ', OPTIONS.reference
    print ' walker: ', walker
    if walker.hasOption('name'):
        print ' -> ', walker.getOption('name')
    if walker.hasOption('desc'):
        print ' -> ', walker.getOption('desc')
    print '------------------------------------------------------------'


    # ------------------------------------------------------------
    #
    # Initialize the engine
    #
    # ------------------------------------------------------------

    # read the reference
    refs = readRef(OPTIONS.reference)

    # create the low-level SAMIO streams
    # (note: the lambda parameter shadows the py2 builtin `file`)
    files = args[1:]
    inputs = map( lambda file: SAMIO( file, debugging=OPTIONS.debug ), files )

    # build the higher level stream object, either by record or by loci;
    # NOTE(review): if a walker is both (or neither) a record and loci
    # walker, the second `if` wins / `stream` is unbound -- confirm Walker
    # guarantees exactly one is true
    if walker.isRecordWalker():
        stream = readStream( refs, inputs )
    if walker.isLociWalker():
        stream = lociStream( refs, inputs )

    # ------------------------------------------------------------
    #
    # Move the walker object over the stream
    #
    # For each element in the record or loci stream, invoke the walker
    # object on it.  Use filter, map, and reduce to construct the sum
    #
    # ------------------------------------------------------------
    # `sum` shadows the builtin; it is the running reduce accumulator
    sum = walker.reduceDefault
    counter = 0
    for elt in stream:
        counter += 1
        #print 'processing elt', counter, elt, OPTIONS.maxRecords
        if OPTIONS.maxRecords <> None:
            # progress line every 10% of maxRecords
            if (counter * 10) % OPTIONS.maxRecords == 0:
                print ' => ', (counter * 100.0) / OPTIONS.maxRecords, '% done'

            if counter > OPTIONS.maxRecords:
                break
        if walker.filterfunc( *elt ):
            x = walker.mapfunc( *elt )
            sum = walker.reducefunc( x, sum )
    print 'Result is', sum
|
||||
|
||||
if __name__ == "__main__":
    # hard-wired toggle: True => run main() under cProfile and print the
    # top 20 functions by internal time; flip to False for a plain run
    if True:
        import cProfile, pstats
        cProfile.run('main()', 'prof')      # profile data written to ./prof
        stats = pstats.Stats('prof')
        stats.sort_stats('time')
        stats.print_stats(20)
    else:
        main()
|
||||
|
||||
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
#!/util/bin/python
|
||||
|
||||
#from SamWalk import *
|
||||
from SimpleSAM import *
|
||||
import Walker
|
||||
|
||||
# NOTE(review): inheriting from both object and Walker.Walker is fragile --
# if Walker.Walker is an old-style class this base order can raise a
# metaclass/MRO TypeError under python 2; confirm against Walker.py
class SamWalkTest(object, Walker.Walker):
    """Trivial example record walker: counts SAM records
    (filter keeps everything, map returns 1, reduce sums)."""

    def __init__(self):
        # 'byRecord' => the engine feeds this walker one read at a time
        Walker.Walker.__init__( self, 'byRecord', {}, name = 'SamWalkTest' )

    def filterfunc( self, read ):
        """Return true if you want to keep the read for mapping"""
        return True
        #return datum.getPos() % 2 == 0

    def mapfunc( self, read ):
        """Do something to the read, and return a result for reduction"""
        #print datum
        # contribute 1 per read so the reduction yields a record count
        return 1
        #return len(datum.getSeq())
        #return datum.getSeq() + '/'
        #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())

    #reduceDefault = ''
    def reducefunc( self, x, sum ):
        """Take the result of mapping, and previous reduce results in sum, and combine them"""
        #print x
        return x + sum
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
from SAM import *
|
||||
|
||||
def read2align( refSeq, readSeq, chr, pos, cigar ):
    """Produce (reference alignment, read alignment) strings for one read.

    `refSeq` maps contig name -> sequence; the reference alignment is the
    slice of the contig directly under the read, and the read alignment is
    the read bases themselves.

    NOTE: only works for non-indel containing reads right now -- the cigar
    argument is accepted but ignored.
    """
    contigSeq = refSeq[chr]
    refAlign = contigSeq[pos:pos + len(readSeq)]
    return refAlign, readSeq
|
||||
|
||||
def MappedReadFromSamRecord(ref, r):
    """Convert a low-level SAMRecord `r` into a MappedRead.

    This is a temp. function while we wait for Ted's C++ SAM/BAM io system
    to come online.
    """
    # BUG FIX: the original passed r.getCigar (the bound method object, not
    # its return value) as the cigar argument; latent only because
    # read2align currently ignores its cigar parameter.
    refAlign, readAlign = read2align(ref, r.getSeq(), r.getRname(), r.getPos(), r.getCigar())
    return MappedRead(r.getQName(), r.getFlags(), r.getRname(), r.getPos(), r.getMapq(),
                      r.getCigar(), r.getSeq(), refAlign, readAlign, r.getQuals())
|
||||
|
||||
class MappedRead(object):
    """Higher level object representing a mapped read.

    Attempts to be more accessible than a raw SAM record, doing a lot of processing on the record w.r.t.
    the reference to make analysis of the reads easier.  This is the fundamental data structure
    used to represent reads through the walker engine.
    """

    def __init__( self, qname, flags, chr, pos, mapq, cigar, readBases, refAlign, readAlign, qualsByBase ):
        # basic information about the read
        self.qname = qname
        self.mapq = mapq
        self.cigar = cigar
        self.bases = readBases      # actual bases of reads, unaligned, may be fw or reverse
        self.quals = qualsByBase    # quality scores, by base position

        # position information: inclusive [start, end] span on the contig,
        # derived from the reference alignment length
        self.chr = chr
        self.start = pos
        self.readlen = len(readBases)
        self.end = pos + len(refAlign) - 1

        # strand / pairing flags
        self.flags = flags
        # BUG FIX: per the SAM spec, flag 0x10 (SAM_QUERYSTRAND) set means
        # the read maps to the REVERSE strand; the original had these two
        # assignments inverted.
        self.isForwardStrand = not SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        self.isReverseStrand = SAMFlagIsSet(flags, SAM_QUERYSTRAND)

        # BUG FIX: SAM_NOTPRIMARY set means the alignment is NOT primary;
        # the original stored the un-negated test in isPrimary.
        self.isPrimary = not SAMFlagIsSet(flags, SAM_NOTPRIMARY)
        self.isUnmapped = SAMFlagIsSet(flags, SAM_UNMAPPED)
        self.isPaired = SAMFlagIsSet(flags, SAM_SEQPAIRED)
        self.isMapPaired = SAMFlagIsSet(flags, SAM_MAPPAIRED)
        self.isFirstReadInPair = SAMFlagIsSet(flags, SAM_ISFIRSTREAD)
        self.isSecondReadInPair = SAMFlagIsSet(flags, SAM_ISSECONDREAD)

        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATEUNMAPPED)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATESTRAND)

        # aligned sequences; everything will be reverse complemented already
        self.refAlign = refAlign
        self.readAlign = readAlign    # all should be forward strand

    def __str__(self):
        return '<MappedRead %s:%d-%d>' % (self.chr, self.start, self.end)

    __repr__ = __str__
|
||||
|
|
@ -1,754 +0,0 @@
|
|||
#!/util/bin/python
|
||||
|
||||
from optparse import OptionParser
|
||||
import os
|
||||
import sys
|
||||
#import set
|
||||
import random
|
||||
import string
|
||||
|
||||
from SAM import *
|
||||
|
||||
def all(iterable):
    """Backport of the builtin all(): True iff no element is falsy
    (short-circuits on the first falsy element; True for an empty iterable)."""
    for item in iterable:
        if not item:
            return False
    return True
|
||||
|
||||
def any(iterable):
    """Backport of the builtin any(): True iff some element is truthy
    (short-circuits on the first truthy element; False for an empty iterable)."""
    for item in iterable:
        if item:
            return True
    return False
|
||||
|
||||
# Global command-line options (optparse Values object; presumably populated
# by a main()/parse_args() outside this chunk -- confirm)
OPTIONS = None

# the four DNA bases, lowercase; used both to generate random mutations and
# to reject reference windows containing non-ACGT characters
bases = set(['a', 't', 'g', 'c'])
|
||||
|
||||
class Mutation:
    """Representation of a change from one sequence to another:
    a SNP, an insertion, or a deletion at a contig position."""

    SNP = "SNP"
    INSERTION = "Insertion"
    DELETION = "Deletion"
    TYPES = set([SNP, INSERTION, DELETION])

    # CIGAR operation character for each mutation type
    CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }

    def __init__( self, contig, pos, type, original, mutated ):
        # example: chr20 123 SNP "a" "t"        <- a to t at base 123 on chr20
        # example: chr20 123 INSERTION "" "atg" <- insertion of 3 bases starting at pos 123
        # example: chr20 123 DELETION "atg" ""  <- deletion of 3 bases starting at pos 123
        self.contig = contig        # contig of the mutation
        self._pos = pos             # position of the change in contig coordinates
        self._type = type           # one of the TYPES
        if self._type not in Mutation.TYPES:
            # FIX: was a bare assert, which is stripped under `python -O`;
            # raise so the validation always runs
            raise ValueError('Unexpected mutation type: %r' % (type,))
        self.original = original    # String of the original genotype
        self.mutated = mutated      # String of the mutated genotype

    def pos(self, offset=0):
        """Position in contig coordinates, shifted left by `offset`."""
        return self._pos - offset

    def type(self):
        return self._type

    def __len__(self):
        # length of the event: whichever side of the change is longer
        return max( len(self.mutated), len(self.original) )

    def mutString( self ):
        """Short human-readable description, e.g. 'P:a->t', 'I:atg', 'D:atg'."""
        if self._type == Mutation.SNP:
            return 'P:%s->%s' % (self.original, self.mutated)
        elif self._type == Mutation.INSERTION:
            return 'I:%s' % (self.mutated)
        elif self._type == Mutation.DELETION:
            return 'D:%s' % (self.original)
        else:
            raise Exception('Unexpected mutation type ' + self._type)

    def CIGARChar( self ):
        """CIGAR op character for this mutation type (M/I/D)."""
        return Mutation.CIGARChars[self.type()]

    def __str__( self ):
        return '%s:%d %s' % (self.contig, self._pos, self.mutString())

    def __repr__( self ):
        return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())

    def execute( self, offset, refseq, mutseq ):
        """Apply this mutation in place to the mutable sequences refseq and
        mutseq; `offset` maps contig coordinates to sequence indices.
        Returns the (refseq, mutseq) pair."""
        p = self.pos(offset)
        if self.type() == Mutation.SNP:
            mutseq[p] = self.mutated
        elif self.type() == Mutation.INSERTION:
            # pad the reference with gap characters, splice the inserted
            # bases into the mutant (FIX: was py2-only string.join)
            refseq[p:p] = '-' * len(self)
            mutseq[p:p] = self.mutated
        elif self.type() == Mutation.DELETION:
            # gap out the deleted span in the mutant, then pad both
            # sequences on the right with the deleted bases (the caller
            # stores them in `mutated` for deletions) to keep lengths even
            mutseq[p:p+len(self)] = '-' * len(self)
            refseq.extend(self.mutated)
            mutseq.extend(self.mutated)

        return refseq, mutseq
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Code for dealing with CIGAR strings
|
||||
#
|
||||
# ---------------------------------------------------------------------------
|
||||
def flatten(l):
    """Recursively flatten nested lists/tuples into one flat list,
    preserving element order."""
    flat = []
    for item in l:
        flat += flatten(item) if isinstance(item, (list, tuple)) else [item]
    return flat
|
||||
|
||||
def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
    """Build a CIGAR string for a read of length `readLen` starting at
    `readStart` within an alignment window starting at `alignStart`, given
    the mutations applied to that window.

    Only two shapes are supported: a single indel mutation (M/I-or-D/M), or
    any number of SNPs (a plain 'NM' match).  Fixes vs the original:
    py2-only `<>` operator and string.join removed.
    """
    if len(mutations) == 1 and mutations[0].type() != Mutation.SNP:
        # single indel: split the read into match / indel / match segments
        mut = mutations[0]
        internalPos = mut.pos() - alignStart

        leftLen = max(internalPos - readStart, 0)
        mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
        rightLen = readLen - leftLen - mutLen

        pieces = [(leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M')]
        # drop zero-length segments (read may start/end inside the indel)
        return ''.join(str(n) + op for n, op in pieces if n > 0)

    if not all(mut.type() == Mutation.SNP for mut in mutations):
        # bail: mixed / multiple indels not implemented
        raise Exception('Currently only supports multiple SNPs CIGARS')

    # SNPs (or no mutations at all) leave the read a plain full-length match
    return str(readLen) + 'M'
|
||||
|
||||
# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ):
|
||||
# sortedMutations = sorted(mutations, key=Mutation.pos)
|
||||
# CIGAR = []
|
||||
# pos = readStart
|
||||
#
|
||||
# for mutation in sortedMutations:
|
||||
# internalMutationStart = mutation.pos() - refStart
|
||||
# if internalMutationStart >= readStart and internalMutationStart < readStop
|
||||
# delta = mutation.pos() - pos
|
||||
# if not OPTIONS.quiet:
|
||||
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
|
||||
# if mutation.type() <> Mutation.SNP and delta > 0:
|
||||
# CIGAR.append([delta, 'M'])
|
||||
# pos = mutation.pos()
|
||||
#
|
||||
# if mutation.type == Mutation.INSERTION:
|
||||
# CIGAR.append(['I', len(mutation)])
|
||||
# if mutation.type == Mutation.DELETION:
|
||||
# CIGAR.append(['D', len(mutation)])
|
||||
#
|
||||
# delta = refSeqPos + refLen - pos
|
||||
# if not OPTIONS.quiet:
|
||||
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
|
||||
# if delta > 0:
|
||||
# CIGAR.append([delta, 'M'])
|
||||
#
|
||||
# s = string.join(map( str, flatten(CIGAR) ), '')
|
||||
# print 'CIGAR:', CIGAR, s
|
||||
# return s
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# mutating the reference
|
||||
#
|
||||
# ---------------------------------------------------------------------------
|
||||
def executeMutations( mutations, seqIndex, refseq, mutseq ):
    """Apply each mutation in turn to the mutable sequences (refseq, mutseq),
    with `seqIndex` as the contig-coordinate offset of the sequences, and
    return the resulting pair.

    Fixed: the original computed an `internalSite` local that was never
    used; the dead code is removed.
    """
    for mutation in mutations:
        refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
    return refseq, mutseq
|
||||
|
||||
def isBadSequenceForSim(seq):
    """True if `seq` contains any character outside the lowercase DNA
    alphabet (`bases`), i.e. the window is unusable for simulation.

    Fixed: the verbose `if any(...): return True / else: return False` is
    collapsed to returning the predicate directly.
    NOTE(review): seq.tostring() implies a Biopython Seq-like object; that
    API is deprecated in modern Biopython (use str(seq)) -- confirm.
    """
    return any(b not in bases for b in seq.tostring().lower())
|
||||
|
||||
|
||||
def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
    """Cut a window of +/- nBasesToPad around mutSite out of the reference
    `ref` (a Biopython-style Seq -- it supports .tomutable()) and apply
    OPTIONS.mutationDensity random mutations of kind `mutType`
    ('NONE'/'SNP'/'INSERTION'/'DELETION'), always including one at mutSite.

    Returns (ok, mutations, refsub, mutseq); ok is False (with empty
    results) when the window runs off the contig start or contains
    non-ACGT bases.  `mutParams` is accepted but never read here --
    presumably reserved; confirm.
    """
    #print 'mutateReference'

    # start and stop of the window, in contig coordinates
    refStartIndex = mutSite - nBasesToPad + 1
    internalStartIndex = nBasesToPad - 1

    # indels use a window one base shorter on the right
    if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
        refStopIndex = mutSite + nBasesToPad
    else:
        refStopIndex = mutSite + nBasesToPad - 1

    # two independent mutable copies: refsub stays the (gapped) reference,
    # mutseq receives the mutations
    refsub = ref[refStartIndex : refStopIndex]
    mutseq = refsub.tomutable()
    refsub = refsub.tomutable()
    mutations = []

    def success():
        return True, mutations, refsub, mutseq
    def fail():
        return False, [], None, None

    if refStartIndex < 0:
        return fail()
    if isBadSequenceForSim(refsub):
        #print 'rejecting seq', refsub.tostring()
        #print map( lambda b: b not in bases, refsub.tostring().lower() )
        return fail()

    if not OPTIONS.quiet:
        print ref
        print refsub

    # candidate positions for the extra (density - 1) mutations
    otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
    # print 'otherSites', otherSites
    for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
        internalSite = site - refStartIndex
        if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
            pass
        elif mutType == 'SNP':
            # substitute a uniformly chosen different base
            mutBase = ref[site].lower()
            otherBases = bases - set(mutBase)
            otherBase = random.choice(list(otherBases))
            assert mutBase <> otherBase
            if not OPTIONS.quiet:
                print mutBase, ' => ', otherBase
            mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
        elif mutType == 'INSERTION':
            # random inserted sequence of OPTIONS.indelSize bases
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
            mutations.append( mutation )
        elif mutType == 'DELETION':
            # NOTE(review): `inSeq` is generated but unused here, and the
            # deleted bases are passed as the *mutated* argument (the
            # Mutation docstring's examples put them in *original*);
            # Mutation.execute's DELETION branch does read .mutated, so the
            # two are consistent with each other -- confirm intent.
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
            mutations.append( mutation )
        else:
            raise Exception('Unexpected mutation type', mutType)

    # process the mutations
    refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )

    return success()
|
||||
|
||||
# version 1.0 -- prototype
|
||||
# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad):
|
||||
# #print 'mutateReference'
|
||||
# refStartIndex = mutSite - nBasesToPad + 1
|
||||
# refStopIndex = mutSite + nBasesToPad
|
||||
# internalStartIndex = nBasesToPad - 1
|
||||
#
|
||||
# if refStartIndex < 0:
|
||||
# return False, None, None
|
||||
#
|
||||
# refsub = ref[refStartIndex : refStopIndex]
|
||||
# print ref
|
||||
# print refsub
|
||||
#
|
||||
# if mutType == 'SNP' or mutType == 'None':
|
||||
# #print ' In IF'
|
||||
# mutBase = ref[mutSite]
|
||||
# bases = set(['A', 'T', 'G', 'C'])
|
||||
# if mutBase in bases:
|
||||
# otherBases = bases - set(mutBase)
|
||||
# otherBase = random.choice(list(otherBases))
|
||||
# assert mutBase <> otherBase
|
||||
# mutseq = refsub.tomutable()
|
||||
#
|
||||
# if mutType == 'SNP':
|
||||
# print mutBase, ' => ', otherBase
|
||||
# mutseq[internalStartIndex] = otherBase
|
||||
#
|
||||
# print refsub
|
||||
# print mutseq
|
||||
# align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-'))
|
||||
# align.add_sequence("ref", refsub.tostring())
|
||||
# align.add_sequence("mut", mutseq.tostring())
|
||||
# print str(align)
|
||||
#
|
||||
# return True, refsub, mutseq
|
||||
#
|
||||
# return False, None, None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# sample reads from alignments
|
||||
#
|
||||
# ---------------------------------------------------------------------------
|
||||
def accumulateBases( mutSeq, start, readLen ):
|
||||
count = 0
|
||||
for stop in range(start, len(mutSeq)):
|
||||
if notDash(mutSeq[stop]):
|
||||
count += 1
|
||||
#print stop, count
|
||||
if count == readLen:
|
||||
break
|
||||
|
||||
stop += 1
|
||||
read = string.join(filter( notDash, mutSeq[start:stop] ), '')
|
||||
|
||||
return read, stop
|
||||
|
||||
# doesn't support paired reads
|
||||
#
|
||||
# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations):
|
||||
# #print 'REF:', refSeq.tostring()
|
||||
# #print 'MUT:', mutSeq.tostring()
|
||||
# #print ''
|
||||
#
|
||||
# # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
|
||||
# # right in the middle
|
||||
# lastGoodStartSite = readLen - 1
|
||||
# if not OPTIONS.quiet:
|
||||
# print refSeq.tostring(), 'ref'
|
||||
# print mutSeq.tostring(), 'mut'
|
||||
#
|
||||
# # we can potentially start at any site in mutSeq
|
||||
# nPotentialStarts = len(mutSeq) - readLen + 1
|
||||
# potentialStarts = range(nPotentialStarts)
|
||||
#
|
||||
# if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
|
||||
# if OPTIONS.uniqueReadsOnly:
|
||||
# starts = potentialStarts
|
||||
# else:
|
||||
# starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
|
||||
# else:
|
||||
# starts = random.sample(potentialStarts, nPotentialStarts)
|
||||
#
|
||||
# #print 'potentialStarts', potentialStarts
|
||||
# #print 'Starts', starts
|
||||
#
|
||||
# def sampleRead( start ):
|
||||
# if mutSeq[start] <> '-':
|
||||
# read, stop = accumulateBases( mutSeq, start, readLen )
|
||||
# remaining = len(mutSeq) - stop
|
||||
# refForRead = refSeq[start:stop]
|
||||
# #print 'XXXX', start, stop, refForRead, read
|
||||
# cigar = mutationsCIGAR( alignStart, readLen, mutations )
|
||||
# if not OPTIONS.quiet:
|
||||
# leftPads = string.join(start * ['-'], '')
|
||||
# rightPads = string.join(remaining * ['-'], '')
|
||||
# print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar
|
||||
# return filter(notDash, read), alignStart + start, cigar
|
||||
# else:
|
||||
# return False, False, False
|
||||
#
|
||||
# reads = map( sampleRead, starts )
|
||||
# return [read for read in reads if read[0] <> False]
|
||||
|
||||
class refSite:
|
||||
def __init__( self, refSeq, mutSeq, alignStart, mutations ):
|
||||
self.refSeq = refSeq
|
||||
self.mutSeq = mutSeq
|
||||
self.alignStart = alignStart
|
||||
self.mutations = mutations
|
||||
|
||||
def sample1Read( refSite, start, readLen, reverseComplementP = False ):
|
||||
if refSite.mutSeq[start] <> '-':
|
||||
read, stop = accumulateBases( refSite.mutSeq, start, readLen )
|
||||
|
||||
mutSubSeq = refSite.mutSeq[start:stop]
|
||||
if reverseComplementP:
|
||||
s = Seq( read, IUPAC.unambiguous_dna )
|
||||
#print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
|
||||
read = s.reverse_complement().tostring()
|
||||
mutSubSeq.complement()
|
||||
|
||||
remaining = len(refSite.mutSeq) - stop
|
||||
refForRead = refSite.refSeq[start:stop]
|
||||
#print 'XXXX', start, stop, refForRead, read
|
||||
cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
|
||||
leftPads = string.join(start * ['-'], '')
|
||||
rightPads = string.join(remaining * ['-'], '')
|
||||
ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
|
||||
return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
|
||||
else:
|
||||
return False, None, None, None, None
|
||||
|
||||
def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
|
||||
#print 'REF:', refSeq.tostring()
|
||||
#print 'MUT:', mutSeq.tostring()
|
||||
#print ''
|
||||
|
||||
# pairedSite, mutations2, refSeq2, mutSeq2 can all be None
|
||||
|
||||
refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
|
||||
refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )
|
||||
|
||||
#print refSite1.alignStart, refSite2.alignStart, pairedOffset
|
||||
|
||||
# we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
|
||||
# right in the middle
|
||||
lastGoodStartSite = readLen - 1
|
||||
if not OPTIONS.quiet:
|
||||
if OPTIONS.generatePairedEnds:
|
||||
r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
|
||||
print r.tostring(), 'ref'
|
||||
print r.complement().tostring(), 'ref, complement'
|
||||
else:
|
||||
print refSeq.tostring(), 'ref'
|
||||
print mutSeq.tostring(), 'mut'
|
||||
|
||||
# we can potentially start at any site in mutSeq
|
||||
nPotentialStarts = len(mutSeq) - readLen + 1
|
||||
potentialStarts = range(nPotentialStarts)
|
||||
#print 'potentialStarts', potentialStarts
|
||||
|
||||
if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
|
||||
if OPTIONS.uniqueReadsOnly:
|
||||
starts = potentialStarts
|
||||
else:
|
||||
starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
|
||||
else:
|
||||
starts = random.sample(potentialStarts, int(nReads))
|
||||
|
||||
#print 'Starts', starts
|
||||
|
||||
def sampleRead( start ):
|
||||
good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )
|
||||
|
||||
if OPTIONS.generatePairedEnds:
|
||||
good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
|
||||
if not OPTIONS.quiet:
|
||||
offsetDashes = string.join(['-'] * (pairedOffset), '')
|
||||
print
|
||||
print ppstr1 + offsetDashes, pos1, cigar1
|
||||
print offsetDashes + ppstr2, pos2, cigar2
|
||||
return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
|
||||
else:
|
||||
if not OPTIONS.quiet:
|
||||
print ppstr1, pos1, cigar1
|
||||
return [(read1, pos1, cigar1), (None, 0, None)]
|
||||
|
||||
reads = map( sampleRead, starts )
|
||||
return [read for read in reads if read[0][0] <> None]
|
||||
|
||||
def notDash( c ):
|
||||
return c <> '-'
|
||||
|
||||
def fakeQuals( seq ):
|
||||
return len(seq) * [30]
|
||||
#return range(len(seq))
|
||||
|
||||
def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
|
||||
rname = fastaID
|
||||
mapq = 40
|
||||
quals = fakeQuals( read )
|
||||
|
||||
qname = '%s:%d.read.%d' % (fastaID, siteID, readID)
|
||||
|
||||
if not OPTIONS.generatePairedEnds:
|
||||
# not in paired mode
|
||||
flags = []
|
||||
record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
|
||||
record2 = None
|
||||
else:
|
||||
flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
|
||||
insertSize = pos2 - pos - len(read)
|
||||
record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
|
||||
record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )
|
||||
|
||||
#print 'read1, read2', record1.getSeq(), record2.getSeq()
|
||||
|
||||
return [record1, record2]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# paired end reads
|
||||
#
|
||||
# ---------------------------------------------------------------------------
|
||||
def pairedReadSite( ref, leftMutSite, readLen ):
|
||||
pairedOffset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))
|
||||
|
||||
def fail(): return False, None
|
||||
|
||||
if pairedOffset < 0:
|
||||
return fail()
|
||||
|
||||
pairedSite = pairedOffset + leftMutSite + readLen
|
||||
#print 'pairedSite', pairedOffset, leftMutSite, pairedSite
|
||||
|
||||
if pairedSite + readLen >= len(ref):
|
||||
return fail()
|
||||
refsub = ref[pairedSite - readLen:pairedSite + readLen + 1]
|
||||
if isBadSequenceForSim( refsub ):
|
||||
return fail()
|
||||
|
||||
return True, pairedSite
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# build allowed regions and sampling starts from region
|
||||
#
|
||||
# ---------------------------------------------------------------------------
|
||||
def parseSamplingRange( arg, ref, readLen ):
|
||||
# returns a list of the allowed regions to sample in the reference
|
||||
|
||||
def one(x):
|
||||
elts = x.split()
|
||||
#print 'elts', elts
|
||||
if len(elts) > 0:
|
||||
elts1 = elts[0].split(':')
|
||||
#print 'elts1', elts1
|
||||
return map( int, elts1 )
|
||||
else:
|
||||
return False
|
||||
|
||||
if arg <> None:
|
||||
try:
|
||||
return [ one(arg) ]
|
||||
except:
|
||||
# try to read it as a file
|
||||
return filter( None, map( one, open(arg).readlines() ) )
|
||||
else:
|
||||
return [[0, len(ref.seq) - readLen]]
|
||||
|
||||
def sampleStartSites( sampleRanges, nsites ):
|
||||
print 'sampleStartSites', sampleRanges, nsites
|
||||
|
||||
# build a set of potential sites
|
||||
if len(sampleRanges) > 1:
|
||||
potentialSites = set()
|
||||
for start, end in sampleRanges:
|
||||
#print 'potentialSites', start, end, potentialSites
|
||||
potentialSites = potentialSites.union(set(xrange(start, end)))
|
||||
else:
|
||||
start, end = sampleRanges[0]
|
||||
potentialSites = xrange(start, end)
|
||||
|
||||
#print 'potentialSites', potentialSites
|
||||
|
||||
# choose sites from potentials
|
||||
if len(potentialSites) <= nsites:
|
||||
# tile every site
|
||||
sites = potentialSites
|
||||
else:
|
||||
# we need to sample from the potential sites
|
||||
sites = random.sample( potentialSites, nsites )
|
||||
|
||||
# print 'Sites', sites
|
||||
print 'Number of potential start sites:', len(potentialSites)
|
||||
print 'Number of start sites that will be generated:', len(sites)
|
||||
return sites
|
||||
|
||||
def readRef(referenceFasta):
|
||||
handle = open(referenceFasta)
|
||||
for seq_record in SeqIO.parse(handle, "fasta"):
|
||||
print seq_record.id
|
||||
print seq_record.name
|
||||
#print repr(seq_record.seq)
|
||||
print len(seq_record.seq)
|
||||
yield seq_record
|
||||
handle.close()
|
||||
|
||||
import os.path
|
||||
def outputFilename():
|
||||
root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0]
|
||||
if OPTIONS.generatePairedEnds:
|
||||
pairedP = 'Yes'
|
||||
else:
|
||||
pairedP = 'No'
|
||||
|
||||
params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \
|
||||
['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]]
|
||||
filename = root + '__' + string.join(map( lambda p: string.join(map(str, p),'.'), params), '_')
|
||||
root = os.path.join(OPTIONS.outputPrefix, filename)
|
||||
return root, root + '.sam'
|
||||
|
||||
def main():
|
||||
global OPTIONS, ROOT
|
||||
|
||||
usage = "usage: %prog [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-r", "--ref", dest="reference",
|
||||
type="string", default=None,
|
||||
help="Reference DNA seqeunce in FASTA format")
|
||||
parser.add_option("-o", "--outputPrefix", dest="outputPrefix",
|
||||
type="string", default='',
|
||||
help="Output Prefix SAM file")
|
||||
parser.add_option("-c", "--coverage", dest="coverage",
|
||||
type="string", default='1',
|
||||
help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.")
|
||||
parser.add_option("-n", "--nsites", dest="nSites",
|
||||
type=int, default=1,
|
||||
help="Number of sites to generate mutations in the reference")
|
||||
parser.add_option("-l", "--readlen", dest="readLen",
|
||||
type=int, default=36,
|
||||
help="Length of reads to generate")
|
||||
parser.add_option("-s", "--seed", dest="seed",
|
||||
type=int, default=123456789,
|
||||
help="Random number seed. If 0, then system clock is used")
|
||||
parser.add_option("-t", "--muttype", dest="mutationType",
|
||||
type="string", default='None',
|
||||
help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion")
|
||||
parser.add_option("-e", "--mutdensity", dest="mutationDensity",
|
||||
type=int, default=1,
|
||||
help="Number of mutations to introduce from the reference")
|
||||
parser.add_option("-x", "--range", dest="range",
|
||||
type="string", default=None,
|
||||
help="Sampling range restriction, in the form of x:y without a space")
|
||||
parser.add_option("-u", "--unique", dest="uniqueReadsOnly",
|
||||
action='store_true', default=True,
|
||||
help="Assumes that the user wants unique reads generated. If the coverage is greater than the number of unique reads at the site, the region is just tiled")
|
||||
parser.add_option("-i", "--indelsize", dest="indelSize",
|
||||
type=int, default=0,
|
||||
help="Size in bp of the insertion or deletion to introduce")
|
||||
|
||||
# Paired end options
|
||||
parser.add_option("", "--paired", dest="generatePairedEnds",
|
||||
action='store_true', default=False,
|
||||
help="Should we generate paired end reads?")
|
||||
parser.add_option("", "--insertSize", dest="insertSize",
|
||||
type=int, default=280,
|
||||
help="Size in bp of the paired library insertion")
|
||||
parser.add_option("", "--insertDev", dest="insertDev",
|
||||
type=int, default=5,
|
||||
help="Standard deviation in the size in bp of the paired library insertion")
|
||||
|
||||
parser.add_option("-q", "--quiet", dest="quiet",
|
||||
action='store_true', default=False,
|
||||
help="Be quiet when generating output")
|
||||
parser.add_option("-d", "--debug", dest="debug",
|
||||
action='store_true', default=False,
|
||||
help="Verbose debugging output?")
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 0:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
root, outputSAM = outputFilename()
|
||||
|
||||
if OPTIONS.seed == 0:
|
||||
random.seed(None)
|
||||
else:
|
||||
random.seed(OPTIONS.seed)
|
||||
|
||||
OPTIONS.mutationType = OPTIONS.mutationType.upper()
|
||||
|
||||
print '------------------------------------------------------------'
|
||||
print 'program:', sys.argv[0]
|
||||
print ''
|
||||
print ' ref: ', OPTIONS.reference
|
||||
print ' output: ', outputSAM
|
||||
print ''
|
||||
print ' mutation type: ', OPTIONS.mutationType
|
||||
print ' mutation density: ', OPTIONS.mutationDensity
|
||||
print ' indel size: ', OPTIONS.indelSize
|
||||
print ' readLen: ', OPTIONS.readLen
|
||||
print ' nSites: ', OPTIONS.nSites
|
||||
print ' coverage: ', OPTIONS.coverage
|
||||
print ' range restriction: ', OPTIONS.range
|
||||
print ''
|
||||
print ' paired ends?: ', OPTIONS.generatePairedEnds
|
||||
print ' insert size: ', OPTIONS.insertSize
|
||||
print ' insert stddev: ', OPTIONS.insertDev
|
||||
print ''
|
||||
print ' Debugging?: ', OPTIONS.debug
|
||||
print '------------------------------------------------------------'
|
||||
|
||||
if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1:
|
||||
raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType)
|
||||
|
||||
readLen = OPTIONS.readLen
|
||||
fastaRecords = [seq for seq in readRef(OPTIONS.reference)]
|
||||
header = SAMHeader( fastaRecords[0].id, len(fastaRecords[0].seq) )
|
||||
SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug )
|
||||
|
||||
mutationsout = open(root + '.mutations.txt', 'w')
|
||||
qualsout = open(root + '.quals.txt', 'w')
|
||||
refout = open(root + '.ref', 'w')
|
||||
|
||||
fastaout = open(root + '.fasta', 'w')
|
||||
if OPTIONS.generatePairedEnds:
|
||||
fastaout2 = open(root + '.pairs.fasta', 'w')
|
||||
qualsout2 = open(root + '.pairs.quals.txt', 'w')
|
||||
|
||||
counter = 0
|
||||
refLen = 0
|
||||
for ref in fastaRecords:
|
||||
refLen = len(ref.seq)
|
||||
|
||||
# write the crazy ref file info needed by samtools
|
||||
print >> refout, ref.id + '\t' + str(refLen)
|
||||
|
||||
sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen )
|
||||
sites = sampleStartSites( sampleRanges, OPTIONS.nSites )
|
||||
|
||||
for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ):
|
||||
#print 'Sampling site:', mutSite, refLen
|
||||
#mutSite = readLen-1
|
||||
nSitesSelected = max(int(round(len(sites)/100.0)), 1)
|
||||
#print siteCounter, len(sites), nSitesSelected)
|
||||
if siteCounter % nSitesSelected == 0:
|
||||
print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites))
|
||||
|
||||
good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen)
|
||||
|
||||
pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None
|
||||
if good and OPTIONS.generatePairedEnds:
|
||||
if OPTIONS.mutationType == 'SNP':
|
||||
pairedMutType = 'SNP'
|
||||
else:
|
||||
pairedMutType = 'NONE'
|
||||
|
||||
good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen )
|
||||
print 'pairedReadSite', good, pairedSite, good
|
||||
if good:
|
||||
good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen)
|
||||
|
||||
#print 'Good', good, mutations, refSeq, mutSeq
|
||||
if good:
|
||||
print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t')
|
||||
#print 'Mutations', mutations
|
||||
|
||||
refSeqPos = mutSite - readLen + 1
|
||||
readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2)
|
||||
for i in range(len(readPairs)):
|
||||
counter += 1
|
||||
|
||||
read1, read2 = readPairs[i]
|
||||
seq, pos, cigar = read1
|
||||
seq2, pos2, cigar2 = read2
|
||||
|
||||
# write out the sam and fasta files
|
||||
def write1( record, recordi ):
|
||||
if record <> None:
|
||||
SAMout.writeRecord( record )
|
||||
sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')]
|
||||
#print sequences
|
||||
|
||||
if recordi % 2 == 0:
|
||||
localFastaOut, localQualsOut = fastaout, qualsout
|
||||
else:
|
||||
localFastaOut, localQualsOut = fastaout2, qualsout2
|
||||
|
||||
SeqIO.write(sequences, localFastaOut, "fasta")
|
||||
print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ')
|
||||
|
||||
#print i, read1, read2
|
||||
records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 )
|
||||
map( write1, records, range( len(records) ) )
|
||||
|
||||
SAMout.close()
|
||||
qualsout.close()
|
||||
fastaout.close()
|
||||
refout.close()
|
||||
mutationsout.close()
|
||||
|
||||
if OPTIONS.generatePairedEnds:
|
||||
qualsout2.close()
|
||||
fastaout2.close()
|
||||
|
||||
# for record in SAMIO( outputSAM ):
|
||||
# print record
|
||||
|
||||
if __name__ == "__main__":
|
||||
import Bio
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio.Alphabet import IUPAC, Gapped
|
||||
main()
|
||||
|
||||
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import getopt, sys, os, string
|
||||
from farm_commands import *
|
||||
|
||||
#FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py"
|
||||
FastaQuals2Fastq_exe = "FastaQuals2Fastq.py"
|
||||
|
||||
def isFastaB(filename):
|
||||
"""Is the file a fastb file already?"""
|
||||
#print os.path.splitext(filename)
|
||||
return os.path.splitext(filename)[1] == '.fastb'
|
||||
|
||||
def readListOfLanes( listFile ):
|
||||
"""Simply reads a list of files to process from a file"""
|
||||
lines = map( string.split, map( string.strip, open(listFile).readlines() ) )
|
||||
return map( lambda x: x[0], lines )
|
||||
#return map( lambda x: x[0], lines ), map( lambda x: x[1], lines )
|
||||
|
||||
|
||||
def run_swmerlin(input_file, input_head, farm=""):
|
||||
run_merlin(input_file, input_head, farm, sw=True)
|
||||
|
||||
def run_merlin(input_file, input_head, farm="", sw=False):
|
||||
"sw = Merlin Smith-Waterman option"
|
||||
if isFastaB(input_file):
|
||||
input_fastb = input_file
|
||||
else:
|
||||
input_fastb = input_head+".fastb"
|
||||
if regenExistingFiles or not os.path.exists(input_fastb):
|
||||
cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
|
||||
if sw:
|
||||
output_head = input_head+".swmerlin"
|
||||
else:
|
||||
output_head = input_head+".merlin"
|
||||
cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head
|
||||
if sw:
|
||||
cmd_str += " SW=True"
|
||||
cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
|
||||
|
||||
USE_BATCH = False
|
||||
|
||||
def run_ilt(input_file, input_head, farm=""):
|
||||
#print 'isFastaB', input_file, isFastaB(input_file)
|
||||
if isFastaB(input_file):
|
||||
input_fastb = input_file
|
||||
else:
|
||||
input_fastb = input_head+".fastb"
|
||||
if regenExistingFiles or not os.path.exists(input_fastb):
|
||||
cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
|
||||
|
||||
output_head = input_head+".ilt"
|
||||
|
||||
if USE_BATCH:
|
||||
cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head
|
||||
cmd(cmd_str, False, input_head, just_print_commands=justPrintCommands)
|
||||
else:
|
||||
cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head
|
||||
cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
|
||||
|
||||
def run_BWA32(input_file, input_head, farm=""):
|
||||
run_BWA(input_file, input_head, farm, 32)
|
||||
|
||||
def run_BWA(fasta, input_head, farm="", seed='inf'):
|
||||
output_head = input_head+".bwa"
|
||||
quals = input_head+".quals.txt"
|
||||
fastq = input_head+".fastq"
|
||||
if regenExistingFiles or not os.path.exists(fastq) :
|
||||
cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
|
||||
cmd(cmd_str)
|
||||
if seed <> 'inf':
|
||||
output_head += str(seed)
|
||||
output_sam = output_head + ".sam"
|
||||
cmd_str = "bwahuman samse "+str(seed)+" "+fastq+" "+output_sam
|
||||
cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
|
||||
|
||||
def run_MAQ(input_fasta, head, farm=""):
|
||||
maq_exe = "/seq/dirseq/maq-0.7.1/maq"
|
||||
bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa"
|
||||
|
||||
fasta = input_fasta
|
||||
quals = head+".quals.txt"
|
||||
fastq = head+".fastq"
|
||||
if regenExistingFiles or not os.path.exists(fastq) :
|
||||
cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
|
||||
cmd(cmd_str)
|
||||
|
||||
bfq = head+".bfq"
|
||||
if regenExistingFiles or not os.path.exists(bfq):
|
||||
cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq, just_print_commands=justPrintCommands )
|
||||
|
||||
out_head = head+".maq"
|
||||
maq_out = out_head+".out.aln.map"
|
||||
cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq
|
||||
cmd(cmd_str, farm, out_head, just_print_commands=justPrintCommands)
|
||||
|
||||
def usage():
|
||||
print "Required arguments:"
|
||||
print " -i Input FASTA head (*.fasta, *.qualb)"
|
||||
print " OR"
|
||||
print " -d Directory to grab all FASTA files from"
|
||||
print " OR"
|
||||
print " -l List of FASTA/FASTB files to process"
|
||||
print
|
||||
print "Optional arguments:"
|
||||
print " -f QUEUE Farm jobs to QUEUE on LSF"
|
||||
print
|
||||
print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)"
|
||||
print
|
||||
print " -x Don't execute commands, just print them"
|
||||
print
|
||||
print " -w Output files to current directory (strip path from input file/dir/list"
|
||||
print
|
||||
|
||||
|
||||
def get_all_fasta_files(fasta_dir):
|
||||
files = os.listdir(fasta_dir)
|
||||
if not fasta_dir.endswith("/"): fasta_dir += "/"
|
||||
fasta_files = [fasta_dir+f for f in files if f.endswith(".fasta") and os.path.getsize(fasta_dir+f) > 0]
|
||||
#print fasta_files
|
||||
return fasta_files
|
||||
|
||||
justPrintCommands = False
|
||||
regenExistingFiles = False
|
||||
|
||||
if __name__ == "__main__":
|
||||
opts = None
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw:r", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "regenExistingFiles", "outputInWorkingDirectory"])
|
||||
except getopt.GetoptError:
|
||||
print sys.argv
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
input_head = ""
|
||||
fasta_dir = ""
|
||||
mapper_str = "all"
|
||||
farm_sub = False
|
||||
listOfLanes = None
|
||||
outputInWorkingDirectory = False
|
||||
|
||||
for opt, arg in opts:
|
||||
print opt, arg
|
||||
if opt in ("-i", "--input"):
|
||||
input_head = arg
|
||||
if opt in ("-l", "--listOfLanes"):
|
||||
listOfLanes = arg
|
||||
if opt in ("-d", "--fasta_dir"):
|
||||
fasta_dir = arg
|
||||
if opt in ("-f", "--farm"):
|
||||
farm_sub = arg
|
||||
if opt in ("-r", "--regenExistingFiles"):
|
||||
regenExistingFiles = True
|
||||
if opt in ("-m", "--mapper"):
|
||||
mapper_str = arg
|
||||
if opt in ("-x", "--dontexe"):
|
||||
justPrintCommands = True
|
||||
if opt in ("-w", "--outputInWorkingDirectory"):
|
||||
outputInWorkingDirectory = True
|
||||
|
||||
if (input_head == "") and (fasta_dir == "") and (listOfLanes == None):
|
||||
print input_head, fasta_dir, listOfLanes
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
# Select function(s) for mapper
|
||||
mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ, "bwa":run_BWA, "bwa32":run_BWA32}
|
||||
if mapper_str.lower() == "all":
|
||||
mapper_list = mapper_func_list.values()
|
||||
else:
|
||||
mapper_list = [mapper_func_list.get(mapper_str.lower())]
|
||||
if mapper_list == [None]:
|
||||
sys.exit("Don't know of mapper argument: "+mapper_str)
|
||||
|
||||
if input_head:
|
||||
input_heads = [None]
|
||||
input_files = [input_head + 'fasta']
|
||||
elif listOfLanes <> None:
|
||||
input_files = readListOfLanes(listOfLanes)
|
||||
input_heads = [None] * len(input_files)
|
||||
else:
|
||||
input_files = [file for file in get_all_fasta_files(fasta_dir)]
|
||||
input_heads = [None] * len(input_files)
|
||||
|
||||
for input_file, input_head in zip(input_files, input_heads):
|
||||
if input_head == None:
|
||||
file_head = os.path.splitext(input_file)[0]
|
||||
if outputInWorkingDirectory:
|
||||
file_head = os.path.split(file_head)[1]
|
||||
else:
|
||||
file_head = input_head
|
||||
for mapper in mapper_list:
|
||||
mapper( input_file, file_head, farm=farm_sub )
|
||||
print
|
||||
|
|
@ -1,92 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import getopt, sys, os, string
|
||||
from farm_commands import *
|
||||
|
||||
def picardCMD(name, **keywords ):
|
||||
cmd = name
|
||||
for key, value in keywords.iteritems():
|
||||
cmd += ' ' + key + "=" + str(value)
|
||||
return cmd
|
||||
|
||||
def spawnValidationJob( input_file, output_head, farm, maxErrors):
|
||||
validate_exe = "ValidateSAM"
|
||||
output_file = output_head + '.stdout'
|
||||
|
||||
if regenExistingFiles or not os.path.exists(output_file):
|
||||
cmd_str = picardCMD( validate_exe, I=input_file, M=maxErrors )
|
||||
if farm == "":
|
||||
cmd_str += " > " + output_file
|
||||
cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
|
||||
|
||||
def usage():
|
||||
print "Required arguments:"
|
||||
print " -d Directory to grab all sam/bam files from"
|
||||
print
|
||||
print "Optional arguments:"
|
||||
print " -f QUEUE Farm jobs to QUEUE on LSF"
|
||||
print
|
||||
print " -m MAXERRORS Maximum number of errors to detect before aborting"
|
||||
print
|
||||
|
||||
|
||||
def get_all_sam_files(dir):
|
||||
files = []
|
||||
|
||||
for dirpath, dirnames, filenames in os.walk(dir):
|
||||
for filename in filenames:
|
||||
base, ext = os.path.splitext(filename)
|
||||
if ext.lower() in ['.sam', '.bam']:
|
||||
files.append( os.path.join( dirpath, filename ) )
|
||||
#print filename, base, ext
|
||||
|
||||
return files
|
||||
|
||||
def output_filename( input_file ):
|
||||
parts = filter(lambda x: x.strip() <> '', input_file.split("/"))
|
||||
print parts
|
||||
return ".".join(parts) + ".validation"
|
||||
|
||||
justPrintCommands = False
|
||||
regenExistingFiles = False
|
||||
|
||||
if __name__ == "__main__":
|
||||
opts = None
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "d:f:m:r", ["dir","farm","maxErrors", "regenExistingFiles"])
|
||||
except getopt.GetoptError:
|
||||
print sys.argv
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
dir = ""
|
||||
mapper_str = "all"
|
||||
farm_sub = False
|
||||
maxErrors = 1000
|
||||
|
||||
for opt, arg in opts:
|
||||
print opt, arg
|
||||
if opt in ("-d", "--dir"):
|
||||
dir = arg
|
||||
if opt in ("-f", "--farm"):
|
||||
farm_sub = arg
|
||||
if opt in ("-m", "--maxErrors"):
|
||||
maxErrors = arg
|
||||
if opt in ("-r", "--regenExistingFiles"):
|
||||
regenExistingFiles = True
|
||||
if dir == "":
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
input_files = get_all_sam_files(dir)
|
||||
print 'Processing files: N=', len(input_files)
|
||||
for input_file in input_files:
|
||||
print ' ->', input_file
|
||||
|
||||
for input_file in input_files:
|
||||
output_file = output_filename( input_file )
|
||||
print input_file, "=>", output_file
|
||||
spawnValidationJob( input_file, output_file, farm_sub, maxErrors )
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
import getopt
|
||||
|
||||
defaultCommands = ['CountReads', 'Pileup']
|
||||
|
||||
def usage():
|
||||
print "Optional arguments:"
|
||||
print " -f QUEUE Farm jobs to QUEUE on LSF"
|
||||
print " -c cmd1,cmd2 Walkers to execute, otherwise", ' '.join(defaultCommands)
|
||||
print " -e Ignore existing files", ' '.join(defaultCommands)
|
||||
|
||||
if __name__ == "__main__":
|
||||
opts = None
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "f:c:a:e", ["farm", "commands", "args", "ignoreExistingFiles"])
|
||||
except getopt.GetoptError:
|
||||
print sys.argv
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
farm_sub = False
|
||||
commandsList = defaultCommands
|
||||
ignoreExistingFiles = False
|
||||
extraArgs = ''
|
||||
|
||||
for opt, arg in opts:
|
||||
if opt in ("-f", "--farm"):
|
||||
farm_sub = arg
|
||||
if opt in ("-c", "--commands"):
|
||||
commandsList = arg.split(',')
|
||||
if opt in ("-e", "--ignoreExistingFiles"):
|
||||
ignoreExistingFiles = True
|
||||
if opt in ("-a", "--args"):
|
||||
extraArgs = arg
|
||||
|
||||
directory = args[1]
|
||||
for line in open(args[0]):
|
||||
lineParts = line.split()
|
||||
lane = lineParts[0].strip()
|
||||
|
||||
ref = '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta'
|
||||
if ( len(lineParts) > 1 ):
|
||||
ref = lineParts[1].strip()
|
||||
|
||||
if not os.path.exists(lane):
|
||||
print 'Input SAM/BAM file: "', lane, '" does not exist, skipping...'
|
||||
continue
|
||||
|
||||
head, lane_filename = os.path.split(lane)
|
||||
filebase = os.path.splitext(lane_filename)[0]
|
||||
|
||||
# make the pileup
|
||||
# 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam
|
||||
# 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam
|
||||
|
||||
# convert the fasta
|
||||
for analysis in commandsList:
|
||||
output = os.path.join(directory, filebase + '.' + analysis + '.output')
|
||||
if ignoreExistingFiles or not os.path.exists(output):
|
||||
cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + lane + " -R " + ref + " -o " + output + " -l INFO " + extraArgs
|
||||
print cmd
|
||||
farm_commands.cmd(cmd, farm_sub, output)
|
||||
|
||||
|
||||
|
|
@ -1,201 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
import string
|
||||
|
||||
defaultCommands = ['CountReads', 'Pileup']
|
||||
|
||||
def indexBAM(reads, queue=None):
|
||||
cmd = "samtools index " + reads
|
||||
farm_commands.cmd(cmd, queue, None, just_print_commands=OPTIONS.justPrint)
|
||||
|
||||
def readIntervalFile(file):
|
||||
def notHeaderP(line):
|
||||
return line[0][0] <> '@'
|
||||
lines = filter(notHeaderP, map( string.split, [line for line in open(file)]))
|
||||
|
||||
byContig = {}
|
||||
for entry in lines:
|
||||
elt = [int(entry[1]), int(entry[2])]
|
||||
if entry[0] in byContig:
|
||||
byContig[entry[0]].append(elt)
|
||||
else:
|
||||
byContig[entry[0]] = [elt]
|
||||
|
||||
#print byContig
|
||||
return byContig
|
||||
|
||||
def isIntervalFile(filename):
|
||||
return os.path.splitext(filename)[1] == '.interval_list'
|
||||
|
||||
def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile):
|
||||
print 'Intervals', intervalMap
|
||||
|
||||
def maybeWrite1(line, offsetMap, out):
|
||||
debug = False
|
||||
parts = line.split()
|
||||
contig = parts[0]
|
||||
pos = int(parts[1])
|
||||
if debug: print 'pileup', contig, pos, line,
|
||||
|
||||
if contig in intervalMap:
|
||||
intervals = intervalMap[contig]
|
||||
|
||||
while (offsetMap[contig] < len(intervals)):
|
||||
offset = offsetMap[contig]
|
||||
#print intervals[offset]
|
||||
curStart, curStop = intervals[offset][0:2]
|
||||
if debug: print contig, pos, curStart, curStop
|
||||
if pos >= curStart:
|
||||
if pos <= curStop:
|
||||
#print line,
|
||||
print >> out, line,
|
||||
break
|
||||
else:
|
||||
if debug: print 'Progressing', contig, pos, curStart, curStop
|
||||
offsetMap[contig] += 1
|
||||
else:
|
||||
break
|
||||
return offsetMap
|
||||
|
||||
def allDone( offsetMap, intervalMap ):
|
||||
def oneDone( contig ):
|
||||
return offsetMap[contig] >= len(intervalMap[contig])
|
||||
return all( map(oneDone, offsetMap.iterkeys()) )
|
||||
|
||||
i = 0
|
||||
offsetMap = dict([(key, 0) for key in intervalMap.keys()])
|
||||
out = open(filteredPileupFile, 'w')
|
||||
for line in open(unfilteredPileupFile):
|
||||
offsetMap = maybeWrite1(line, offsetMap, out)
|
||||
i += 1
|
||||
if i % 10000 == 0 and allDone(offsetMap, intervalMap):
|
||||
break
|
||||
out.close()
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
global OPTIONS, ROOT
|
||||
|
||||
usage = "usage: %prog [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-f", "--file", dest="validateInfoFile",
|
||||
type="string", default=None,
|
||||
help="File name of the validation set")
|
||||
parser.add_option("-d", "--dataDir", dest="dataDir",
|
||||
type="string", default=None,
|
||||
help="Directory to the data files")
|
||||
parser.add_option("-o", "--outputDir", dest="outputDir",
|
||||
type="string", default=None,
|
||||
help="Directory to put the output")
|
||||
parser.add_option("-q", "--farmQueue", dest="farmQueue",
|
||||
type="string", default=None,
|
||||
help="Farm queue to submit jobs to. Leave blank for local processing")
|
||||
parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
|
||||
action='store_true', default=False,
|
||||
help="Ignores the existing files")
|
||||
parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
|
||||
action='store_true', default=False,
|
||||
help="If provided, all intermediate files (BAM and pileups) will be regenerated")
|
||||
parser.add_option("-n", "--dontValidate", dest="dontValidate",
|
||||
action='store_true', default=False,
|
||||
help="If provided, won't try to validate, just make sure the validation data files are ready")
|
||||
parser.add_option("-g", "--gatk", dest="gatkPath",
|
||||
type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar",
|
||||
help="Path to the gatk")
|
||||
parser.add_option("-x", "--args", dest="extraArgs",
|
||||
type="string", default="",
|
||||
help="Extra args to pass to the GATK")
|
||||
parser.add_option("-p", "--justPrint", dest="justPrint",
|
||||
action='store_true', default=False,
|
||||
help="Don't actually run GATK, just setup data files")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) != 0:
|
||||
parser.error("incorrect number of arguments")
|
||||
|
||||
if OPTIONS.validateInfoFile == None:
|
||||
parser.error("f option is required")
|
||||
if OPTIONS.dataDir == None:
|
||||
parser.error("d option is required")
|
||||
if OPTIONS.outputDir == None:
|
||||
parser.error("o option is required")
|
||||
|
||||
if not os.path.exists(OPTIONS.outputDir):
|
||||
os.mkdir(OPTIONS.outputDir)
|
||||
if not os.path.exists(OPTIONS.dataDir):
|
||||
os.mkdir(OPTIONS.dataDir)
|
||||
|
||||
for line in open(OPTIONS.validateInfoFile):
|
||||
if line.strip() == "" or line[0] == '#':
|
||||
# comment line
|
||||
continue
|
||||
|
||||
# line is of the format:
|
||||
# <reads.bam> <ref.fasta> <processingRegion>
|
||||
originalReads, ref, region = line.split()
|
||||
|
||||
originalReads = os.path.expanduser(originalReads)
|
||||
ref = os.path.expanduser(ref)
|
||||
|
||||
if not os.path.exists(originalReads) or not os.path.exists(ref):
|
||||
print 'Input files do not exist!', originalReads, ref
|
||||
sys.exit(1)
|
||||
|
||||
head, read_filename = os.path.split(originalReads)
|
||||
filebase = os.path.splitext(read_filename)[0]
|
||||
|
||||
reads = os.path.join(OPTIONS.dataDir, read_filename)
|
||||
readsIndex = reads + '.bai'
|
||||
subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam')
|
||||
pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup')
|
||||
validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output')
|
||||
|
||||
#print 'reads', reads
|
||||
if not os.path.exists(reads) or OPTIONS.rebuildAllFiles:
|
||||
farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads, just_print_commands=OPTIONS.justPrint)
|
||||
|
||||
if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles:
|
||||
indexBAM(reads)
|
||||
|
||||
if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
|
||||
if region == '*' or isIntervalFile(region):
|
||||
farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM, just_print_commands=OPTIONS.justPrint)
|
||||
else:
|
||||
cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
|
||||
farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
|
||||
|
||||
subBAMIndex = subBAM+'.bai'
|
||||
if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles:
|
||||
if region == '*' or isIntervalFile(region):
|
||||
farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex, just_print_commands=OPTIONS.justPrint)
|
||||
else:
|
||||
indexBAM(subBAM)
|
||||
|
||||
if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
|
||||
if isIntervalFile(region):
|
||||
filteredPileup = pileup + ".unfiltered"
|
||||
if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles:
|
||||
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup
|
||||
#farm_commands.cmd(cmd, None, None)
|
||||
filterByInterval(readIntervalFile(region), filteredPileup, pileup)
|
||||
else:
|
||||
cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
|
||||
farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
|
||||
|
||||
if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles):
|
||||
print validationOutput, 'does not exist'
|
||||
analysis = "ValidatingPileup"
|
||||
cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs
|
||||
|
||||
if isIntervalFile(region):
|
||||
cmd += " --intervals " + region
|
||||
|
||||
cmd += " -B pileup,SAMPileup," + pileup
|
||||
print cmd
|
||||
farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
#!/util/bin/python
|
||||
|
||||
import Walker
|
||||
from SimpleSAM import *
|
||||
|
||||
class WalkLociTest(object, Walker.Walker):
|
||||
def __init__(self):
|
||||
Walker.Walker.__init__( self, 'byLoci', {}, name = 'WalkLociTest' )
|
||||
self.debug = False
|
||||
|
||||
def filterfunc( self, *data ):
|
||||
return True
|
||||
#return datum.getPos() % 2 == 0
|
||||
|
||||
def mapfunc( self, ref, chrom, pos, offsets, reads ):
|
||||
|
||||
if self.debug:
|
||||
print '------------------------------'
|
||||
print ' locus', chrom, pos
|
||||
print ' offsets = ', offsets
|
||||
print ' reads = ', reads
|
||||
|
||||
#
|
||||
# This is the heart of the test walker. You've got the reference, contig,
|
||||
# pos, offsets, and reads. The line below generate pile-ups
|
||||
#
|
||||
def getField(field):
|
||||
return map( lambda r, o: r.__getattribute__(field)[o], reads, offsets )
|
||||
|
||||
def toASCII33( quals ):
|
||||
return ''.join( map( lambda q: chr(q+33), quals ))
|
||||
|
||||
if False:
|
||||
# actually do some useful work
|
||||
refbase = ref[chrom][pos-1]
|
||||
bases = ''.join(getField('bases'))
|
||||
quals = toASCII33(getField('quals'))
|
||||
print chrom, pos, refbase, len(reads), all(map(lambda b: b == refbase, bases)), bases, quals
|
||||
|
||||
if self.debug:
|
||||
for offset, read in zip( offsets, reads ):
|
||||
if self.debug:
|
||||
print ' offset, read', offset, read
|
||||
print read.bases[offset], read.quals[offset]
|
||||
|
||||
#print datum
|
||||
return 1
|
||||
#return len(datum.getSeq())
|
||||
#return datum.getSeq() + '/'
|
||||
#return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())
|
||||
|
||||
#reduceDefault = ''
|
||||
def reducefunc( self, x, sum ):
|
||||
#print x
|
||||
return x + sum
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
class Walker:
|
||||
"""Represents a functional walking object to be applied to SAM records.
|
||||
|
||||
Actually useful programs will inherit from this walker object to implement
|
||||
one or more of the blank operators provided by this module. Effectly, this
|
||||
engine applies a walker object in a standard order to each record in a SAM/BAM
|
||||
file:
|
||||
|
||||
walker = MyWalker(args)
|
||||
samRecords = parseReadsInFiles( files ):
|
||||
results = []
|
||||
|
||||
foreach record in files:
|
||||
if walker.filterfunc( record ):
|
||||
x = walker.mapfunc( record )
|
||||
results.append(x)
|
||||
reduced = reduce( walker.reducefunc, results, walker.reduceDefault )
|
||||
|
||||
|
||||
where:
|
||||
|
||||
filterfunc( data ) -> returns true or false if the data should be
|
||||
included or excluded from mapping
|
||||
|
||||
mapfunc( data ) -> applied to each record, locus, or pair of records, depending
|
||||
on the walker type
|
||||
|
||||
reducefunc( x, sum ) -> combines partial results x with the accumulating
|
||||
results sum across all the data
|
||||
|
||||
"""
|
||||
def __init__(self, walkerType, options, name = None, desc = None ):
|
||||
self.options = options
|
||||
self.setOption( 'walkerType', walkerType )
|
||||
self.setOption( 'name', name )
|
||||
self.setOption( 'desc', desc )
|
||||
|
||||
print self.options
|
||||
|
||||
reduceDefault = 0
|
||||
|
||||
#
|
||||
# walker types
|
||||
#
|
||||
def isRecordWalker( self ):
|
||||
return self.getOption('walkerType') == 'byRecord'
|
||||
def isPairedRecordWalker( self ):
|
||||
return self.getOption('walkerType') == 'byRecordPairs'
|
||||
def isLociWalker( self ):
|
||||
return self.getOption('walkerType') == 'byLoci'
|
||||
|
||||
#
|
||||
# Option processing
|
||||
#
|
||||
def getOption( self, flag ):
|
||||
if self.hasOption( flag ):
|
||||
return self.options[flag]
|
||||
else:
|
||||
return None
|
||||
|
||||
def setOption( self, flag, value ):
|
||||
self.options[flag] = value
|
||||
|
||||
def hasOption(self, flag):
|
||||
return flag in self.options
|
||||
|
||||
def optionEnabled(self, flag):
|
||||
return flag in self.options and self.options[flag] <> False
|
||||
|
||||
def optionDisabled(self, flag):
|
||||
return not self.optionEnabled(flag)
|
||||
|
||||
#
|
||||
# Default functions
|
||||
#
|
||||
def filterfunc( self, datum ):
|
||||
return True
|
||||
|
||||
def mapfunc( self, datum ):
|
||||
return datum
|
||||
|
||||
def reducefunc( self, mapResult, sum ):
|
||||
return sum + 1
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import string, sys
|
||||
|
||||
class aln_record:
|
||||
"Stores one record of data from a MAQ .aln.txt file"
|
||||
field_names = (
|
||||
"read name",
|
||||
"chromosome",
|
||||
"position",
|
||||
"strand",
|
||||
"insert size from the outer coorniates of a pair",
|
||||
"paired flag",
|
||||
"mapping quality",
|
||||
"single-end mapping quality",
|
||||
"alternative mapping quality",
|
||||
"number of mismatches of the best hit",
|
||||
"sum of qualities of mismatched bases of the best hit",
|
||||
"number of 0-mismatch hits of the first 24bp",
|
||||
"number of 1-mismatch hits of the first 24bp on the reference",
|
||||
"length of the read",
|
||||
"read sequence",
|
||||
"sequence quality"
|
||||
)
|
||||
max_field_name_len = max(map(len, field_names))
|
||||
|
||||
def __init__(self, obj, parse = True):
|
||||
self.fields = []
|
||||
if type(obj) == str:
|
||||
self.setValuesFromString( obj, parse );
|
||||
else:
|
||||
raise TypeError("aln_record did not recognize type: "+str(type(obj)))
|
||||
|
||||
def setValuesFromString( self, line, parse = True ):
|
||||
if parse:
|
||||
formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
|
||||
self.fields = map( lambda f, v: f(v), formats, line.split() )
|
||||
else:
|
||||
self.fields = line.split()
|
||||
|
||||
def __str__(self):
|
||||
s = ""
|
||||
for n,v in zip(aln_record.field_names, self.fields):
|
||||
s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
|
||||
return s
|
||||
|
||||
#return string.join( map( str, self.fields ), ' ')
|
||||
|
||||
def id(self): return self.fields[0]
|
||||
def contig(self): return self.fields[1]
|
||||
|
||||
def offset(self): return self.fields[2]-1
|
||||
def pos(self): return self.fields[2]
|
||||
|
||||
# Quality of read mapping (only maq gives this field)
|
||||
def map_qual(self): return self.fields[6]
|
||||
|
||||
#def offset_end(self): return self.fields[8]
|
||||
#def pos_end(self): return self.fields[8]+1
|
||||
|
||||
#def linear_start(self): return self.fields[9]
|
||||
|
||||
|
||||
class aln_file:
|
||||
def __init__(self, filename, parse=True):
|
||||
self.filename = filename
|
||||
self.parse = parse
|
||||
self.faln = open(self.filename)
|
||||
|
||||
def RecordGenerator(self):
|
||||
for line in self.faln:
|
||||
yield aln_record(line, self.parse)
|
||||
raise StopIteration
|
||||
|
||||
def __iter__(self):
|
||||
return self.RecordGenerator()
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print "To test aln_file class:\naln_file.py ALN_FILE"
|
||||
else:
|
||||
count = 0
|
||||
for aln in aln_file(sys.argv[1]):
|
||||
print aln
|
||||
count += 1
|
||||
#if count > 5:
|
||||
# break
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import string, sys
|
||||
|
||||
class aln_record:
|
||||
"Stores one record of data from a MAQ .aln.txt file"
|
||||
field_names = (
|
||||
"read name",
|
||||
"chromosome",
|
||||
"position",
|
||||
"strand",
|
||||
"insert size from the outer coorniates of a pair",
|
||||
"paired flag",
|
||||
"mapping quality",
|
||||
"single-end mapping quality",
|
||||
"alternative mapping quality",
|
||||
"number of mismatches of the best hit",
|
||||
"sum of qualities of mismatched bases of the best hit",
|
||||
"number of 0-mismatch hits of the first 24bp",
|
||||
"number of 1-mismatch hits of the first 24bp on the reference",
|
||||
"length of the read",
|
||||
"read sequence",
|
||||
"sequence quality"
|
||||
)
|
||||
max_field_name_len = max(map(len, field_names))
|
||||
|
||||
def __init__(self, obj, parse = True):
|
||||
self.fields = []
|
||||
if type(obj) == str:
|
||||
self.setValuesFromString( obj, parse );
|
||||
else:
|
||||
raise TypeError("aln_record did not recognize type: "+str(type(obj)))
|
||||
|
||||
def setValuesFromString( self, line, parse = True ):
|
||||
if parse:
|
||||
formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
|
||||
self.fields = map( lambda f, v: f(v), formats, line.split() )
|
||||
else:
|
||||
self.fields = line.split()
|
||||
|
||||
def __str__(self):
|
||||
s = ""
|
||||
for n,v in zip(aln_record.field_names, self.fields):
|
||||
s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
|
||||
return s
|
||||
|
||||
#return string.join( map( str, self.fields ), ' ')
|
||||
|
||||
def id(self): return self.fields[0]
|
||||
def contig(self): return self.fields[1]
|
||||
|
||||
def offset(self): return self.fields[2]-1
|
||||
def pos(self): return self.fields[2]
|
||||
|
||||
# Quality of read mapping (only maq gives this field)
|
||||
def map_qual(self): return self.fields[6]
|
||||
|
||||
#def offset_end(self): return self.fields[8]
|
||||
#def pos_end(self): return self.fields[8]+1
|
||||
|
||||
#def linear_start(self): return self.fields[9]
|
||||
|
||||
|
||||
class aln_file:
|
||||
def __init__(self, filename, parse=True):
|
||||
self.filename = filename
|
||||
self.parse = parse
|
||||
self.faln = open(self.filename)
|
||||
|
||||
def RecordGenerator(self):
|
||||
for line in self.faln:
|
||||
yield aln_record(line, self.parse)
|
||||
raise StopIteration
|
||||
|
||||
def __iter__(self):
|
||||
return self.RecordGenerator()
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print "To test aln_file class:\naln_file.py ALN_FILE"
|
||||
else:
|
||||
count = 0
|
||||
for aln in aln_file(sys.argv[1]):
|
||||
print aln
|
||||
count += 1
|
||||
#if count > 5:
|
||||
# break
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from datetime import date
|
||||
import glob
|
||||
import operator
|
||||
|
||||
if __name__ == "__main__":
|
||||
usage = "usage: %prog files"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-s", "--size", dest="regionSize",
|
||||
type="int", default=None, help="")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
|
||||
def extract(line):
|
||||
s = line.split()
|
||||
t = s[0].split(":")
|
||||
if len(t) == 2:
|
||||
chr, pos = t
|
||||
return chr, int(pos)
|
||||
else:
|
||||
return None, None
|
||||
|
||||
for file in args:
|
||||
chr, lastPos, startPos, calledBases, progress = None, None, None, None, None
|
||||
lastLine = None
|
||||
for line in open(file):
|
||||
if startPos == None:
|
||||
chr, startPos = extract(line)
|
||||
lastLine = line
|
||||
if lastLine <> None and startPos <> None:
|
||||
lastPos = extract(lastLine)[1]
|
||||
calledBases = lastPos - startPos
|
||||
progress = "%.2f" % (float(calledBases) / OPTIONS.regionSize * 100)
|
||||
print file, chr, startPos, lastPos, calledBases, progress
|
||||
|
|
@ -1,591 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import sys, string
|
||||
import os
|
||||
import re
|
||||
from itertools import *
|
||||
from optparse import OptionParser
|
||||
from memo import DiskMemoize, time_func
|
||||
|
||||
class ref_genome:
|
||||
"""Reads reference genome in FASTA format into a dict"""
|
||||
|
||||
def __init__(self, ref_genome_file):
|
||||
ref_genome.chr_offset = [[] for i in range(45)]
|
||||
chr_id = 0
|
||||
seq = ""
|
||||
for line in open(ref_genome_file):
|
||||
if line.startswith(">"):
|
||||
print line[1:],
|
||||
if line.startswith(">chrM"): # skip first > line
|
||||
continue
|
||||
ref_genome.chr_offset[chr_id] = seq
|
||||
chr_id += 1
|
||||
seq = " " # make it 1 indexed instead of 0 indexed
|
||||
#if chr_id > 2:
|
||||
# break
|
||||
else:
|
||||
seq += line.rstrip().upper()
|
||||
ref_genome.chr_offset[chr_id] = seq
|
||||
|
||||
def __getitem__(self, key):
|
||||
return ref_genome.chr_offset[key]
|
||||
|
||||
AffyChr2Index = dict()
|
||||
for i in range(1,23):
|
||||
AffyChr2Index[str(i)] = i
|
||||
AffyChr2Index['MT'] = 0
|
||||
AffyChr2Index['X'] = 23
|
||||
AffyChr2Index['Y'] = 24
|
||||
|
||||
class GenotypeCall:
|
||||
#ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
|
||||
def __init__( self, chr, pos, genotype, snpP, lod ):
|
||||
self.chr = chr
|
||||
self.pos = int(pos)
|
||||
self._site = chr + ':' + str(self.pos)
|
||||
self._isSNP = snpP
|
||||
self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position
|
||||
self.lod = lod
|
||||
|
||||
def refbase(self):
|
||||
return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._site)
|
||||
|
||||
def __eq__(self, other):
|
||||
return self._site == other._site
|
||||
|
||||
def site(self): return self._site
|
||||
def isSNP(self): return self._isSNP
|
||||
|
||||
def ref_het_hom(self):
|
||||
if self.genotype[0] <> self.genotype[2]:
|
||||
return 1 # het(erozygous non-ref)
|
||||
else:
|
||||
# homozygous something
|
||||
if self.genotype[0] == self.refbase:
|
||||
return 0 # ref
|
||||
else:
|
||||
return 2 # hom(ozygous non-ref)
|
||||
|
||||
def isHET(self): return self.genotype[0] <> self.genotype[2]
|
||||
def isHOM(self): return self.genotype[0] == self.genotype[2]
|
||||
|
||||
def __str__(self):
|
||||
return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
|
||||
|
||||
MAQGenotypeEncoding = {
|
||||
'A' : ['A', 'A'],
|
||||
'C' : ['C', 'C'],
|
||||
'T' : ['T', 'T'],
|
||||
'G' : ['G', 'G'],
|
||||
"M" : ['A', 'C'],
|
||||
'K' : ['G', 'T'],
|
||||
'Y' : ['C', 'T'],
|
||||
'R' : ['A', 'G'],
|
||||
'W' : ['A', 'T'],
|
||||
'S' : ['C', 'G'],
|
||||
'D' : ['A', 'G', 'T'],
|
||||
'B' : ['C', 'G', 'T'],
|
||||
'H' : ['A', 'C', 'T'],
|
||||
'V' : ['A', 'C', 'G'],
|
||||
'N' : ['A', 'C', 'G', 'T'] }
|
||||
|
||||
MAQ2STDChr = dict()
|
||||
for i in range(1,23):
|
||||
MAQ2STDChr['chr'+str(i)] = str(i)
|
||||
MAQ2STDChr['chrM'] = 'MT'
|
||||
MAQ2STDChr['chrX'] = 'X'
|
||||
MAQ2STDChr['chrY'] = 'Y'
|
||||
|
||||
def convertMAQChr(maqChr):
|
||||
#print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr]
|
||||
if maqChr in MAQ2STDChr:
|
||||
return MAQ2STDChr[maqChr]
|
||||
else:
|
||||
return '?'
|
||||
|
||||
def convertMAQGenotype( oneBaseCode ):
|
||||
return MAQGenotypeEncoding[oneBaseCode]
|
||||
|
||||
def internalReadSNPFile( parse1, filename ):
|
||||
result = []
|
||||
snps_extracted = 0
|
||||
for snp in imap( parse1, open(filename) ):
|
||||
if snp:
|
||||
result.append(snp)
|
||||
snps_extracted += 1
|
||||
if snps_extracted > OPTIONS.debug_lines:
|
||||
break
|
||||
|
||||
print len(result),"genotypes extracted"
|
||||
return result
|
||||
|
||||
def snpMAP( snps ):
|
||||
#d = dict( map( lambda x: [x.site(), x], snps ) )
|
||||
d = dict()
|
||||
for snp in snps:
|
||||
d[snp.site()] = snp#d
|
||||
|
||||
#print 'snps', snps, d
|
||||
return d
|
||||
|
||||
def overlappingSites( snps1, snps2 ):
|
||||
map1 = snpMAP(snps1)
|
||||
map2 = snpMAP(snps2)
|
||||
shared = set(map1.keys()) & set(map2.keys())
|
||||
print 'Number of snp1 records', len(map1)
|
||||
print 'Number of snp2 records', len(map2)
|
||||
print 'Number of shared sites', len(shared)
|
||||
print "\n".join(map(str,snps1))
|
||||
return shared
|
||||
|
||||
def readMAQSNPs(filename):
|
||||
# Each line consists of:
|
||||
# chromosome
|
||||
# position
|
||||
# reference base
|
||||
# consensus base
|
||||
# Phred-like consensus quality
|
||||
# read depth
|
||||
# the average number of hits of reads covering this position
|
||||
# the highest mapping quality of the reads covering the position
|
||||
# the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
|
||||
# the second best call
|
||||
# log likelihood ratio of the second best and the third best call
|
||||
# and the third best call.
|
||||
#
|
||||
# Also, note that:
|
||||
#
|
||||
# What do those "S", "M" and so on mean in the cns2snp output?
|
||||
# They are IUB codes for heterozygotes. Briefly:
|
||||
#
|
||||
# M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
|
||||
def read1(line):
|
||||
formats = [str, int, str, str, int, int]
|
||||
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
|
||||
alignQual = vals[4]
|
||||
if alignQual >= (10*OPTIONS.lod):
|
||||
return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )
|
||||
else:
|
||||
#print 'Filtering', alignQual, vals
|
||||
return False
|
||||
|
||||
return internalReadSNPFile( read1, filename )
|
||||
|
||||
OPTIONS = None
|
||||
|
||||
def MerlinChr( index ):
|
||||
if index == 0:
|
||||
return 'MT'
|
||||
elif index == 23:
|
||||
return 'X'
|
||||
elif index == 24:
|
||||
return 'Y'
|
||||
else:
|
||||
return str(index)
|
||||
|
||||
def readMerlinSNPs(filename):
|
||||
# 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
|
||||
# 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
|
||||
# chr:pos ref genotype bestVsRef bestVsNextBest class ...
|
||||
def read1(line):
|
||||
formats = [lambda x: x.split(':'), str, sorted, float, float, str]
|
||||
vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
|
||||
bestVsRef, bestVsNext = vals[3:5]
|
||||
isSNP = vals[5].find('-SNP') <> -1
|
||||
if bestVsRef >= OPTIONS.lod and isSNP:
|
||||
return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )
|
||||
else:
|
||||
return False
|
||||
|
||||
return internalReadSNPFile( read1, filename )
|
||||
|
||||
def readSNPfile( filename, format ):
|
||||
formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs }
|
||||
if format.lower() in formats:
|
||||
return list(formats[format.lower()](filename))
|
||||
else:
|
||||
raise Exception('Unknown SNP file format ' + format)
|
||||
|
||||
def readAffyFile(filename):
|
||||
# chrom position genotype probe_set_id dbsnp_id
|
||||
# 1 84647761 TC SNP_A-1780419 rs6576700
|
||||
# 5 156323558 GG SNP_A-1780418 rs17054099
|
||||
def read1(line):
|
||||
formats = [str, int, sorted, str, str]
|
||||
vals = map( lambda f, x: f(x), formats, line.split() )
|
||||
|
||||
try:
|
||||
chr = str(int(vals[0]))
|
||||
except:
|
||||
chr = convertMAQChr(vals[0])
|
||||
#print 'CHR', chr, vals[0]
|
||||
return GenotypeCall( chr, vals[1], vals[2], False, 100 )
|
||||
|
||||
file = open(filename)
|
||||
file.readline() # skip header
|
||||
#affyData = map( read1, file )
|
||||
affyData = []
|
||||
for index, line in enumerate(file):
|
||||
affyData.append(read1(line))
|
||||
if index > OPTIONS.debug_lines:
|
||||
break
|
||||
if index % 10000 == 0:
|
||||
print index
|
||||
# Give a chance to use list before creating dictionary
|
||||
return affyData
|
||||
|
||||
#print "1111111"
|
||||
#return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )
|
||||
|
||||
def equalSNPs( snp1, snp2 ):
    """Return True when the two calls carry the same genotype."""
    g1 = snp1.genotype
    g2 = snp2.genotype
    return g1 == g2
|
||||
|
||||
# def concordance( truthSet, testSet, includeVector = None ):
|
||||
# # calculates a bunch of useful stats about the two
|
||||
# # data genotype call sets above
|
||||
#
|
||||
# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
|
||||
# counts = dict(states)
|
||||
#
|
||||
# def incShared(state, equalP ):
|
||||
# counts[state] += 1
|
||||
# if equalP:
|
||||
# counts[state + '-equal'] += 1
|
||||
#
|
||||
# nTestSites = 0
|
||||
# for i, testSNP in izip( count(), testSet ):
|
||||
# if includeVector <> None and not includeVector[i]:
|
||||
# # we are skiping this site
|
||||
# continue
|
||||
#
|
||||
# nTestSites += 1
|
||||
#
|
||||
# if testSNP.isSNP():
|
||||
# counts['test-snp'] += 1
|
||||
#
|
||||
# #print testSNP.site()
|
||||
# if testSNP.site() in truthSet:
|
||||
# truth = truthSet[testSNP.site()]
|
||||
# eql = equalSNPs( testSNP, truth )
|
||||
#
|
||||
# incShared( 'shared', eql )
|
||||
# if testSNP.isSNP():
|
||||
# if truth.isHOM(): incShared( 'hom-snp', eql )
|
||||
# else: incShared( 'het-snp', eql )
|
||||
# else:
|
||||
# if truth.isHOM(): incShared( 'hom-ref', eql )
|
||||
# else: incShared( 'het-ref', eql )
|
||||
#
|
||||
# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
|
||||
# #print nTestSites, nSharedSites, nEqualSites
|
||||
# print nTestSites, counts
|
||||
#
|
||||
# counts['tested'] = nTestedSites
|
||||
#
|
||||
# return counts
|
||||
|
||||
class ConcordanceData:
    """Running tally of genotype concordance between two call sets."""

    def __init__(self, name, file1count, file2count):
        self.name = name
        self.nFile1Sites = file1count   # total sites in file 1
        self.nFile2Sites = file2count   # total sites in file 2
        self.nSharedSites = 0           # pairs mapping to the same genomic position
        self.nEqualSites = 0            # shared pairs whose genotypes agree

    def inc( self, truthSNP, testSNP ):
        """Record one shared site; count it as equal when genotypes match."""
        self.nSharedSites += 1
        self.nEqualSites += 1 if equalSNPs(testSNP, truthSNP) else 0

    def rate(self):
        """Percent of shared sites with matching genotypes (0 when none shared)."""
        denom = max(self.nSharedSites, 1)  # avoid division by zero
        return 100.0 * self.nEqualSites / denom

    def __str__(self):
        return '%d %d %.2f' % (self.nSharedSites, self.nEqualSites, self.rate())
|
||||
|
||||
def concordance( truthSet, testSet, sharedSites = None ):
    """Compute concordance statistics between a truth set and a test call set.

    truthSet maps site -> truth GenotypeCall; testSet is iterated for test
    calls; sharedSites, when given, is a site collection used to split the
    tallies into shared vs. unique subsets.

    The two calls in main() work like this:
        affy, snp1, snp1_snp2_shared
        affy, snp2, snp1_snp2_shared

    Returns (nTestSites, allData, sharedData, uniqueData).
    """
    nTestSites = 0

    # Three tallies per call:
    # - allData:    every test site also present in the truth set
    allData = ConcordanceData('all', len(truthSet), len(testSet))
    # - sharedData: truth-matched sites that are also in sharedSites
    sharedData = ConcordanceData('shared', len(truthSet), len(testSet))
    # - uniqueData: truth-matched sites that are NOT in sharedSites
    uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))

    # Plain iteration replaces the Python-2-only izip(count(), testSet);
    # `<>` comparisons and print statements are also modernized.
    for testSNP in testSet:
        nTestSites += 1
        site = testSNP.site()  # hoisted: was computed up to three times
        if site in truthSet:
            truthSNP = truthSet[site]

            allData.inc(truthSNP, testSNP)
            if sharedSites is not None:
                if site in sharedSites:
                    sharedData.inc(truthSNP, testSNP)
                else:
                    uniqueData.inc(truthSNP, testSNP)

        if OPTIONS.verbose and nTestSites % 100000 == 0:
            print(nTestSites, allData, sharedData, uniqueData)

    return nTestSites, allData, sharedData, uniqueData
|
||||
|
||||
# def concordance( truthSet, testSet, includeVector = None ):
|
||||
# # calculates a bunch of useful stats about the two
|
||||
# # data genotype call sets above
|
||||
#
|
||||
# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
|
||||
# counts = dict(states)
|
||||
#
|
||||
# nTestSites = 0
|
||||
# nSharedSites = 0
|
||||
# nEqualSites = 0
|
||||
# for i, testSNP in izip( count(), testSet ):
|
||||
# nTestSites += 1
|
||||
# #print testSNP.site()
|
||||
# if testSNP.site() in truthSet:
|
||||
# nSharedSites += 1
|
||||
# if equalSNPs( testSNP, truthSet[testSNP.site()] ):
|
||||
# nEqualSites += 1
|
||||
# #else:
|
||||
# # print '~', testSNP, truthSet[testSNP.site()]
|
||||
# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
|
||||
# #print nTestSites, nSharedSites, nEqualSites
|
||||
# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
|
||||
#
|
||||
# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]
|
||||
|
||||
|
||||
def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
    """Pretty-print the tuple returned by concordance().

    results is (nTestSites, allData, sharedData, uniqueData).  The shared
    and unique breakdowns are printed only when hasSharedSites is true.
    """
    nTestSites, allData, sharedData, uniqueData = results

    def print1(data):
        # Dump one ConcordanceData section.
        print('------------------------------------------------------------')
        print('Concordance results', data.name, 'sites')
        print(' -> Genotype file1', filename1)
        print(' -> Genotype file2', filename2)
        print(' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites))
        print(' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites))
        print(' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites)))
        # BUG FIX: this line previously interpolated filename1 although the
        # message (and the denominator) refer to file2.
        print(' -> Percent sites in file2 shared (%s %s): %.2f' % (filename2, data.name, data.nSharedSites / (0.01 * data.nFile2Sites)))
        print(' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites))
        print(' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()))

    print1(allData)
    if hasSharedSites:
        print1( sharedData ) # shared between SNP1 and SNP2
        print1( uniqueData ) # unique to SNP1 or to SNP2 only
|
||||
|
||||
def dump_shared_snps( affys, snp_list1, snp_list2 ):
    # Write a 'snp.tab' table for every site called in BOTH SNP sets and
    # genotyped by Affy: per-site LODs, LOD ratio, the three genotypes,
    # per-caller discordance flags, and a combined "meta" call.
    # affys is a site->call dict; the SNP lists are mapped to site->call
    # dicts via snpMAP.
    print len(affys), len(snp_list1), len(snp_list2)
    snps1 = snpMAP(snp_list1)
    snps2 = snpMAP(snp_list2)
    # Site sets for the three inputs, with size diagnostics.
    snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)
    snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)
    affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)
    snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
    snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
    print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)
    print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)

    # NOTE(review): fsnp is never closed; rely on interpreter exit to flush.
    fsnp = open ("snp.tab","w")
    print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"
    for site in snp1and2_affy_sites:
        snp1 = snps1[site]
        snp2 = snps2[site]
        affy = affys[site]

        # Trailing commas keep all fields for a site on a single line.
        print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
        try:
            snp1div2 = snp1.lod / snp2.lod
        except ZeroDivisionError:
            # Sentinel ratio when snp2's LOD is zero.
            snp1div2 = 1000
        print >>fsnp, "%5.2f" % snp1div2,
        print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,
        # 1 = discordant with Affy (or between the two callers), 0 = concordant.
        print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),

        # Calculte meta_lod from the two lods: when the callers agree the
        # LODs add; otherwise the higher-LOD call wins.
        if snp1.genotype == snp2.genotype:
            meta_lod = snp1.lod + snp2.lod
            meta_genotype = snp1.genotype
        else:
            if snp1.lod > snp2.lod:
                meta_lod = snp1.lod
                meta_genotype = snp1.genotype
            else:
                meta_lod = snp2.lod
                meta_genotype = snp2.genotype
        meta_inc = meta_genotype != affy.genotype
        print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
        print >>fsnp, affy.ref_het_hom(),
        print >>fsnp, affy.refbase()
|
||||
|
||||
def intersection_union_snps( affy, snps1, snps2 ):
    # Scatter-plot the per-site LODs of the two SNP call sets against each
    # other (via rpy2/R), color-coding each shared Affy site by which of
    # the two callers matches the Affy genotype.
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    # Sites called by both SNP sets but absent from the Affy truth data.
    shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
    nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
    # Sites called by both SNP sets AND genotyped by Affy.
    shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())

    #shared = []
    #for site in shared_sites:
    #    shared.append((map1[site], map2[site]))
    #print "Shared:",len( shared )
    #shared_x = [s[0].lod for s in shared]
    #shared_y = [s[1].lod for s in shared]

    # (lod1, lod2) point buckets, one per concordance outcome.
    both_corr = []
    snp1_corr = []
    snp2_corr = []
    neither_corr = []
    # given two bools telling whether snp1 and snp2 are correct,
    # return the correct object
    #
    #             snp2 incorrect, snp2 correct
    truth_list = [[neither_corr, snp2_corr], # snp1 incorrect
                  [snp1_corr, both_corr]] # snp1 correct

    for site in shared_affy_sites:
        # bools index truth_list because False==0 and True==1.
        snp1_true = equalSNPs(map1[site], affy[site])
        snp2_true = equalSNPs(map2[site], affy[site])
        truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )

    print "Beginning plot..."
    import rpy2.robjects as robj
    robj.r('X11(width=15, height=15)')
    XY_MAX = 25
    # NOTE(review): the first `plots` assignment (which includes the
    # non-Affy points) is immediately overwritten by the second.
    plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    # Empty base plot (cex=0.0 draws no points); layers added below.
    robj.r.plot([0, XY_MAX], [0, XY_MAX], \
                xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
                ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
                main = "Shared SNP site LODs", \
                xlim = robj.FloatVector([0,XY_MAX]), \
                ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
                pch = 19, \
                cex = 0.0)
    for xy, color in plots:
        print "X"
        robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
        print "Color:",color
        print "Len:",len(xy)
        #print "\n".join(["%25s %25s" % pt for pt in xy])

    #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
    #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
    # Keep the X11 window open until the user acknowledges it.
    raw_input("Press enter to continue")

    return

    # NOTE(review): everything below the return above is unreachable dead
    # code (an older overlap summary); left in place pending cleanup.
    ss1 = set(snps1)
    ss2 = set(snps2)
    print "Shared:",len( ss1.intersection(ss2) )
    print "Snp1 only:",len( ss1.difference(ss2) )
    print "Snp2 only:",len( ss2.difference(ss1) )
    print "Snp1 total:",len( ss1 )
    print "Snp2 total:",len( ss2 )
|
||||
|
||||
def count_het_sites(snp_list):
    """Print the count and percentage of heterozygous calls in snp_list.

    Each element must provide an isHET() predicate.  An empty list prints
    a 0.0 percentage instead of raising ZeroDivisionError (bug fix).
    """
    total = len(snp_list)
    hets = sum(1 for snp in snp_list if snp.isHET())
    pct = 100.0 * hets / total if total else 0.0
    print(hets, "hets,", total, "total,", "%.1f" % pct)
|
||||
|
||||
def main(argv):
    # Driver: load an Affy truth file plus one or two SNP call files, dump
    # the shared-site table when two call sets are given, and print
    # concordance statistics for each call set against the truth data.
    # NOTE(review): `<>` and sys.maxint below are Python 2-only.
    global OPTIONS, ROOT

    usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
    parser = OptionParser(usage=usage)
    parser.add_option("-t", "--truth", dest="affy",
                      type="string", default=None,
                      help="Affy truth file")
    parser.add_option("-1", "--snp1", dest="snp1",
                      type="string", default=None,
                      help="Affy truth file")
    parser.add_option("-2", "--snp2", dest="snp2",
                      type="string", default=None,
                      help="Affy truth file")
    parser.add_option("", "--f1", dest="format1",
                      type="string", default=None,
                      help="File type of snpfile1")
    parser.add_option("", "--f2", dest="format2",
                      type="string", default=None,
                      help="File type of snpfile2")
    parser.add_option("-l", "--lod", dest="lod",
                      type="float", default=5.0,
                      help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
    parser.add_option("-v", "--verbose", dest="verbose",
                      action='store_true', default=False,
                      help="Verbose output")
    parser.add_option("-d", "--debug_lines", dest="debug_lines",
                      type='float', default=sys.maxint,
                      help="Number of input data lines to process for debugging")

    (OPTIONS, args) = parser.parse_args()
    # All inputs come via options; positional arguments are an error.
    if len(args) != 0:
        parser.error("incorrect number of arguments")

    if OPTIONS.affy == None:
        parser.error("No affy data specified")

    if OPTIONS.format1 == None:
        parser.error("First format cannot be none")

    if OPTIONS.snp2 <> None and OPTIONS.format2 == None:
        parser.error("snp2 file given but format was not specified")

    # Load reference genome
    #ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
    #sys.exit()

    if 1:
        # Read the truth data (timed) and index it by genomic site.
        print "Reading Affy truth data..."
        #readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )
        readAffyFile2 = time_func(readAffyFile)
        affy_list = readAffyFile2( filename=OPTIONS.affy )
        #count_het_sites(affy_list)
        #sys.exit()
        affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )
        print 'Read affy truth data:'
        print ' -> number of genotyped loci', len(affy)

    #readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )
    readSNPfile2 = time_func(readSNPfile)
    print "Reading SNPs 1 file..."
    snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )
    if OPTIONS.snp2 <> None:
        print "Reading SNPs 2 file..."
        snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )

        # With two call sets, also write the shared-site table (snp.tab).
        dump_shared_snps( affy, snps1, snps2 )
        #intersection_union_snps( affy, snps1, snps2 )
        #sys.exit()

    sharedSites = None
    if OPTIONS.snp2 <> None:
        # Two call sets: compute concordance for each, split by the
        # sites they share with each other.
        sharedSites = overlappingSites( snps1, snps2 )
        results1 = concordance( affy, snps1, sharedSites )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
        results2 = concordance( affy, snps2, sharedSites )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )

    else:
        # Single call set: overall concordance only.
        results = concordance( affy, snps1 )
        printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )

if __name__ == "__main__":
    main(sys.argv)
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
#!/usr/bin/env python

# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output

import sys, re, os, farm_commands

# The GATK jar location is taken from the STING_DIR environment variable;
# bail out early when it is not set.
sting_dir = os.environ.get("STING_DIR")
if sting_dir == None:
    print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
    sys.exit()

# Exactly one positional argument is required: the concordance VCF.
if len(sys.argv) < 2:
    print "Usage: PROG CONCORDANCE_FILE"
    sys.exit()
|
||||
|
||||
class callset_data:
    """Per-Venn-set tally: SNP count, dbSNP membership count, and Ti/Tv."""

    def __init__(self, snps, in_dbSNP, titv):
        # Seed the counters with the caller-supplied starting values.
        self.snps = snps
        self.in_dbSNP = in_dbSNP
        self.titv = titv

    def inc_snps(self):
        """Count one more called SNP."""
        self.snps = self.snps + 1

    def inc_dbsnps(self):
        """Count one more SNP that is also present in dbSNP."""
        self.in_dbSNP = self.in_dbSNP + 1

    def __str__(self):
        return "SNPs: %10d in_dbSNP: %d TiTv: %d" % (self.snps, self.in_dbSNP, self.titv)
|
||||
|
||||
# Pass 1: tally SNPs per Venn set (and how many are in dbSNP), skipping
# filtered records.  Pass 2: split the concordance VCF into one VCF per
# Venn set and run VariantEval on each.
sets = dict()
concordance_filename = os.path.abspath(sys.argv[1])
for line in file(concordance_filename):
    fields = line.split("\t")
    # NOTE(review): `filter` is assigned but never used (and shadows the
    # builtin) -- candidate for removal.
    filter = 0
    if not re.search('HybridSelectionVariantFilter', line):
        match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
        if match:
            my_set = match.groups()[0]

            if sets.has_key(my_set):
                sets[my_set].inc_snps()
                # Column 3 (ID) is '.' for sites not in dbSNP.
                if fields[2] != ".":
                    sets[my_set].inc_dbsnps()
            else:
                # NOTE(review): the first record of a set is stored as
                # callset_data(1, 0, 0) without checking fields[2], so its
                # dbSNP status is never counted -- confirm intended.
                sets[my_set] = callset_data(1, 0, 0)

print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])

print "Splitting concordance file\n"
splits_dir = concordance_filename+".splits"

for vennSet in sets.keys():
    print vennSet
    output_head = splits_dir+"_"+vennSet
    vcf_file = splits_dir+"_"+vennSet+".vcf"
    varianteval_file = splits_dir+"_"+vennSet+".varianteval"

    # Skip sets whose VariantEval output already exists (simple resume).
    if not os.path.exists(varianteval_file):
        # NOTE(review): fout is never closed before farm_commands.cmd is
        # dispatched; relies on interpreter flush.
        fout = open(vcf_file, "w")
        for line in file(concordance_filename):
            if line.startswith("#"):
                # Header lines are copied into every split.
                fout.write(line)
            else:
                match = re.search('Venn='+vennSet, line)
                if match:
                    fout.write(line)

        # NOTE(review): varianteval_out is computed but never used.
        varianteval_out = os.path.basename(concordance_filename)+".vcf"
        farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
                          +"-T VariantEval "
                          +"-B eval,VCF,"+vcf_file+" "
                          +"-o "+varianteval_file+" "
                          +"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
                          "gsa")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from gatkConfigParser import *
|
||||
import glob
|
||||
|
||||
if __name__ == "__main__":
    # Quality-score recalibration pipeline: count covariates on the input
    # BAM, recalibrate it into the output BAM, index the result, then count
    # covariates again on the recalibrated BAM.  Jobs are chained on LSF via
    # farm_commands waitIDs so each step waits for the previous one.
    usage = """usage: %prog [-c config.cfg]* input.bam output.bam"""

    parser = OptionParser(usage=usage)
    parser.add_option("-A", "--args", dest="args",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-m", "--mode", dest="RecalMode",
                      type="string", default="SEQUENTIAL",
                      help="Mode argument to provide to table recalibrator")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-d", "--dir", dest="scratchDir",
                      type="string", default="./",
                      help="Output directory")
    parser.add_option("-w", "--wait", dest="initialWaitID",
                      type="string", default=None,
                      help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments")

    config = gatkConfigParser(OPTIONS.configs)
    inputBAM = args[0]
    outputBAM = args[1]
    # Basename of the output BAM without extension; used to name all
    # intermediate covariate files in the scratch directory.
    rootname = os.path.split(os.path.splitext(outputBAM)[0])[1]

    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'      # pre-recalibration covariates
    initDataFile = covariateInitial + '.recal_data.csv'
    covariateRecal = covariateRoot + '.recal'       # post-recalibration covariates
    recalDataFile = covariateRecal + '.recal_data.csv'

    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)

    def covariateCmd(bam, outputDir):
        # Build the GATK CountCovariates command line for one BAM.
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add

    def recalibrateCmd(inputBAM, dataFile, outputBAM):
        # Build the GATK TableRecalibration command line.
        return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)

    def runCovariateCmd(inputBAM, dataFile, dir, jobid):
        # Dispatch CountCovariates unless its output already exists.
        # NOTE(review): returns None (no new jobid) when the step is
        # skipped, so a later job may chain on a None waitID.
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
            cmd = covariateCmd(inputBAM, dir)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)

    #
    # Actually do some work here
    #
    jobid = OPTIONS.initialWaitID
    if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)

    if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
        cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
        jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)

    # Re-count covariates on the recalibrated BAM for before/after comparison.
    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)
|
||||
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
import os.path
|
||||
from optparse import OptionParser
|
||||
|
||||
def main():
    """Emit an IGV registry <Category> XML block for the given files.

    Each positional argument becomes a <Resource> whose name is the file's
    basename and whose path is its absolute path.  (Python 2 print
    statements modernized to the print() function.)
    """
    global OPTIONS
    usage = "usage: %prog [options] files"
    parser = OptionParser(usage=usage)
    parser.add_option("-c", "--category", dest="category",
                      type='string', default="Anonymous",
                      help="If provided, catagory name")

    (OPTIONS, args) = parser.parse_args()
    if len(args) == 0:
        parser.error("incorrect number of arguments")

    print('<Category name="%s">' % OPTIONS.category)
    for arg in args:
        path = os.path.abspath(arg)
        name = os.path.basename(arg)
        print(' <Resource name="%s" path="%s" serverURL="http://www.broad.mit.edu/webservices/igv"/>' % (name, path))
    print('</Category>')

if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from vcfReader import *
|
||||
from itertools import *
|
||||
import faiReader
|
||||
|
||||
def main():
    # Merge/sort one or more VCF files into a single output, ordering
    # contigs by an optional FAI file.  With --assumeSorted each input is
    # assumed pre-sorted (files are concatenated in order of their first
    # record); otherwise every record is loaded and sorted in memory.
    # NOTE(review): `<>` below is Python 2-only.
    global OPTIONS
    usage = "usage: %prog [options] file1 ... fileN"
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--f", dest="fai",
                      type='string', default=None,
                      help="FAI file defining the sort order of the VCF")
    parser.add_option("-o", "--o", dest="output",
                      type='string', default=None,
                      help="if provided, output will go here instead of stdout")
    parser.add_option("-a", "--assumeSorted", dest="assumeSorted",
                      action='store_true', default=False,
                      help="If provided, this assumes the input VCF files are themselves sorted, enabling a simple efficent merge")
    parser.add_option("-v", "--verbose", dest="verbose",
                      action='store_true', default=False,
                      help="If provided, verbose progress will be enabled")

    (OPTIONS, args) = parser.parse_args()
    if len(args) == 0:
        parser.error("Requires at least 1 VCF to merge")

    # Contig name -> rank mapping used by cmpVCFRecords; None means
    # "order by position only".
    order = None
    if OPTIONS.fai <> None:
        if OPTIONS.verbose: print 'reading FAI', OPTIONS.fai

        order = faiReader.readFAIContigOrdering(OPTIONS.fai)
        #print 'Order', order

    if OPTIONS.output != None:
        out = open(OPTIONS.output,'w')
    else:
        out = sys.stdout

    # NOTE(review): memSort is defined below as memSort(args, order) but is
    # called here with three arguments -- its signature is missing `out`.
    if OPTIONS.assumeSorted:
        mergeSort(out, args, order)
    else:
        memSort(out, args, order)
|
||||
|
||||
def cmpVCFRecords(order, r1, r2):
    """Three-way compare two VCF records for sorting.

    Records order first by contig rank -- looked up in *order*, a dict
    mapping contig name -> rank -- when *order* is given, then by position.
    Returns a negative, zero, or positive int, suitable for
    functools.cmp_to_key.  (The Python 2-only builtin cmp() and the `<>`
    operator are replaced with portable equivalents.)
    """
    def _cmp(a, b):
        # Portable three-way compare: -1, 0, or 1.
        return (a > b) - (a < b)

    if order is not None:
        rank1 = order[str(r1.getChrom())]
        rank2 = order[str(r2.getChrom())]
        byContig = _cmp(rank1, rank2)
        if byContig != 0:
            return byContig
    return _cmp(r1.getPos(), r2.getPos())
|
||||
|
||||
def mergeSort(out, args, order):
    """Concatenate pre-sorted VCF files into *out* in genomic order.

    Peeks at the first record of each input to decide the file order
    (sorted via cmpVCFRecords with *order* as the contig ranking), writes
    the header once, then streams each file's records in turn.

    Fixes vs. the original: sorted()'s Python 2-only `cmp=` keyword is
    replaced with functools.cmp_to_key, file handles are closed via
    context managers (the per-record loop previously leaked them), and the
    local no longer shadows the `file` builtin.
    """
    from functools import cmp_to_key

    header = None
    firstRecords = []
    # Peek at the first record of every input file.
    for filename in args:
        with open(filename) as fh:
            for header, record, counter in lines2VCF(fh, extendedOutput=True, decodeAll=False):
                firstRecords.append((record, filename))
                break

    # Order the files by their first record's genomic coordinate.
    sortedFirsts = sorted(
        firstRecords,
        key=cmp_to_key(lambda a, b: cmpVCFRecords(order, a[0], b[0])))

    # Header comes from the last file peeked at (matches original behavior).
    for headerLine in header:
        print(headerLine, file=out)

    i = 0
    n = len(sortedFirsts)
    for _, filename in sortedFirsts:
        if OPTIONS.verbose:
            i += 1
            print('Processing', filename, ':', i, 'of', n)

        with open(filename) as fh:
            for record in lines2VCF(fh, extendedOutput=False, decodeAll=False):
                print(record.format(), file=out)
|
||||
|
||||
def memSort(out, args, order):
    """Load every record of every input VCF into memory, sort, and write.

    BUG FIX: the original signature was memSort(args, order) although the
    body wrote to an undefined name `out` and the caller in main() invokes
    memSort(out, args, order); `out` is now an explicit parameter.  Also
    replaces the Python 2-only list.sort(cmp) and `print >>` forms, closes
    input files, and stops shadowing the `file` builtin.
    """
    from functools import cmp_to_key

    header = None
    records = []
    for filename in args:
        with open(filename) as fh:
            for header, record, counter in lines2VCF(fh, extendedOutput=True, decodeAll=False):
                records.append(record)

    records.sort(key=cmp_to_key(lambda r1, r2: cmpVCFRecords(order, r1, r2)))
    for line in formatVCF(header, records):
        print(line, file=out)
|
||||
|
||||
# Flip to True to run main() under cProfile and dump hotspot reports.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        # Top 10 by cumulative time, top 10 by self time, then init-related
        # entries from the top half ordered by (time, cum).
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()
|
||||
|
|
@ -1,332 +0,0 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from datetime import date
|
||||
import glob
|
||||
import operator
|
||||
import faiReader
|
||||
import math
|
||||
import shutil
|
||||
|
||||
GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
|
||||
GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
|
||||
GATK = GATK_STABLE
|
||||
|
||||
class CallTarget:
    """One SNP-calling target (a single sample or a trio) plus its
    calling parameters and derived file names.

    listFile and releaseVCF start as None and are filled in later by the
    pipeline driver.  (Normalizes the original's mixed `<>` / `!=` None
    comparisons to `is (not) None`.)
    """

    def __init__(self, name, hetero, minQ = 50, depth = 120, truthGFFName = None, variantEvalArgs = '', otherVCF = None):
        self.name = name
        self.hetero = hetero              # heterozygosity passed to the caller
        self.minQ = minQ                  # minimum call confidence
        self.depth = depth
        # The truth GFF defaults to the target's own name.
        if truthGFFName is not None:
            self.truthGFFName = truthGFFName
        else:
            self.truthGFFName = name
        self.variantEvalArgs = variantEvalArgs
        self.otherVCF = otherVCF
        # A union VCF name is derived only when a companion call set exists.
        self.unionVCF = None
        if otherVCF is not None:
            self.unionVCF = self.name + '.gatk.glftrio.union.filtered.vcf'
        self.listFile = None              # set later: per-target BAM list path
        self.releaseVCF = None            # set later: final release VCF path
|
||||
|
||||
# Population-specific heterozygosity priors for the genotype caller.
CEU_HET = 0.79e-3
YRI_HET = 1.0e-3

# All samples/trios the pipeline can process.  Trio targets carry an
# otherVCF (the existing project calls) so union/intersection release
# files are produced for them.
targets = [
    CallTarget('NA12878', CEU_HET),
    CallTarget('NA12891', CEU_HET),
    CallTarget('NA12892', CEU_HET),
    CallTarget('NA19238', YRI_HET),
    CallTarget('NA19239', YRI_HET),
    CallTarget('NA19240', YRI_HET),

    CallTarget('ceu.trio', CEU_HET, 50, 360, 'NA12878', '--sampleName NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf'),
    CallTarget('yri.trio', YRI_HET, 50, 360, 'NA19240', '--sampleName NA19240', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/YRI_1kg_pilot2.vcf'),

#    CallTarget('NA19240.alltechs.solid_original', YRI_HET, 50, 120, 'NA19240'),
#    CallTarget('NA12878.alltechs.solid_original', CEU_HET, 50, 120, 'NA12878'),

    CallTarget('NA19240.alltechs.solid_recal', YRI_HET, 50, 120, 'NA19240'),
    CallTarget('NA12878.alltechs.solid_recal', CEU_HET, 50, 120, 'NA12878'),
    ]

# Venn subset names (string = literal name, pair = [name, glob pattern]).
# NOTE(review): 'Intersection' appears twice in this list -- confirm
# whether the duplicate is intentional.
sets = ['Intersection', 'filteredInBoth', 'gatk', 'gatk-filteredInOther', 'glftrio', 'glftrio-filteredInOther', 'Intersection', ['gatk-unique', 'gatk.*'], ['glftrio-unique', 'glftrio.*']]
|
||||
|
||||
def main():
    # Stage-driven driver for the 1KG pilot-2 calling pipeline.  The single
    # positional argument is a comma-separated list of stages (CALL, MERGE,
    # FILTER, UNION, SELECT_INTERSECT, EVAL, RELEASE, CLEAN); each stage is
    # run per target, with LSF jobs from one stage chained onto the next
    # via lastJobs.  NOTE(review): `<>`-free here but print statements are
    # Python 2-only.
    global OPTIONS
    usage = "usage: %prog stage [options]"
    parser = OptionParser(usage=usage)
    # parser.add_option("-q", "--farm", dest="farmQueue",
    #                   type="string", default=None,
    #                   help="Farm queue to send processing jobs to")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-s", "--sample", dest="useSample",
                      type='string', default=None,
                      help="If provided, only run pipeline for this sample")
    parser.add_option("-c", "--minQ", dest="minQ",
                      type='float', default=None,
                      help="If provided, will actually use this Q threshold for calls")
    parser.add_option("-d", "--dir", dest="dir",
                      type='string', default="",
                      help="If provided, this is the root where files are read and written")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    stages = args[0].split(",")

    allJobs = []
    for callTarget in targets:
        # Run every target, or just the one selected with --sample.
        if callTarget.name == OPTIONS.useSample or OPTIONS.useSample == None:
            lastJobs = None

            if OPTIONS.minQ != None:
                callTarget.minQ = OPTIONS.minQ

            for stage in stages:
                print 'STAGE', stage
                target = callTarget.name
                # Derived per-target file locations.
                callTarget.listFile = os.path.join("lists", target + ".list")
                unfilteredVCFBaseName = target + '.gatk.ug.vcf'
                unfilteredVCF = os.path.join(OPTIONS.dir, unfilteredVCFBaseName)
                filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf')

                # Scratch area for the scatter/gather intermediates.
                tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCFBaseName + ".scatter")
                if not os.path.exists(tmpdir):
                    os.makedirs(tmpdir)

                if callTarget.unionVCF != None:
                    callTarget.releaseVCF = os.path.join(OPTIONS.dir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")

                print 'Heading into stage', stage, lastJobs

                # Each stage returns the jobs it enqueued; the next stage
                # waits on them.
                newJobs = []
                if stage == 'CALL':
                    newJobs = callSNPs(target, callTarget, callTarget.listFile, tmpdir)

                if stage == 'MERGE':
                    newJobs = mergeSNPs(target, lastJobs, unfilteredVCF, tmpdir)

                if stage == 'FILTER':
                    newJobs = filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)

                if stage == 'UNION':
                    newJobs = unionSNPs(callTarget, lastJobs, filteredVCF )

                if stage == 'SELECT_INTERSECT':
                    if ( callTarget.unionVCF != None ):
                        # we are one fo the release targets
                        newJobs = finalize1KGRelease(callTarget, lastJobs, os.path.join(OPTIONS.dir, callTarget.unionVCF ), callTarget.releaseVCF)

                if stage == 'EVAL':
                    newJobs = evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)

                if stage == 'RELEASE':
                    # Publish under a date-stamped public directory.
                    dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2'
                    subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y")
                    releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF )

                if stage == 'CLEAN':
                    # Drop intermediates and final VCFs for a fresh run.
                    shutil.rmtree("intermediates", True)
                    for file in [filteredVCF, unfilteredVCF]:
                        if os.path.exists(file): os.remove(file)

                print 'New jobs'
                for job in newJobs:
                    print ' ', job
                allJobs.append(newJobs)
                if newJobs != []:
                    lastJobs = newJobs

    print 'EXECUTING JOBS'
    farm_commands.executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
|
||||
|
||||
# Canonical contig names for the autosomes plus X in both conventions:
# hg18 ('chr1'..'chr22', 'chrX') and b36 ('1'..'22', 'X').
hg18 = ['chr%d' % i for i in range(1, 23)] + ['chrX']
b36 = ['%d' % i for i in range(1, 23)] + ['X']

def autosomePlusX(name):
    """True when *name* is an autosome or X in either naming convention."""
    return (name in b36) or (name in hg18)
|
||||
|
||||
def partition(faiRecords, maybeChrom, n):
|
||||
# 1 247249719 3 60 61
|
||||
# 2 242951149 251370554 60 61
|
||||
chromAndSize = [[x[0], int(x[1])] for x in faiRecords if autosomePlusX(x[0])]
|
||||
#print chromAndSize
|
||||
if maybeChrom <> None:
|
||||
chromAndSize = filter(lambda x: x[0] == maybeChrom, chromAndSize)
|
||||
|
||||
def r(chrom, chrStart, chrEnd):
|
||||
outputVCF = 'calls.chr%s.good.%s.vcf' % (chrom, chrStart)
|
||||
L = '-L %s:%d-%d' % (chrom, chrStart, chrEnd)
|
||||
return outputVCF, chrom, chrStart, chrEnd, L
|
||||
|
||||
for chrom, size in chromAndSize:
|
||||
#print 'SIZE', chrom, size
|
||||
if n == 1:
|
||||
yield r(chrom, 1, size)
|
||||
else:
|
||||
idealBp = float(size-1) / n
|
||||
chrEnd = None
|
||||
chrStart = 1
|
||||
for i in range(1, n):
|
||||
chrEnd = 1 + int(round(idealBp * (i + 1)))
|
||||
#print chrom, chrStart, chrEnd, idealBp
|
||||
result = r(chrom, chrStart, chrEnd)
|
||||
chrStart = chrEnd + 1
|
||||
yield result
|
||||
if chrEnd <> size:
|
||||
raise Exception('X')
|
||||
|
||||
def callSNPs(target, callTarget, listFile, tmpdir):
|
||||
extras = "-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e" % (callTarget.hetero)
|
||||
fai = "/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai"
|
||||
NWaysParallel = 5
|
||||
|
||||
bins = partition(faiReader.readFAI(fai), None, NWaysParallel)
|
||||
jobs = list()
|
||||
for outputVCF, chrom, chrStart, chrEnd, L in bins:
|
||||
outputVCF = os.path.join(tmpdir, outputVCF)
|
||||
#print outputVCF, L, os.path.exists(outputVCF)
|
||||
#if not os.path.exists(outputVCF):
|
||||
print 'Enqueuing job for', outputVCF, L
|
||||
cmd = 'java -Xmx2048m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar ' + \
|
||||
'-T UnifiedGenotyper -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -mrl 500000 ' + \
|
||||
'-I %s -confidence %d %s -varout %s -vf VCF -L %s -gm JOINT_ESTIMATE' % (listFile, callTarget.minQ, extras, outputVCF, L)
|
||||
#jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
|
||||
jobs.append(farm_commands.FarmJob(cmd, jobName = "CALL_%s_c%s_s%s" % (target, str(chrom), str(chrStart))))
|
||||
|
||||
return jobs
|
||||
|
||||
def mergeSNPs(target, lastJobs, snpFile, tmpdir):
|
||||
#print 'LastJobs = ', lastJobs
|
||||
cmd = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai %s/*.vcf > %s" % (tmpdir, snpFile)
|
||||
return [farm_commands.FarmJob(cmd, jobName = "MERGE_%s" % (target), dependencies = lastJobs)]
|
||||
|
||||
def filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
|
||||
target = callTarget.name
|
||||
#expression = "AB > 0.75 || DP > %s" % depth
|
||||
expression1 = ['GATK_STANDARD', "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % callTarget.depth]
|
||||
expression2 = ['HARD_TO_VALIDATE', "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)"]
|
||||
cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s ' % (unfilteredVCF, filteredVCF)
|
||||
|
||||
for name, exp in [expression1, expression2]:
|
||||
cmd += '--filterName %s --filterExpression "%s" ' % ( name, exp )
|
||||
|
||||
#jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
|
||||
return [farm_commands.FarmJob(cmd, jobName = "FILTER_%s" % (target), dependencies = lastJobs)]
|
||||
|
||||
def evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
|
||||
evalRoot = os.path.join(OPTIONS.dir, "eval")
|
||||
if not os.path.exists(evalRoot):
|
||||
os.makedirs(evalRoot)
|
||||
|
||||
def eval1(vcfFullPath, namePostfix = "", args = ""):
|
||||
vcf = os.path.split(vcfFullPath)[1]
|
||||
out = os.path.join(OPTIONS.dir, "eval", vcf + namePostfix + ".eval")
|
||||
cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcfFullPath, out, '\;'.join(map(str, b36)) )
|
||||
hapmap3 = os.path.join("../../hapmap3Genotypes", callTarget.truthGFFName + ".b36.gff")
|
||||
if os.path.exists(hapmap3):
|
||||
cmd += " -B hapmap-chip,GFF,%s %s %s" % (hapmap3, callTarget.variantEvalArgs, args)
|
||||
#jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
|
||||
return farm_commands.FarmJob(cmd, jobName = "EVAL_%s_%s" % (callTarget.name, namePostfix), dependencies = lastJobs)
|
||||
|
||||
jobs = []
|
||||
jobs.append(eval1(filteredVCF))
|
||||
jobs.append(eval1(unfilteredVCF))
|
||||
if callTarget.unionVCF != None:
|
||||
jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), ".union"))
|
||||
for set in sets:
|
||||
if type(set) == list:
|
||||
name, selector = set
|
||||
else:
|
||||
name, selector = set, set
|
||||
jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), "." + name, " -vcfInfoSelector set=\"" + selector + "\""))
|
||||
|
||||
if callTarget.releaseVCF != None:
|
||||
jobs.append(eval1(callTarget.releaseVCF, ".release"))
|
||||
|
||||
|
||||
return jobs
|
||||
|
||||
def unionSNPs(callTarget, lastJobs, filteredVCF ):
|
||||
if callTarget.otherVCF != None:
|
||||
cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, callTarget.otherVCF, os.path.join(OPTIONS.dir, callTarget.unionVCF) )
|
||||
return [farm_commands.FarmJob(cmd, jobName = "UNION_%s" % (callTarget.name), dependencies = lastJobs)]
|
||||
else:
|
||||
return []
|
||||
# jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
|
||||
|
||||
def finalize1KGRelease(callTarget, lastJobs, inputVCF, outputVCF):
|
||||
commands = []
|
||||
lastJob = lastJobs
|
||||
|
||||
# subdir = os.path.join(OPTIONS.dir, "1kg_release")
|
||||
# if not os.path.exists(subdir):
|
||||
# os.makedirs(subdir)
|
||||
|
||||
# first, filter out the intersection
|
||||
cmd = GATK + '-T VCFSelect -B variant,VCF,%s -o %s -match "(set eq \'Intersection\' || set eq \'filteredInBoth\')" ' % (inputVCF, outputVCF)
|
||||
lastJob = farm_commands.FarmJob(cmd, jobName = "SELECT_%s" % (callTarget.name), dependencies = lastJob)
|
||||
commands.append(lastJob)
|
||||
|
||||
# call variant annotator to annotate all of the calls
|
||||
# annotatedVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.vcf")
|
||||
# cmd = GATK + '-T VariantAnnotator -I %s -standard -B variant,VCF,%s -vcf %s -L 1:1-1,000,000 ' % (callTarget.listFile, intersectVCF, annotatedVCF)
|
||||
# lastJob = farm_commands.FarmJob(cmd, jobName = "ANNOTATE_%s" % (callTarget.name), dependencies = lastJob)
|
||||
# commands.append(lastJob)
|
||||
#
|
||||
# # filter the calls
|
||||
# filteredVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")
|
||||
# commands = commands + filterSNPs(callTarget, lastJob, annotatedVCF, filteredVCF)
|
||||
#
|
||||
return commands
|
||||
|
||||
def releaseSNPs(dir, callTarget, filteredVCF ):
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
|
||||
print 'Copying files into ', dir, filteredVCF, callTarget.unionVCF, callTarget.releaseVCF
|
||||
if not OPTIONS.dry: shutil.copy(filteredVCF, dir)
|
||||
if callTarget.unionVCF != None:
|
||||
if not OPTIONS.dry: shutil.copy(os.path.join(OPTIONS.dir, callTarget.unionVCF), dir)
|
||||
if callTarget.releaseVCF != None:
|
||||
if not OPTIONS.dry: shutil.copy(callTarget.releaseVCF, dir)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A
|
||||
# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered
|
||||
|
||||
|
||||
# if ( $1 == 3.1 ) then
|
||||
# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf
|
||||
# endif
|
||||
#
|
||||
# if ( $1 == 4 ) then
|
||||
# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only )
|
||||
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO
|
||||
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf
|
||||
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf
|
||||
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf
|
||||
# end
|
||||
# endif
|
||||
#
|
||||
# if ( $1 == 5 ) then
|
||||
# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710
|
||||
#
|
||||
# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf )
|
||||
# echo $file
|
||||
# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710
|
||||
# end
|
||||
#
|
||||
# endif
|
||||
|
|
@ -1,72 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
import string
|
||||
|
||||
class align_block:
|
||||
def __init__(self, indel_count, length, mismatches):
|
||||
self.indel_count = int(indel_count)
|
||||
self.length = int(length)
|
||||
self.mismatches = int(mismatches)
|
||||
def __str__(self):
|
||||
return string.join( map( str, [self.indel_count, self.length, self.mismatches]), ' ')
|
||||
|
||||
class qltout_record:
|
||||
|
||||
def __init__(self, obj):
|
||||
self.fields = []
|
||||
self.align_blocks = []
|
||||
if type(obj) == str:
|
||||
self.setValuesFromString( obj );
|
||||
else:
|
||||
raise TypeError("qltout_record did not recognize type: "+str(type(obj)))
|
||||
|
||||
def setValuesFromString( self, line ):
|
||||
formats = [str, int, int, int, int, int, int, int, int, int, int]
|
||||
|
||||
split_line = line.split()
|
||||
self.fields = map( lambda f, v: f(v), formats, split_line[:11] )
|
||||
|
||||
next_align_block = 11
|
||||
while next_align_block < len(split_line):
|
||||
self.align_blocks.append( align_block(*split_line[next_align_block:next_align_block+3]) )
|
||||
next_align_block += 3
|
||||
#print self.__str__()
|
||||
|
||||
def indelled_bases(self):
|
||||
return sum([ab.indel_count for ab in self.align_blocks])
|
||||
|
||||
def __str__(self):
|
||||
return string.join( map( str, self.fields ), ' ')+' '+string.join( map( str, self.align_blocks ), ' ')
|
||||
|
||||
def id(self): return self.fields[1]
|
||||
def contig(self): return self.fields[6]
|
||||
|
||||
def offset(self): return self.fields[7]
|
||||
def pos(self): return self.fields[7]+1
|
||||
|
||||
def offset_end(self): return self.fields[8]
|
||||
def pos_end(self): return self.fields[8]+1
|
||||
|
||||
def linear_start(self): return self.fields[9]
|
||||
def map_qual(self): return 100 # This only exists in maq files
|
||||
# for ILT and Merlin, we assume the same confidence for all mappings
|
||||
|
||||
|
||||
class qltout_file:
|
||||
def __init__(self, filename):
|
||||
self.filename = filename
|
||||
self.fqlt = open(self.filename)
|
||||
|
||||
def RecordGenerator(self):
|
||||
|
||||
for line in self.fqlt:
|
||||
#fields = line.rstrip("\n").split("\t")
|
||||
if not line.startswith("QUERY"):
|
||||
continue
|
||||
yield qltout_record(line) #fields)
|
||||
raise StopIteration
|
||||
|
||||
def __iter__(self):
|
||||
return self.RecordGenerator()
|
||||
|
||||
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
setenv refroot ~/work/refmut/
|
||||
setenv refname test2
|
||||
setenv ref "${refroot}/${refname}"
|
||||
setenv cov 100
|
||||
setenv sites 1
|
||||
setenv x '' # '-x 6696745:6708910'
|
||||
|
||||
# testing samtools
|
||||
./SimulateReads.py --ref ${ref}.fasta --coverage ${cov} -n ${sites} -l 10 -t None ${x}
|
||||
|
||||
# running samtools
|
||||
samtools import ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.ref ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.sam samtest.bam
|
||||
|
||||
samtools sort samtest.bam samtestSorted
|
||||
|
||||
samtools index samtestSorted.bam
|
||||
|
||||
samtools pileup -f ${ref}.fasta samtestSorted.bam
|
||||
|
|
@ -1,728 +0,0 @@
|
|||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from vcfReader import *
|
||||
#import pylab
|
||||
import operator
|
||||
from itertools import *
|
||||
import math
|
||||
import random
|
||||
|
||||
DEBUG = False
|
||||
|
||||
class Range:
|
||||
ANY = '*'
|
||||
def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False):
|
||||
self.left = left
|
||||
self.right = right
|
||||
self.leftOpen = leftOpen
|
||||
self.rightOpen = rightOpen
|
||||
|
||||
def __str__(self):
|
||||
leftB, rightB = '[', ']'
|
||||
if self.leftOpen: leftB = '('
|
||||
if self.rightOpen: rightB = ')'
|
||||
return '%s%s, %s%s' % (leftB, self.left, self.right, rightB)
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
def dashedString(self):
|
||||
return str(self).replace(", ", "-")
|
||||
|
||||
def contains(self, v):
|
||||
def test(r, op, open):
|
||||
return r == Range.ANY or op(v, r) or (not open and v == r)
|
||||
return test(self.left, operator.__gt__, self.leftOpen) and test(self.right, operator.__lt__, self.rightOpen)
|
||||
|
||||
class CallCovariate:
|
||||
def __init__(self, feature, featureRange, qualRange, FPRate = None):
|
||||
self.feature = feature
|
||||
|
||||
self.featureRange = featureRange
|
||||
|
||||
self.qualRange = qualRange
|
||||
self.FPRate = FPRate
|
||||
|
||||
def containsVariant(self, call):
|
||||
inFeature = self.featureRange.contains(call.getField(self.feature))
|
||||
inQual = self.qualRange.contains(call.getQual())
|
||||
#print 'inFeature, inQual', inFeature, inQual
|
||||
return inFeature and inQual
|
||||
|
||||
def getFPRate(self): return self.FPRate
|
||||
def getFeature(self): return self.feature
|
||||
|
||||
def getCovariateField(self): return self.getFeature() + '_RQ'
|
||||
|
||||
def __str__(self): return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange)
|
||||
|
||||
class RecalibratedCall:
|
||||
def __init__(self, call, features):
|
||||
self.call = call
|
||||
self.features = dict([[feature, None] for feature in features])
|
||||
|
||||
def recalFeature( self, feature, FPRate ):
|
||||
assert self.features[feature] == None, "Feature " + feature + ' has value ' + str(self.features[feature]) + ' for call ' + str(self.call) # not reassigning values
|
||||
assert FPRate <= 1 and FPRate >= 0
|
||||
self.features[feature] = FPRate
|
||||
|
||||
def getFeature( self, feature, missingValue = None, phredScaleValue = False ):
|
||||
v = self.features[feature]
|
||||
if v == None:
|
||||
return missingValue
|
||||
elif phredScaleValue:
|
||||
return phredScale(v)
|
||||
else:
|
||||
return v
|
||||
|
||||
def jointFPErrorRate(self):
|
||||
#print self.features
|
||||
logTPRates = [math.log10(1-r) for r in self.features.itervalues() if r <> None]
|
||||
logJointTPRate = reduce(lambda x, y: x + y, logTPRates, 0)
|
||||
logJointTPRate = min(logJointTPRate, 1e-3 / 3) # approximation from het of 0.001
|
||||
jointTPRate = math.pow(10, logJointTPRate)
|
||||
#print logTPRates
|
||||
#print logJointTPRate, jointTPRate
|
||||
return 1 - jointTPRate
|
||||
|
||||
def featureStringList(self):
|
||||
return ','.join(map(lambda feature: '%s=Q%d' % (feature, self.getFeature(feature, '*', True)), self.features.iterkeys()))
|
||||
|
||||
def __str__(self):
|
||||
return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))
|
||||
|
||||
def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, maxQScore = 10000000, mustBeVariant = False, skip = None ):
|
||||
if filter == None:
|
||||
filter = not OPTIONS.unfiltered
|
||||
|
||||
f = open(file)
|
||||
header, columnNames, lines = readVCFHeader(f)
|
||||
|
||||
nLowQual = 0
|
||||
counter = 0
|
||||
def parseVariant(args):
|
||||
global nLowQual, counter
|
||||
header1, VCF, counter = args
|
||||
counter += 1
|
||||
#print counter, skip, counter % skip
|
||||
if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
|
||||
#print 'filtering', VCF
|
||||
return None
|
||||
elif VCF.getQual() <= minQScore or VCF.getQual() > maxQScore:
|
||||
#print 'filtering', VCF.getQual()
|
||||
#nLowQual += 1
|
||||
return None
|
||||
elif skip <> None and counter % skip <> 0:
|
||||
#print 'skipping'
|
||||
return None
|
||||
elif random.random() <= downsampleFraction:
|
||||
#print 'keeping', VCF
|
||||
return VCF
|
||||
else:
|
||||
return None
|
||||
|
||||
variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
|
||||
if nLowQual > 0:
|
||||
print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore)
|
||||
return header, variants
|
||||
|
||||
def selectVariants( variants, selector = None ):
|
||||
if selector <> None:
|
||||
return filter(selector, variants)
|
||||
else:
|
||||
return variants
|
||||
|
||||
def titv(variants):
|
||||
ti = len(filter(VCFRecord.isTransition, variants))
|
||||
tv = len(variants) - ti
|
||||
titv = ti / (1.0*max(tv,1))
|
||||
|
||||
return titv
|
||||
|
||||
def dbSNPRate(variants):
|
||||
inDBSNP = len(filter(VCFRecord.isKnown, variants))
|
||||
return float(inDBSNP) / max(len(variants),1)
|
||||
|
||||
def gaussian(x, mu, sigma):
|
||||
constant = 1 / math.sqrt(2 * math.pi * sigma**2)
|
||||
exponent = -1 * ( x - mu )**2 / (2 * sigma**2)
|
||||
return constant * math.exp(exponent)
|
||||
|
||||
# if target = T, and FP calls have ti/tv = 0.5, we want to know how many FP calls
|
||||
# there are in N calls with ti/tv of X.
|
||||
#
|
||||
def titvFPRateEstimate(variants, target):
|
||||
titvRatio = titv(variants)
|
||||
|
||||
# f <- function(To,T) { (To - T) / (1/2 - T) + 0.001 }
|
||||
def theoreticalCalc():
|
||||
if titvRatio >= target:
|
||||
FPRate = 0
|
||||
else:
|
||||
FPRate = (titvRatio - target) / (0.5 - target)
|
||||
FPRate = min(max(FPRate, 0), 1)
|
||||
TPRate = max(min(1 - FPRate, 1 - dephredScale(OPTIONS.maxQScoreForCovariate)), dephredScale(OPTIONS.maxQScoreForCovariate))
|
||||
if DEBUG: print 'FPRate', FPRate, titvRatio, target
|
||||
assert FPRate >= 0 and FPRate <= 1
|
||||
return TPRate
|
||||
|
||||
# gaussian model
|
||||
def gaussianModel():
|
||||
LEFT_HANDED = True
|
||||
sigma = 1 # old value is 5
|
||||
constant = 1 / math.sqrt(2 * math.pi * sigma**2)
|
||||
exponent = -1 * ( titvRatio - target )**2 / (2 * sigma**2)
|
||||
TPRate = gaussian(titvRatio, target, sigma) / gaussian(target, target, sigma)
|
||||
if LEFT_HANDED and titvRatio >= target:
|
||||
TPRate = 1
|
||||
TPRate -= dephredScale(OPTIONS.maxQScoreForCovariate)
|
||||
if DEBUG: print 'TPRate', TPRate, constant, exponent, dephredScale(OPTIONS.maxQScoreForCovariate)
|
||||
return TPRate
|
||||
|
||||
FPRate = 1 - theoreticalCalc()
|
||||
#FPRate = 1 - gaussianModel()
|
||||
nVariants = len(variants)
|
||||
|
||||
if DEBUG: print ':::', nVariants, titvRatio, target, FPRate
|
||||
|
||||
return titvRatio, FPRate
|
||||
|
||||
def phredScale(errorRate):
|
||||
return -10 * math.log10(max(errorRate, 1e-10))
|
||||
|
||||
def dephredScale(qscore):
|
||||
return math.pow(10, float(qscore) / -10)
|
||||
|
||||
def frange6(*args):
|
||||
"""A float range generator."""
|
||||
start = 0.0
|
||||
step = 1.0
|
||||
|
||||
l = len(args)
|
||||
if l == 1:
|
||||
end = args[0]
|
||||
elif l == 2:
|
||||
start, end = args
|
||||
elif l == 3:
|
||||
start, end, step = args
|
||||
if step == 0.0:
|
||||
raise ValueError, "step must not be zero"
|
||||
else:
|
||||
raise TypeError, "frange expects 1-3 arguments, got %d" % l
|
||||
|
||||
v = start
|
||||
while True:
|
||||
if (step > 0 and v >= end) or (step < 0 and v <= end):
|
||||
raise StopIteration
|
||||
yield v
|
||||
v += step
|
||||
|
||||
def compareFieldValues( v1, v2 ):
|
||||
if type(v1) <> type(v2):
|
||||
#print 'Different types', type(v1), type(v2)
|
||||
c = cmp(type(v1), type(v2))
|
||||
else:
|
||||
c = cmp(v1, v2)
|
||||
#print 'Comparing %s %s = %s' % (v1, v2, c)
|
||||
return c
|
||||
|
||||
def calculateBins(variants, field, minValue, maxValue, partitions):
|
||||
values = map(lambda x: x.getField(field), variants)
|
||||
return calculateBinsForValues(values, field, minValue, maxValue, partitions)
|
||||
|
||||
def calculateBinsForValues(values, field, minValue, maxValue, partitions):
|
||||
sortedValues = sorted(values)
|
||||
captureFieldRangeForPrinting(field, sortedValues)
|
||||
|
||||
targetBinSize = len(values) / (1.0*partitions)
|
||||
#print sortedValues
|
||||
uniqBins = groupby(sortedValues)
|
||||
binsAndSizes = map(lambda x: [x[0], len(list(x[1]))], uniqBins)
|
||||
#print 'BINS AND SIZES', binsAndSizes
|
||||
|
||||
def bin2Break(bin): return [bin[0], bin[0], bin[1]]
|
||||
bins = [bin2Break(binsAndSizes[0])]
|
||||
for bin in binsAndSizes[1:]:
|
||||
#print ' Breaks', bins
|
||||
#print ' current bin', bin
|
||||
curLeft = bins[-1][0]
|
||||
curSize = bin[1]
|
||||
prevSize = bins[-1][2]
|
||||
#print curSize, prevSize
|
||||
if curSize + prevSize > targetBinSize or (not isNumber(curLeft) and isNumber(bin[0])):
|
||||
#print ' => appending', bin2Break(bin)
|
||||
bins.append(bin2Break(bin))
|
||||
else:
|
||||
bins[-1][1] = bin[0]
|
||||
bins[-1][2] += curSize
|
||||
|
||||
#print 'Returning ', bins
|
||||
#sys.exit(1)
|
||||
return bins
|
||||
|
||||
def fieldRange(variants, field):
|
||||
values = map(lambda v: v.getField(field), variants)
|
||||
minValue = min(values)
|
||||
maxValue = max(values)
|
||||
#rangeValue = maxValue - minValue
|
||||
bins = calculateBins(variants, field, minValue, maxValue, OPTIONS.partitions)
|
||||
validateBins(bins)
|
||||
return minValue, maxValue, bins
|
||||
|
||||
def validateBins(bins):
|
||||
#print 'Bins are', bins
|
||||
for left1, right1, count1 in bins:
|
||||
for left2, right2, count2 in bins:
|
||||
def contains2(x):
|
||||
return left2 < x and x < right2
|
||||
|
||||
if left1 <> left2 and right1 <> right2:
|
||||
if None in [left1, left2, right1, right2]:
|
||||
pass # we're ok
|
||||
elif contains2(left1) or contains2(right2):
|
||||
raise Exception("Bad bins", left1, right1, left2, right2)
|
||||
|
||||
def printFieldQualHeader():
|
||||
more = ""
|
||||
if TRUTH_CALLS <> None:
|
||||
more = CallCmp.HEADER
|
||||
def p(stream):
|
||||
if stream <> None:
|
||||
print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more
|
||||
p(sys.stdout)
|
||||
p(RECAL_LOG)
|
||||
|
||||
def printFieldQual( category, field, cc, variants ):
|
||||
more = ""
|
||||
if TRUTH_CALLS <> None:
|
||||
callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
|
||||
more = str(callComparison)
|
||||
novels = selectVariants(variants, VCFRecord.isNovel)
|
||||
def p(stream):
|
||||
if stream <> None:
|
||||
print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more
|
||||
p(sys.stdout)
|
||||
p(RECAL_LOG)
|
||||
|
||||
FIELD_RANGES = dict()
|
||||
def captureFieldRangeForPrinting(field, sortedValues):
|
||||
"""Finds the minimum float value in sortedValues for convenience printing in recal.log"""
|
||||
#print sortedValues
|
||||
floatValues = filter(isNumber, sortedValues)
|
||||
if floatValues <> []:
|
||||
FIELD_RANGES[field] = floatValues[0]
|
||||
#print 'Setting field range to', field, FIELD_RANGES[field]
|
||||
|
||||
|
||||
def isNumber(x):
|
||||
return isinstance(x, (int, long, float))
|
||||
|
||||
def binString(field, cc):
|
||||
epsilon = 1e-2
|
||||
left, right = cc.left, cc.right
|
||||
leftStr = str(left)
|
||||
rightStr = "%5s" % str(right)
|
||||
if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
|
||||
left = right = FIELD_RANGES[field] - epsilon
|
||||
if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
|
||||
left = right - epsilon
|
||||
if isNumber(left): leftStr = "%.4f" % left
|
||||
if isNumber(right): rightStr = "%.4f" % right
|
||||
return '%12s %12s' % (leftStr, rightStr)
|
||||
|
||||
#
|
||||
#
|
||||
#
|
||||
def recalibrateCalls(variants, fields, callCovariates):
|
||||
def phred(v): return phredScale(v)
|
||||
|
||||
for variant in variants:
|
||||
recalCall = RecalibratedCall(variant, fields)
|
||||
originalQual = variant.getField('QUAL')
|
||||
|
||||
for callCovariate in callCovariates:
|
||||
if callCovariate.containsVariant(variant):
|
||||
FPR = callCovariate.getFPRate()
|
||||
recalCall.recalFeature(callCovariate.getFeature(), FPR)
|
||||
recalCall.call.setField(callCovariate.getCovariateField(), phred(FPR))
|
||||
|
||||
#recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate()))
|
||||
recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate()))
|
||||
recalCall.call.setField('OQ', originalQual)
|
||||
#print 'recalibrating', variant.getLoc()
|
||||
#print ' =>', variant
|
||||
yield recalCall.call
|
||||
|
||||
#
|
||||
#
|
||||
#
|
||||
def optimizeCalls(variants, fields, titvTarget):
|
||||
callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True)
|
||||
recalCalls = recalibrateCalls(variants, fields, callCovariates)
|
||||
return recalCalls, callCovariates
|
||||
|
||||
def printCallQuals(field, recalCalls, titvTarget, info = ""):
|
||||
print '--------------------------------------------------------------------------------'
|
||||
print info
|
||||
calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False, category = "optimized-calls" )
|
||||
print 'Cumulative'
|
||||
calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False, category = "optimized-calls" )
|
||||
|
||||
def all( p, l ):
|
||||
for elt in l:
|
||||
if not p(elt): return False
|
||||
return True
|
||||
|
||||
|
||||
def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]):
|
||||
minValue, maxValue, featureBins = fieldRange(variants, field)
|
||||
|
||||
#print 'BREAKQuals', breakQuals[0]
|
||||
bins = [(x,y) for x in featureBins for y in breakQuals]
|
||||
#print 'BINS', bins
|
||||
|
||||
def variantsInBin(featureBin, qualRange):
|
||||
right = featureBin[1]
|
||||
if cumulative:
|
||||
right = Range.ANY
|
||||
cc = CallCovariate(field, Range(featureBin[0], right), qualRange)
|
||||
|
||||
return cc, selectVariants(variants, lambda v: cc.containsVariant(v))
|
||||
|
||||
#sys.exit(1)
|
||||
return starmap( variantsInBin, bins )
|
||||
|
||||
def qBreaksRanges(qBreaks, useBreaks):
|
||||
if qBreaks == None or not useBreaks:
|
||||
return [Range()] # include everything in a single range
|
||||
else:
|
||||
breaks = map(float, qBreaks.split(','))
|
||||
return map(lambda x, y: Range(x,y, rightOpen = True), chain([Range.ANY], breaks), chain(breaks, [Range.ANY]))
|
||||
|
||||
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ):
|
||||
covariates = []
|
||||
|
||||
if printHeader: printFieldQualHeader()
|
||||
for field in fields:
|
||||
if DEBUG: print 'Optimizing field', field
|
||||
|
||||
titv, FPRate = titvFPRateEstimate(variants, titvTarget)
|
||||
#print 'Overall FRRate:', FPRate, nErrors, phredScale(FPRate)
|
||||
|
||||
for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')):
|
||||
#print 'CC', cc, field, useBreaks
|
||||
if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
|
||||
titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
|
||||
#dbsnp = dbSNPRate(selectedVariants)
|
||||
cc.FPRate = FPRate
|
||||
covariates.append(cc)
|
||||
printFieldQual( category, prefix + field, cc, selectedVariants )
|
||||
else:
|
||||
print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants)
|
||||
|
||||
return covariates
|
||||
|
||||
class CallCmp:
|
||||
def __init__(self, nTP, nFP, nFN):
|
||||
self.nTP = nTP
|
||||
self.nFP = nFP
|
||||
self.nFN = nFN
|
||||
|
||||
# def FPRate(self):
|
||||
# return (1.0*self.nFP) / max(self.nTP + self.nFP, 1)
|
||||
|
||||
def FNRate(self):
|
||||
return (1.0*self.nFN) / max(self.nTP + self.nFN, 1)
|
||||
|
||||
def sensitivity(self):
|
||||
# = TP / (TP + FN)
|
||||
return (1.0*self.nTP) / max( self.nTP + self.nFN,1 )
|
||||
|
||||
def PPV(self):
|
||||
# = TP / (TP + FP)
|
||||
return (1.0*self.nTP) / max( self.nTP + self.nFP, 1 )
|
||||
|
||||
HEADER = "TP FP FN FNRate Sensitivity PPV"
|
||||
|
||||
def __str__(self):
|
||||
return '%6d %6d %6d %.3f %.3f %.3f' % (self.nTP, self.nFP, self.nFN, self.FNRate(), self.sensitivity(), self.PPV())
|
||||
|
||||
def variantInTruth(variant, truth):
|
||||
if variant.getLoc() in truth:
|
||||
return truth[variant.getLoc()]
|
||||
else:
|
||||
return False
|
||||
|
||||
def isVariantInSample(t, sample):
|
||||
#print "isVariantInSample", t.getLoc(), t.getField(sample), x
|
||||
return t.getField(sample) <> "0/0"
|
||||
|
||||
def variantsInTruth(truth):
|
||||
# fixme
|
||||
return len(filter(lambda x: isVariantInSample(x, OPTIONS.useSample), truth))
|
||||
|
||||
def sensitivitySpecificity(variants, truth):
    """Classify each call as TP or FP against the truth dict and tally counts.

    Side effects: marks TP calls with TP=1 (and clears the truth record's
    FN marker), marks FP calls with TP=0.  Returns (CallCmp(nTP, nFP, nFN),
    list-of-FP-records).

    NOTE(review): indentation was reconstructed; the `else` below is taken
    to pair with the sample/onlyAtTruth mode test (so in sample mode, calls
    at loci absent from the truth are neither TP nor FP) -- confirm.
    """
    nTP, nFP = 0, 0
    FPs = []
    for variant in variants:
        t = variantInTruth(variant, truth)

        isTP, isFP = False, False
        if OPTIONS.useSample or OPTIONS.onlyAtTruth:
            if t: # we have a site
                # TP when call and truth agree on whether the sample is variant.
                isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant())
                isFP = not isTP
        else:
            # Site-level mode: any call at a truth locus is a TP.
            isTP = t
            isFP = not t

        #if variant.getLoc() == "1:867694":
        #    print variant, 'T: [', t, '] isTP, isFP', isTP, isFP

        if isTP:
            t.setField("FN", 0)          # truth record was found -> not an FN
            variant.setField("TP", 1)
            nTP += 1
        elif isFP:
            nFP += 1
            variant.setField("TP", 0)
            #print t, variant, "is a FP!"
            FPs.append(variant)
    nRef = 0 # len(filter(lambda x: not x.isVariant(), truth.itervalues()))
    # FN = truth variants never recovered as TPs (nRef correction disabled).
    nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
    #print 'nRef', nTP, nFP, nFN, nRef
    return CallCmp(nTP, nFP, nFN), FPs
|
||||
|
||||
def markTruth(calls):
    """Initialize the TP field to 0 on every record in *calls*.

    Skipped entirely when a sample was selected (TP bookkeeping is then
    handled per-genotype in sensitivitySpecificity).
    """
    if OPTIONS.useSample:
        return
    for rec in calls.itervalues():
        rec.setField("TP", 0)  # start everything as "not a true positive"
|
||||
|
||||
def compareCalls(calls, truthCalls):
    """Print per-bin and cumulative truth-comparison tables for the calls.

    Bins are built over both the QUAL column and the OQ annotation.
    NOTE(review): truthCalls is currently unused (markTruth call disabled).
    """
    #markTruth(truthCalls)

    def compare1(name, cumulative):
        # One report per binning field; printFieldQual tags the output with
        # "truth-comparison-<name>" so per-bin and cumulative rows are distinguishable.
        for field in ["QUAL", "OQ"]:
            for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
                #print selectedVariants[0]
                printFieldQual("truth-comparison-" + name, field, cc, selectedVariants )

    print 'PER BIN nCalls=', len(calls)
    compare1('per-bin', False)

    print 'CUMULATIVE nCalls=', len(calls)
    compare1('cum', True)
|
||||
|
||||
def randomSplit(l, pLeft):
    """Randomly partition *l* into two lists.

    Each element independently lands in the first list with probability
    pLeft, otherwise in the second.  Exactly one random draw is consumed
    per element.  Returns (left, right).
    """
    left, right = [], []
    for elt in l:
        bucket = left if random.random() < pLeft else right
        if elt != None:  # mirrors the original post-filter, which dropped None entries
            bucket.append(elt)
    return left, right
|
||||
|
||||
def setup():
|
||||
global OPTIONS, header
|
||||
usage = "usage: %prog files.list [options]"
|
||||
parser = OptionParser(usage=usage)
|
||||
parser.add_option("-f", "--f", dest="fields",
|
||||
type='string', default="QUAL",
|
||||
help="Comma-separated list of fields (either in the VCF columns of as INFO keys) to use during optimization [default: %default]")
|
||||
parser.add_option("-t", "--truth", dest="truth",
|
||||
type='string', default=None,
|
||||
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
|
||||
parser.add_option("-l", "--recalLog", dest="recalLog",
|
||||
type='string', default="recal.log",
|
||||
help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
|
||||
parser.add_option("-u", "--unfiltered", dest="unfiltered",
|
||||
action='store_true', default=False,
|
||||
help="If provided, unfiltered calls will be used in comparisons [default: %default]")
|
||||
parser.add_option("", "--plottable", dest="plottableNones",
|
||||
action='store_true', default=False,
|
||||
help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
|
||||
parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth",
|
||||
action='store_true', default=False,
|
||||
help="If provided, we only consider TP/FP/FN at truth sites[default: %default]")
|
||||
parser.add_option("-p", "--partitions", dest="partitions",
|
||||
type='int', default=25,
|
||||
help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]")
|
||||
parser.add_option("", "--maxRecordsForCovariates", dest="maxRecordsForCovariates",
|
||||
type='int', default=2000000,
|
||||
help="Derive covariate information from up to this many VCF records. For files with more than this number of records, the system downsamples the reads [default: %default]")
|
||||
parser.add_option("-m", "--minVariantsPerBin", dest="minVariantsPerBin",
|
||||
type='int', default=10,
|
||||
help="Don't include any covariates with fewer than this number of variants in the bin, if such a thing happens. NEEDS TO BE FIXED")
|
||||
parser.add_option("-M", "--maxRecords", dest="maxRecords",
|
||||
type='int', default=None,
|
||||
help="Maximum number of input VCF records to process, if provided. Default is all records")
|
||||
parser.add_option("-q", "--qMin", dest="minQScore",
|
||||
type='int', default=-1,
|
||||
help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
|
||||
parser.add_option("-Q", "--qMax", dest="maxQScore",
|
||||
type='int', default=1000000,
|
||||
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
||||
parser.add_option("", "--QBreaks", dest="QBreaks",
|
||||
type='string', default=None,
|
||||
help="Breaks in QUAL for generating covarites [default: %default]")
|
||||
parser.add_option("", "--maxQScoreForCovariate", dest="maxQScoreForCovariate",
|
||||
type='int', default=60,
|
||||
help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
|
||||
parser.add_option("-o", "--outputVCF", dest="outputVCF",
|
||||
type='string', default="recal.vcf",
|
||||
help="If provided, a VCF file will be written out to this file [default: %default]")
|
||||
parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
|
||||
type='string', default=None,
|
||||
help="If provided, VCF file will be written out to this file [default: %default]")
|
||||
parser.add_option("", "--titv", dest="titvTarget",
|
||||
type='float', default=None,
|
||||
help="If provided, we will optimize calls to the targeted ti/tv rather than that calculated from known calls [default: %default]")
|
||||
parser.add_option("-b", "--bootstrap", dest="bootStrap",
|
||||
type='float', default=None,
|
||||
help="If provided, the % of the calls used to generate the recalibration tables. [default: %default]")
|
||||
parser.add_option("-k", "--skip", dest="skip",
|
||||
type=int, default=None,
|
||||
help="If provided, we'll only take every nth record [default: %default]")
|
||||
parser.add_option("-s", "--useSample", dest="useSample",
|
||||
type='string', default=False,
|
||||
help="If provided, we will examine sample genotypes for this sample, and consider TP/FP/FN in the truth conditional on sample genotypes [default: %default]")
|
||||
parser.add_option("-r", "--dontRecalibrate", dest="dontRecalibrate",
|
||||
action='store_true', default=False,
|
||||
help="If provided, we will not actually do anything to the calls, they will just be assessed [default: %default]")
|
||||
|
||||
(OPTIONS, args) = parser.parse_args()
|
||||
if len(args) > 2:
|
||||
parser.error("incorrect number of arguments")
|
||||
return args
|
||||
|
||||
def assessCalls(file):
    """Load calls from *file*, print summary stats, pick a Ti/Tv target.

    Returns (header, allCalls, titvtarget); titvtarget is the --titv value
    when given, otherwise the Ti/Tv ratio of the known (dbSNP) subset.
    """
    print 'Counting records in VCF', file
    numberOfRecords = 1
    downsampleFraction = 1
    if OPTIONS.maxRecords <> None:
        numberOfRecords = quickCountRecords(open(file))
        if OPTIONS.maxRecords < numberOfRecords:
            numberOfRecords = OPTIONS.maxRecords
        # Keep at most maxRecordsForCovariates records for covariate building.
        # NOTE(review): placement reconstructed -- equivalent at this level or
        # one level up, since downsampleFraction stays 1 when maxRecords is None.
        downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
    #print 'Reading variants', OPTIONS.skip, downsampleFraction
    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
    allCalls = list(allCalls)
    print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
    print 'Number of selected VCF records', len(allCalls)

    titvtarget = OPTIONS.titvTarget
    if titvtarget == None:
        # No explicit target: use the Ti/Tv of the known calls as the goal.
        titvtarget = titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv all ', titv(allCalls)
    print 'Ti/Tv known', titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv novel', titv(selectVariants(allCalls, VCFRecord.isNovel))

    return header, allCalls, titvtarget
|
||||
|
||||
def determineCovariates(allCalls, titvtarget, fields):
    """Optimize calls to build the recalibration covariates table.

    With --bootstrap, only that fraction of calls drives the optimization
    and the held-out remainder is recalibrated as an evaluation set.
    Returns the covariates.
    """
    if OPTIONS.bootStrap:
        callsToOptimize, recalEvalCalls = randomSplit(allCalls, OPTIONS.bootStrap)
    else:
        callsToOptimize = allCalls

    recalOptCalls, covariates = optimizeCalls(callsToOptimize, fields, titvtarget)
    printCallQuals("QUAL", list(recalOptCalls), titvtarget, 'OPTIMIZED CALLS')

    if OPTIONS.bootStrap:
        # Sanity-check the covariates on the calls that were held out above.
        recalibatedEvalCalls = recalibrateCalls(recalEvalCalls, fields, covariates)
        printCallQuals("QUAL", list(recalibatedEvalCalls), titvtarget, 'BOOTSTRAP EVAL CALLS')

    return covariates
|
||||
|
||||
def writeRecalibratedCalls(file, header, calls):
|
||||
if file:
|
||||
f = open(file, 'w')
|
||||
#print 'HEADER', header
|
||||
i = 0
|
||||
for line in formatVCF(header, calls):
|
||||
if i % 10000 == 0: print 'writing VCF record', i
|
||||
i += 1
|
||||
print >> f, line
|
||||
f.close()
|
||||
|
||||
def readTruth(truthVCF):
|
||||
print 'Reading truth file', truthVCF
|
||||
rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1])
|
||||
truth = dict( [[v.getLoc(), v] for v in rawTruth])
|
||||
print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
|
||||
return truth
|
||||
|
||||
def evaluateTruth(header, callVCF, truth, truthVCF):
    """Re-read the emitted calls, compare them to the truth set, and
    optionally write the false negatives to --FNoutputVCF."""
    print 'Reading variants back in from', callVCF
    header, calls = readVariants(callVCF)
    calls = list(calls)

    print '--------------------------------------------------------------------------------'
    print 'Comparing calls to truth', truthVCF
    print ''

    compareCalls(calls, truth)

    # NOTE(review): this rewrites callVCF in place with the records that were
    # just read back from it -- confirm that is intentional.
    writeRecalibratedCalls(callVCF, header, calls)

    def isFN(v):
        # A truth variant for the selected sample that was never marked as
        # found (sensitivitySpecificity sets "FN"=0 on recovered records).
        return isVariantInSample(v, OPTIONS.useSample) and not v.hasField("FN")

    if truth <> None and OPTIONS.FNoutputVCF:
        f = open(OPTIONS.FNoutputVCF, 'w')
        #print 'HEADER', header
        for line in formatVCF(header, filter( isFN, truth.itervalues())):
            print >> f, line
        f.close()
|
||||
|
||||
# Populated by main(): dict of truth records keyed by locus (see readTruth).
TRUTH_CALLS = None
# Populated by main(): open handle of the recalibration log (--recalLog).
RECAL_LOG = None
|
||||
def main():
    """Drive the recalibration: parse options, optionally load a truth set,
    recalibrate (or just assess) the input VCF, then evaluate vs. truth."""
    global TRUTH_CALLS, RECAL_LOG

    args = setup()
    fields = OPTIONS.fields.split(',')

    truthVCF = None
    #print("LENGTH OF ARGS "+str(len(args)))

    if OPTIONS.truth <> None:
        truthVCF = OPTIONS.truth
        TRUTH_CALLS = readTruth(truthVCF)

    if OPTIONS.recalLog <> None:
        # Log the run configuration as '#' comment lines at the top of the log.
        RECAL_LOG = open(OPTIONS.recalLog, "w")
        print >> RECAL_LOG, "# optimized vcf", args[0]
        print >> RECAL_LOG, "# truth vcf", truthVCF
        for key, value in OPTIONS.__dict__.iteritems():
            print >> RECAL_LOG, '#', key, value

    header, allCalls, titvTarget = assessCalls(args[0])
    if not OPTIONS.dontRecalibrate:
        covariates = determineCovariates(allCalls, titvTarget, fields)
        print OPTIONS
        # Re-read the input so the full stream (not the downsampled covariate
        # subset) is recalibrated and written out.
        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
        RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
        writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
    else:
        # Assessment-only mode: report qualities and point outputVCF at the input.
        printFieldQualHeader()
        printCallQuals("QUAL", allCalls, titvTarget)
        OPTIONS.outputVCF = args[0]

    if truthVCF <> None:
        evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
|
||||
|
||||
|
||||
# Flip to True to run main() under cProfile and print hot-spot reports.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        # Top 10 by cumulative then by self time, then 'init' functions
        # limited to the top 50% of entries.
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
import sys
|
||||
from optparse import OptionParser
|
||||
|
||||
def getRsIDSet(dbSNPIds, idIndex):
    """Collect the ID column from a whitespace-delimited dbSNP file.

    dbSNPIds -- path to the file to read, one record per line
    idIndex  -- zero-based column index of the rsID field

    Returns a frozenset of the ID strings.
    """
    handle = open(dbSNPIds)
    # BUGFIX: the original never closed the input file; ensure it is closed
    # even if a malformed line raises IndexError.
    try:
        return frozenset(line.split()[idIndex] for line in handle)
    finally:
        handle.close()
|
||||
|
||||
|
||||
def main():
    """Filter a dbSNP file down to the records whose rsID appears in a
    second dbSNP file, writing the matches to an output file."""
    global OPTIONS
    usage = "usage: %prog [options] dbSNP.in.rsids dbSNP.to.match dbSNP.out"
    parser = OptionParser(usage=usage)
    parser.add_option("-v", "--verbose", dest="verbose",
                      action='store_true', default=False,
                      help="")

    (OPTIONS, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("incorrect number of arguments")

    dbSNPIds = args[0]
    dbSNPMatch = args[1]
    dbSNPOut = args[2]
    # Column 5 (0-based index 4) holds the rsID in both input files
    # (matches the idIndex used by getRsIDSet).
    idIndex = 4

    rsSet = getRsIDSet(dbSNPIds, idIndex)
    print 'rsID set has %d elements' % len(rsSet)

    # Stream the second file, copying through only lines whose rsID matched.
    count = 0
    matched = 0
    out = open(dbSNPOut, 'w')
    for line in open(dbSNPMatch):
        count += 1
        rsID = line.split()[idIndex]
        if rsID in rsSet:
            #sys.stdout.write(line)
            matched += 1
            out.write(line)
    print 'Processed %d lines, matching %d elements, excluding %d' % ( count, matched, count - matched )
    out.close()
|
||||
|
||||
# Standard script entry point.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,287 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
"""Starts from TGTC list of fastb, qualb, and qltout files and creates merged SAM/BAM files"""
|
||||
|
||||
import os, commands, sys
|
||||
from farm_commands import cmd
|
||||
|
||||
# Global list of failed conversions to be reported at end of execution.
# Holds FailedConversion instances, plus bare strings appended by
# convert_lanes on unknown failures.
failed_list = []
|
||||
|
||||
class FailedConversion:
|
||||
def __init__(self, lane, error_str):
|
||||
self.lane = lane
|
||||
self.error_str = error_str
|
||||
print self
|
||||
def __str__(self):
|
||||
return self.error_str+" while attempting conversion of "+str(self.lane)
|
||||
|
||||
class lane:
    """One sequencing lane parsed from a tgtc line (fastb/qualb/qltout paths),
    with the derived SAM/BAM paths and reference-dictionary/metrics lookup.

    NOTE(review): the class-level attributes below run os.listdir at import
    time against a hard-coded pipeline directory -- this module cannot even
    be imported off-cluster.
    """
    bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list";
    pipe_dirs_head = "/slxa/pipelineOutput/farm"
    pipe_dirs = os.listdir(pipe_dirs_head)
    # Maps flowcell barcode -> pipeline output directory name
    # (directory names look like "<something>_<flowcell>...").
    pipe_dirs_dict = dict( [(dir.split("_")[1], dir) for dir in pipe_dirs if "_" in dir] )

    def __init__(self, line):
        # A tgtc line is space-separated: fdl fastb qualb qltout [group].
        fields = line.split(" ")
        if len(fields) == 5:
            self.fdl, self.fastb, self.qualb, self.qltout, group = fields
        else:
            self.fdl, self.fastb, self.qualb, self.qltout = fields
        # Strip the trailing newline when qltout is the last field.
        self.qltout = self.qltout.rstrip()

        self.fqltout = os.path.splitext(self.qltout)[0]+".fais_filtered.qltout"
        # fdl is FLOWCELL.DATE.LANE
        self.flowcell, self.date, self.lane = self.fdl.split(".")
        self.flowlane = self.flowcell+"."+self.lane
        self.path_head = os.path.splitext(self.qltout)[0]
        self.sam = self.path_head+".sam"
        self.bam = self.path_head+".bam"
        self.head = ".".join(os.path.basename(self.path_head).split(".")[:2]) # should amount to FLOWCELL.LANE

        #self.refdict_metrics()

    def pipe_dir(self):
        # Raises KeyError when the flowcell has no pipeline output directory.
        dir = lane.pipe_dirs_dict[self.flowcell]
        return lane.pipe_dirs_head+"/"+dir

    def set_default_refdict(self):
        self.refdict = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict";
        #print "WARNING: Using standard Hg18 reference dictionary (refdict) for",self

    def set_default_metrics(self):
        self.metrics = "/broad/1KG/legacy_data/tgtc2sam/unknown.metrics"
        #print "WARNING: Using default unkonwn.metrics file for",self

    def refdict_metrics(self):
        "Ensures that a reference dictionary and metrics are available"
        # Returns True when usable refdict/metrics were set (possibly the
        # defaults); False only when a PhiX dictionary was detected.
        try:
            pipe_lane_dir = self.pipe_dir()
        except KeyError:
            # No pipeline directory for this flowcell: fall back to defaults.
            self.set_default_refdict()
            self.set_default_metrics()
            return True

        self.metrics = pipe_lane_dir+"/"+str(self.flowlane)+".metrics"
        if os.path.exists(self.metrics):
            # Pull the reference FASTA path out of the metrics file and
            # derive the .dict path from it.
            line = commands.getoutput("grep '^reference=' "+pipe_lane_dir+"/"+self.flowlane+".metrics")
            if line != "":
                reference_fasta = line.split("=")[1]
                self.refdict = os.path.splitext(reference_fasta)[0]+'.dict'
                if "PhiX" in self.refdict:
                    failed_list.append(FailedConversion(self, "PhiX dictionary missing"))
                    return False
            else:
                self.set_default_refdict()
            return True # We were able to generate refdict and metrics files
        else:
            self.set_default_refdict()
            self.set_default_metrics()
            return True # We were able to generate refdict file and substituted unknown.metrics as the metrics file

        #failed_list.append(FailedConversion(self, "Could not find metrics file"))
        #return False

    def __str__(self):
        return self.fdl
|
||||
|
||||
def usage():
    """Print the command-line help for tgtc_sam.py and exit with status 1."""
    print "Usage: tgtc_sam.py <command> <command_args>..."
    print
    print " Works from tgtc files which contain fastb, qualb, qltout filenames corresponding to one lane of data"

    print " Commands and command arguments:"
    print " convert tgtc_list - given a tgtc file converts files to SAM and BAM"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " [clean] - optional 3rd argument that will create SAM/BAM files"
    print " regardless of whether they already exist"
    print
    print " merge tgtc output_file - merges converted BAM files"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " output_file - path and name of merged BAM file"
    print
    print " lists list_of_tgtc_lists - run over a list of tgtc lists given in a file"
    print " list_of_tgtc_files - file with tgtc filenames in it"
    print
    print " Global options:"
    print " print_only: do not run commands, just print what they would be "
    sys.exit(1)
|
||||
|
||||
def make_lane_list(tgtc_file):
    """Parse a tgtc file into (lane, lane-or-None) pairs.

    Lanes whose refdict/metrics cannot be resolved are dropped.  Unpaired
    lanes are returned as (lane, None); paired-end lanes are zipped by
    position, relying on end1/end2 lines appearing in the same order.
    """
    all_lanes = [lane(l) for l in open(tgtc_file) if not l.startswith("#")]
    lanes = []
    for ln in all_lanes:
        if ln.refdict_metrics(): # if we were able to generate refdict and metrics
            # append this lane to the lanes list that we will work with
            lanes.append(ln)
    # Pairing is inferred from ".end1."/".end2." markers in the qualb path.
    unpaired = [l for l in lanes if ".end1." not in l.qualb and ".end2." not in l.qualb]
    paired1 = [l for l in lanes if ".end1." in l.qualb]
    paired2 = [l for l in lanes if ".end2." in l.qualb]

    print "Unpaired lanes:",len(unpaired)
    print "Paired 1 lanes:",len(paired1)
    print "Paired 2 lanes:",len(paired2)
    assert len(paired1) == len(paired2)

    return [(u, None) for u in unpaired] + zip(paired1, paired2)
|
||||
|
||||
def convert_lanes(tgtc_file, recreate_all=False, just_print_commands=False):
|
||||
lanes_done = 0
|
||||
lane_list = make_lane_list(tgtc_file)
|
||||
print "Beginning processing of these lanes (unpaired and paired):"
|
||||
print "\n".join(["%s %s" % (p1,p2) for p1,p2 in lane_list])
|
||||
|
||||
for p1, p2 in lane_list:
|
||||
if recreate_all or not os.path.exists(p1.sam) or not os.path.getsize(p1.sam):
|
||||
|
||||
# Filer reads with > 4 errors
|
||||
cmd_str = "\"FilterAlignsILTStyle QLTIN="+p1.qltout+" QLTOUT="+p1.fqltout
|
||||
|
||||
if p2 == None:
|
||||
# This is an UNPAIRED LANE
|
||||
print "Processing unpaired lane",p1
|
||||
cmd_str += (" && Qltout2SAM"+
|
||||
" HEAD="+p1.head+
|
||||
" SAM="+p1.sam+
|
||||
" QLTOUT="+p1.fqltout+
|
||||
" FASTB="+p1.fastb+
|
||||
" QUALB="+p1.qualb+
|
||||
" REFDICT="+p1.refdict+
|
||||
" METRICS="+p1.metrics+
|
||||
" ALIGNER=PairFinder"+
|
||||
" TMPDIR=/broad/hptmp/andrewk")
|
||||
else:
|
||||
# This is a PAIRED LANE
|
||||
print "Processing paired lanes",p1,p2
|
||||
if p1.flowcell != p2.flowcell or p1.lane != p2.lane:
|
||||
print "### WARNING ### paired lanes ",p1,p2,"do not match! Skipping."
|
||||
continue
|
||||
# Filter 2nd Qltout file too
|
||||
cmd_str += " && FilterAlignsILTStyle QLTIN="+p2.qltout+" QLTOUT="+p2.fqltout
|
||||
cmd_str += (" && QltoutPair2SAM"+
|
||||
" HEAD="+p1.head+
|
||||
" SAM="+p1.sam+
|
||||
" QLTOUT1="+p1.fqltout+" QLTOUT2="+p2.fqltout+
|
||||
" FASTB1="+p1.fastb+" FASTB2="+p2.fastb+
|
||||
" QUALB1="+p1.qualb+" QUALB2="+p2.qualb+
|
||||
" REFDICT="+p1.refdict+
|
||||
" METRICS="+p1.metrics+
|
||||
" ALIGNER=PairFinder"+
|
||||
" TMPDIR=/broad/hptmp/andrewk")
|
||||
|
||||
# Make the BAM file
|
||||
cmd_str += " && /seq/dirseq/samtools/current/samtools import "+lane.bam_ref_list+" "+p1.sam+" "+p1.bam+"\""
|
||||
# Now remove one or both SAM files if all previous commands returned 0 for success
|
||||
cmd_str += " && rm -f "+p1.sam
|
||||
if p2 != None: cmd_str += " && rm -f "+p2.sam
|
||||
|
||||
status = cmd(cmd_str, "long", output_head=p1.path_head, just_print_commands=just_print_commands)
|
||||
if status == 0:
|
||||
lanes_done += 1
|
||||
else:
|
||||
failed_list.append("Unkown failure")
|
||||
#if lanes_done + len(failed_list) > 0:
|
||||
#break
|
||||
|
||||
failed = len(failed_list)
|
||||
lanes_total = lanes_done+failed
|
||||
percent_done = lanes_done / lanes_total if lanes_done != 0 else 0
|
||||
print "Lane conversion success: %d/%d (%.1f%%)" % (lanes_done, lanes_total, percent_done)
|
||||
|
||||
def merge_bams(tgtc_file, outfile, just_print_commands=False):
|
||||
input_lane_list = make_lane_list(tgtc_file)
|
||||
#lane_list = [lp for lp in lane_list if lp[0].flowcell =="30LLT"]
|
||||
missing_files = False
|
||||
lane_list = []
|
||||
for p1,p2 in input_lane_list:
|
||||
if "adapted" not in p1.qualb or (p2 != None and "adapted" not in p2.qualb):
|
||||
print "NOT ADAPTED", p1.qualb, p2.qualb
|
||||
if not os.path.exists(p1.bam):
|
||||
#print "Missing file:",p1.bam
|
||||
missing_files = True
|
||||
else:
|
||||
lane_list.append( (p1,p2) )
|
||||
|
||||
#return
|
||||
if not missing_files:
|
||||
if not os.path.exists(outfile):
|
||||
print len(lane_list),"lanes / lane pairs to merge"
|
||||
if not os.path.exists(os.path.dirname(outfile)):
|
||||
os.makedirs(os.path.dirname(outfile))
|
||||
|
||||
cmd("java -cp /home/radon01/depristo/dev/workspace/Libraries/bin"+
|
||||
" edu.mit.broad.picard.sam.MergeSamFiles"+
|
||||
" "+" ".join(["I="+p1.bam for p1,p2 in lane_list])+
|
||||
" O="+outfile+
|
||||
" TMP_DIR=/broad/hptmp/andrewk/"+
|
||||
" && /seq/dirseq/samtools/current/samtools index "+outfile+
|
||||
" && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
|
||||
"long", outfile, just_print_commands=just_print_commands)
|
||||
|
||||
# Original samtools merge command below that was used for rev. 1 merging
|
||||
|
||||
#cmd("/seq/dirseq/samtools/current/samtools merge "+outfile+
|
||||
# " "+" ".join([p1.bam for p1,p2 in lane_list])+
|
||||
# " && /seq/dirseq/samtools/current/samtools index "+outfile+
|
||||
# " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
|
||||
# "long", outfile)
|
||||
|
||||
else:
|
||||
# Convert the lanes that had missing outfiles
|
||||
#convert_lanes(tgtc_file)
|
||||
pass
|
||||
|
||||
def process_tgtc_lists(tgtc_list_file):
    "Works through a list of tgtc files and merges them to produce the output"
    # Each non-comment line is "<output-head> <tgtc-list-path>"; any other
    # shape is silently skipped.
    lists_done = 0

    for line in open(tgtc_list_file):
        if line[0] == '#':
            continue
        fields = line.split()
        if len(fields) != 2:
            continue
        head, tgtc_list = fields
        bam = head+".bam"
        # NOTE(review): the existence check is disabled -- every list is
        # reprocessed even when the merged BAM already exists.
        if True: #not os.path.exists(bam):
            print "Processing tgtc_list_file",tgtc_list,"with goal of producing",bam
            merge_bams(tgtc_list, bam)
            print
        else:
            print bam,"already exists\n"

        lists_done += 1
        #if lists_done > 0:
        #    break
|
||||
|
||||
# Command dispatcher: tgtc_sam.py <convert|merge|lists> [args] [print_only]
if __name__ == "__main__":
    if len(sys.argv) < 2:
        usage()
    operation = sys.argv[1]

    # Check for print commands only option (may appear anywhere in argv;
    # it is removed so positional indexing below stays valid).
    if "print_only" in sys.argv:
        just_print_commands = True
        sys.argv.remove("print_only")
    else:
        just_print_commands = False

    if operation == "convert":
        if len(sys.argv) < 3:
            usage()
        tgtc_file = sys.argv[2]
        # Run convert_lanes and recreate_all == True if clean option was given as 3rd arg
        convert_lanes(tgtc_file, len(sys.argv) > 3 and sys.argv[3] == "clean", just_print_commands=just_print_commands)
    elif operation == "merge":
        if len(sys.argv) < 4:
            usage()
        tgtc_file = sys.argv[2]
        outfile = sys.argv[3]
        merge_bams(tgtc_file, outfile, just_print_commands=just_print_commands)
    elif operation == "lists":
        # Default list location used when no explicit file is given.
        tgtc_list_file = "/broad/1KG/legacy_data/tgtc2sam/tgtc_lists_2_legacy_bams"
        if len(sys.argv) >= 3:
            tgtc_list_file = sys.argv[2]
        process_tgtc_lists(tgtc_list_file)
    else:
        print "Didn't understand operation value: "+operation
        usage()
|
||||
Loading…
Reference in New Issue