diff --git a/python/1kgScripts/runme_freeze5.1.csh b/python/1kgScripts/runme_freeze5.1.csh deleted file mode 100755 index 05d741445..000000000 --- a/python/1kgScripts/runme_freeze5.1.csh +++ /dev/null @@ -1,3 +0,0 @@ -python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation/ -q gsa /broad/1KG/DCC_merged/lists/low_coverage_freeze5.list -n /broad/1KG/DCC_merged/lists/naids_and_pop.txt -python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s /broad/1KG/DCC_merged/lists/trios_freeze5.list - diff --git a/python/AlignBam.py b/python/AlignBam.py deleted file mode 100755 index 1c27789b7..000000000 --- a/python/AlignBam.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python - -import os,sys -from farm_commands import cmd - -def run_paired_BWA(bam): - "Takes a paired end BAM file and outputs an aligned '.aligned.bam' file " - lane = 0 - root = os.path.splitext(bam)[0] - - # BAM to BFQ - cmd("/seq/software/picard/1.21/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.") - - # BFQ to FQ - root1 = root+"."+str(lane)+".0.1" - root2 = root+"."+str(lane)+".0.2" - bfq1 = root1+".bfq" - bfq2 = root2+".bfq" - fq1 = root1+".fq" - fq2 = root2+".fq" - - cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1) - cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2) - - # Run BWA - sai1 = root1+".sai" - sai2 = root2+".sai" - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1) - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2) - - # Pairing - alignedsam = root+".aligned.sam" - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam) - - # SAM to BAM - bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list" - alignedbam = root+".aligned.bam" - cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam) - -if __name__ == "__main__": - if len(sys.argv) < 2: - print "Usage: AlignBam.py BAM_FILE" - sys.exit() - bam = sys.argv[1] - run_paired_BWA(bam) - - diff --git a/python/AlignBams.py b/python/AlignBams.py deleted file mode 100644 index ae8845e23..000000000 --- a/python/AlignBams.py +++ /dev/null @@ -1,41 +0,0 @@ -import os,sys -from farm_commands import cmd - -def run_BWA(bam, paired=True): - "Takes a paired end BAM file and outputs an aligned '.aligned.bam' file " - lane = 0 - root = os.path.splitext(bam)[0] - print root - - # BAM to BFQ - cmd("/seq/software/picard/current/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.") - - # BFQ to FQ - root1 = root+"."+str(lane)+".0.1" - root2 = root+"."+str(lane)+".0.2" - bfq1 = root1+".bfq" - bfq2 = root2+".bfq" - fq1 = root1+".fq" - fq2 = root2+".fq" - - cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1) - cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2) - - # Run BWA - sai1 = root1+".sai" - sai2 = root2+".sai" - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1) - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2) - - # Pairing - alignedsam = root+".aligned.sam" - cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam) - - # SAM to BAM - bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list" - alignedbam = root+".aligned.bam" - cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam) - -if __name__ == "__main__": - bamfiles = sys.argv[1:] - diff --git a/python/BarcodeAnalysis.py b/python/BarcodeAnalysis.py deleted file mode 100755 index c322f8d8c..000000000 --- a/python/BarcodeAnalysis.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/python - -import os,sys -from farm_commands import cmd - -def get_line_number_from_file(filename, line_num): - for index, line in enumerate(open(filename)): - if index == line_num: - return line - -if __name__ == "__main__": - - from optparse import OptionParser - class OptionParser (OptionParser): - def check_required (self, opts): - for opt in opts: - option = self.get_option(opt) - - # Assumes the option's 'default' is set to None! - if getattr(self.values, option.dest) is None: - self.error("%s option not supplied" % option) - - parser = OptionParser() - parser.add_option("-q", "--queue", help="Queue to submit jobs to (default: long)", dest="queue", default="long") - parser.add_option("-l", "--lane", help="Lane number to process", dest="lane", default="") - parser.add_option("-t", "--tmpdir", help="Temp directory to use", dest="tmpdir") - parser.add_option("-b", "--barcodes", help="File with barcodes as \"Id[TABa]barcode_sequence\" with one header line", dest="barcodes") - parser.add_option("-a", "--baits", help="Bait interval list (default: 1KG bait list)", dest="ti", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.baits.interval_list") - parser.add_option("-r", "--targets", help="Target interval list (default: 1KG target list)", dest="ti", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list") - (options, args) = parser.parse_args() - parser.check_required(("-l","-t","-b")) - - solexa_dir = "." - - barcodes = [] - barcode_file = open(options.barcodes) #"../barcode_sequences.txt") - barcode_file.readline() - for line in barcode_file: - fields = line.split() - barcodes.append(fields[1]) - - farm = options.queue - lane = options.lane - alias = "lane"+str(lane) - tmp_dir = options.tmpdir #"/humgen/gsa-scr1/andrewk/tmp/" - - cmd("/seq/software/picard/current/bin/ExtractIlluminaBarcodes.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 METRICS_FILE=metrics.out BARCODE=" + " BARCODE=".join(barcodes)) - - cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" O="+alias+".bam SAMPLE_ALIAS="+alias+" RUN_BARCODE="+alias+" TMP_DIR="+tmp_dir)#, farm_queue=farm) #PAIRED_RUN=True - - for barcode in barcodes: - id1 = cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 BARCODE="+barcode+" O="+barcode+".bam RUN_BARCODE="+barcode+" SAMPLE_ALIAS=lane5 TMP_DIR="+tmp_dir, farm_queue=farm) #PAIRED_RUN=True - - id2 = cmd( "/home/radon01/andrewk/dev/Sting/trunk/python/AlignBam.py "+barcode+".bam", farm_queue=farm, waitID = id1) - id3 = cmd("/seq/software/picard/1.21/bin/MergeSamFiles.jar I="+barcode+".aligned.bam O="+barcode+".aligned.merged.bam VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id2) - id4 = cmd("java -Xmx4096m -jar /seq/software/picard/1.21/bin/MarkDuplicates.jar VALIDATION_STRINGENCY=SILENT I= "+barcode+".aligned.merged.bam O="+barcode+".aligned.duplicates_marked.bam M="+barcode+".aligned.bam.duplicates_marked.bam.duplicate_metrics TMP_DIR="+tmp_dir, farm_queue=farm, waitID=id3) - cmd("java -jar /seq/software/picard/current/bin/CalculateHsMetrics.jar BI="+options.bait_list+" TI="+options.target_list+" I="+barcode+".aligned.duplicates_marked.bam M="+barcode+".hs_metrics VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id4) - - - fout = open("hybrid_selection.metrics", "w") - print >> fout, "BARCODE\t"+get_line_number_from_file(barcodes[0]+".hs_metrics", 6).rstrip() - for barcode in barcodes: - print >> fout, barcode+"\t"+get_line_number_from_file(barcode+".hs_metrics", 7).rstrip() - diff --git a/python/CoverageEval.py b/python/CoverageEval.py deleted file mode 100755 index f8a2ae1e4..000000000 --- a/python/CoverageEval.py +++ /dev/null @@ -1,357 +0,0 @@ -#!/usr/bin/env python - -import sys, itertools, FlatFileTable -from enum import Enum - -def subset_list_by_indices(indices, list): - subset = [] - for index in indices: - subset.append(list[index]) - return subset - -def chunk_generator(record_gen, key_fields): - """Input: - record_gen: generator that produces dictionaries (records in database speak) - key_fields: keys in each dictionary used to determine chunk membership -Output: - locus_chunk: list of consecutive lines that have the same key_fields""" - - locus_chunk = [] - last_key = "" - first_record = True - for record in record_gen: - key = [record[f] for f in key_fields] - if key == last_key or first_record: - locus_chunk.append(record) - first_record = False - else: - if locus_chunk != []: - yield locus_chunk - locus_chunk = [record] - last_key = key - yield locus_chunk - - -class call_stats: - def __init__(self, call_type, coverage): #, acc_conf_calls, conf_call_rate, cum_corr_calls, cum_calls, coverage): - self.call_type = call_type - self.coverage = coverage - self.calls = 0 - self.conf_ref_calls = 0 - self.conf_het_calls = 0 - self.conf_hom_calls = 0 - self.conf_var_calls = 0 - self.conf_genotype_calls = 0 - self.conf_refvar_calls = 0 - self.correct_genotype = 0 - self.correct_refvar = 0 - - - def add_stat(self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar): - #print self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar - self.calls += calls - self.conf_ref_calls += conf_ref_calls - self.conf_het_calls += conf_het_calls - self.conf_hom_calls += conf_hom_calls - self.conf_var_calls += conf_var_calls - self.conf_genotype_calls += conf_genotype_calls - self.conf_refvar_calls += conf_refvar_calls - self.correct_genotype += correct_genotype - self.correct_refvar += correct_refvar - - @staticmethod - def calc_discovery_stats(chunk): - conf_calls = 0 - correct_genotype = 0 - calls = 0 - for record in chunk: - calls += 1 - if float(record["BtrLod"]) >= 5 or call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"]) == call_type.call_types_2_state.Reference and float(record["BtnbLod"]) >= 5: - conf_calls += 1 - if call_type.discovery_call_correct(record): - correct_genotype += 1 - - return correct_genotype, conf_calls, calls - - @staticmethod - def calc_genotyping_stats(chunk): - conf_calls = 0 - correct_genotype = 0 - calls = 0 - for record in chunk: - calls += 1 - if float(record["BtnbLod"]) >= 5: - conf_calls += 1 - if call_type.genotyping_call_correct(record): - correct_genotype += 1 - - return correct_genotype, conf_calls, calls - - @staticmethod - def stats_header(): - return "TrueGenotype,Coverage,AccuracyConfidentGenotypingCalls,ConfidentGenotypingCallRate,AccuracyConfidentDiscoveryCalls,ConfidentDiscoveryCallRate,Calls,ConfRefCalls,ConfHetCalls,ConfHomCalls,ConfGenotypeCalls,CorrectGenotypes,ConfVarCalls,ConfDiscoveryCalls,CorrectDiscovery" - - def __str__(self): - return ",".join(map(str, (self.calls, self.conf_ref_calls, self.conf_het_calls, self.conf_hom_calls, self.conf_genotype_calls, self.correct_genotype, self.conf_var_calls, self.conf_refvar_calls, self.correct_refvar))) - - def stats_str(self): - return "%s,%d,%.5f,%.5f,%.5f,%.5f,%s" % (self.call_type, self.coverage, float(self.correct_genotype) / max(self.conf_genotype_calls,1), float(self.conf_genotype_calls) / max(self.calls,1), float(self.correct_refvar) / max(self.conf_refvar_calls,1), float(self.conf_refvar_calls) / max(self.calls,1), self.__str__()) - -class call_type: - """Class that returns an Enum with the type of call provided by a record""" - call_types_3_state = Enum("HomozygousSNP","HeterozygousSNP","HomozygousReference") - call_types_3_state_short = Enum("Hom","Het","Ref") - call_types_2_state = Enum("Variant","Reference") - - @staticmethod - def from_record_3_state(ref, genotype): # record): - """Given reference base as string, determine whether called genotype is homref, het, homvar""" - #ref = record["ReferenceBase"][0] - #genotype = record["HapmapChipGenotype"] - return call_type.call_types_3_state[genotype.count(ref)] - - @staticmethod - def from_record_2_state(ref, genotype): - """Given reference base as string, determine whether called genotype is ref or var""" - #ref = record["ReferenceBase"][0] - #genotype = record["HapmapChipGenotype"] - return call_type.call_types_2_state[0] if genotype.count(ref) < 2 else call_type.call_types_2_state[1] - - @staticmethod - def genotyping_call_correct(record): - return record["HapmapChipGenotype"] == record["BestGenotype"] - - - @staticmethod - def discovery_call_correct(record): - return call_type.from_record_2_state(record["ReferenceBase"][0], record["HapmapChipGenotype"]) == call_type.from_record_2_state(record["ReferenceBase"][0], record["BestGenotype"]) - -def print_record_debug(rec): - print "TEST Genotyping:", call_type.genotyping_call_correct(rec) - print "TEST Discovery:", call_type.discovery_call_correct(rec) - print "DIFF:", call_type.genotyping_call_correct(rec) != call_type.discovery_call_correct(rec) - print "\n".join([" %20s => '%s'" % (k,v) for k,v in sorted(rec.items())]) - print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["HapmapChipGenotype"]) - print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["BestGenotype"]) - print - -def aggregate_stats(filename, max_loci, table_filename, debug): - aggregate = dict() - fout = open(table_filename,"w") - fout.write(call_stats.stats_header()+"\n") - - locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position")) - for index, locus_chunk in enumerate(locus_gen): - if index >= max_loci: - break - if (index % 1000) == 0: - sys.stderr.write( str(index)+" loci processed, at: "+locus_chunk[0]["Sequence"]+":"+locus_chunk[0]["Position"]+"\n") - - covs = dict() - coverage_chunk_gen = chunk_generator(locus_chunk, ("DownsampledCoverage", "Sequence", "Position")) - for cov_chunk in coverage_chunk_gen: - first_record = cov_chunk[0] - #stat = call_stats.calc_stats(cov_chunk) - - for record in cov_chunk: - hapmap_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["HapmapChipGenotype"]) - key = hapmap_genotyping_call_type, int(record["DownsampledCoverage"]) - #key = call_type.from_record_3_state(record)#, int(record["DownsampledCoverage"]) - record["DownsampledCoverage"] = int(record["DownsampledCoverage"]) - record["HapmapChipCallType"] = hapmap_genotyping_call_type - value = record - - correct_genotype, conf_genotype_calls, genotype_calls = call_stats.calc_genotyping_stats([record]) - correct_refvar, conf_refvar_calls, refvar_calls = call_stats.calc_discovery_stats([record]) - assert(genotype_calls == refvar_calls) - - conf_ref_calls = 0 - conf_het_calls = 0 - conf_hom_calls = 0 - best_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["BestGenotype"]) - if conf_genotype_calls: - if best_genotyping_call_type.index == 0: conf_hom_calls = 1 - if best_genotyping_call_type.index == 1: conf_het_calls = 1 - if best_genotyping_call_type.index == 2: conf_ref_calls = 1 - - conf_var_calls = 0 - if conf_refvar_calls: - this_variant_call_type = call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"]) - conf_var_calls = 1 if this_variant_call_type.index == 0 else 0 - - aggregate.setdefault(key, call_stats(*key)) - #print ",".join(map(str,(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar))) - aggregate[key].add_stat(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar) - - if debug:# and conf_refvar_calls: - print "KEYS:",key - print_record_debug(record) - - - for key, records in sorted(aggregate.items()): - fout.write(records.stats_str()+"\n") - - fout.close() - #return aggregate - -class weighted_avg: - - def __init__(self): - self.sum = 0.0 - self.count = 0 - - def add(self, value, counts): - self.sum += value*counts - self.count += counts - - def return_avg(self): - return float(self.sum) / max(self.count,1) - -def stats_from_hist(options, depth_hist_filename, stats_filename, variant_eval_dir, depth_multiplier=1.0): - - #hist_zero = {"CallType" : ,"Coverage","AccuracyConfidentCalls","ConfidentCallRate","CumCorrectCalls","CumCalls","CumgCorrectFraction"} - #prob_genotype = [1e-5, 1e-3, 1-1e-3] - #prob_genotype = [0.37, 0.62, .0] - #prob_genotype = [0.203, 0.304, .491] - #prob_genotype = [0.216, 0.302, .481] - prob_genotype = [0.205, 0.306, 0.491] # Based on CEU NA12878 actual hapmap chip calls - #prob_genotype = [0.213, 0.313, 0.474] # Based on YRB NA19240 actual hapmap chip calls - theta = 1.0/1850 # expected heterozygosity - prob_genotype = [0.5 * theta, 1.0 * theta, 1 - 1.5*theta] # Based on CEU NA12878 actual hapmap chip calls - - - hist = [] - hist_gen = FlatFileTable.record_generator(depth_hist_filename, sep=" ", skip_until_regex_line = "^depth count freq") - for index, record in enumerate(hist_gen): - assert int(record["depth"]) == index - hist.append(int(record["count"])) - - # If upsampling is not done in the CoverageEval GATK module, the number of observations of reads - # with high depth of coverage can be very low giving - - stats_dict = dict() - stats_gen = FlatFileTable.record_generator(stats_filename, sep=",") - for record in stats_gen: - key1 = int(record["Coverage"]) - key2 = record["TrueGenotype"] - #print key2 - stats_dict.setdefault(key1, dict()) # create a nested dict if it doesn't exist - stats_dict[key1][key2] = record # create an entry for these keys - - #highest_homref_calls = 0 - #if record["TrueGenotype"] == "HomozygousReference": - # calls = int(record["Calls"]) - # if calls > highest_homref_calls: - # highest_homref_calls = calls - - acc = dict() - call_rate = dict() - conf_calls = dict() - - start = 1 - end = 1000 - max_usable_depth = 40 # Depth of coverage beyond which stats are not sampled enough and we take the stat at this depth instead - for depth, depth_count in enumerate(hist[start:end+1],start): # For Cd = depth count - #print "DEPTH: "+str(depth) - try: - depth = max(int(float(depth*depth_multiplier)),1) - if depth > max_usable_depth: # Ensure that high entries with bad stats use a good stat from a depth that we now is well sampled - depth = max_usable_depth - depth_entries = stats_dict[depth] - except KeyError: - print "Stopped on depth",depth - break - if True: - for true_genotype, stat in depth_entries.items(): # For t (SNP type) = true_genotype - #print "TRUE_GENOTYPE: "+str(true_genotype) - for genotype in call_type.call_types_3_state: - conf_calls.setdefault(genotype, 0.0) - prob_conf_x_call = float(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"])/float(stat["Calls"]) - conf_calls[genotype] += depth_count * prob_conf_x_call * prob_genotype[genotype.index] - #if genotype.index == 1: - # print "%.5f " % prob_conf_x_call, depth, depth_count, conf_calls[genotype], int(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"]), int(stat["Calls"]) - - acc.setdefault(true_genotype,weighted_avg()) - call_rate.setdefault(true_genotype,weighted_avg()) - acc[true_genotype].add(float(stat["AccuracyConfidentGenotypingCalls"]), depth_count) - call_rate[true_genotype].add(float(stat["ConfidentGenotypingCallRate"]), depth_count) - - import numpy - for genotype in call_type.call_types_3_state: - print "%19s accuracy : %.3f" % (str(genotype), acc[str(genotype)].return_avg()) - print "%19s call rate: %.3f" % (str(genotype), call_rate[str(genotype)].return_avg()) - - print "\nExpected performance given perfect accuracy and call rate:" - print "%19s %7s %7s %7s" % ("", "Actual", "Perfect", "Diff.") - total_hist_sites = numpy.sum(hist) - total_predicted = 0 - for genotype in call_type.call_types_3_state: - predicted = conf_calls[genotype] - total_predicted += predicted - perfect = prob_genotype[genotype.index]*total_hist_sites - diff = perfect - predicted - percent_of_possible = predicted / perfect * 100 - print "%19s calls: %8.0f %8.0f %8.0f %8.1f" % (genotype, predicted, perfect, diff, percent_of_possible) - #repl_string += "%s %.0f\n" % (genotype, predicted) - print " Total calls: %7d" % total_predicted - - #stats_gen = FlatFileTable.record_generator(stats_filename, sep=",") - #for chunk in chunk_generator(stats_gen, key_fields=("True_Genotype")): - - print "\nCoverage histogram mean: %.2f" % numpy.average(range(len(hist)), weights=hist) - - # STEM AND LEAF PLOT - - # If VariantEval directory given, compare with those results - if options.variant_eval_file != None: - vareval_file = open(options.variant_eval_file) - num_hets = None; num_homs = None - for line in vareval_file: - if "UNKNOWN_CALLED_VAR_HET_NO_SITES" in line: num_hets = line.rstrip().split()[2] - if "UNKNOWN_CALLED_VAR_HOM_NO_SITES" in line: num_homs = line.rstrip().split()[2] - - if options.oneline_stats != None: - oneline_stats = open(options.oneline_stats, "w") - pred_hom = conf_calls[call_type.call_types_3_state[0]] - pred_het = conf_calls[call_type.call_types_3_state[1]] - print >>oneline_stats, "%s %.0f %s %.0f %s" % (options.oneline_stats, pred_hom, num_homs, pred_het, num_hets) - -def usage(parser): - parser.print_usage() - sys.exit() - -if __name__ == "__main__": - from optparse import OptionParser - - parser = OptionParser() - parser.add_option("-c", "--genotype_call_file", help="GELI file to use in generating coverage stat table", dest="genotype_filename", ) - parser.add_option("-s", "--stats_file", help="file to containing empirical genotyping stats", dest="stats_filename", default=None) - parser.add_option("-g", "--histogram_file", help="file containing counts of each depth of coverage", dest="hist_filename") - parser.add_option("-m", "--max_loci", help="maximum number of loci to parse (for debugging)", default=sys.maxint, dest="max_loci", type="int") - parser.add_option("-e", "--evaluate", help="evaluate genotypes; requires a stats file and a histogram file", default=False, dest="evaluate_genotypes", action="store_true") - parser.add_option("-d", "--debug", help="provide debugging output", default=False, dest="debug", action="store_true") - parser.add_option("-p", "--depth_multiplier", help="multiply all depths in histogram by this value; for \"downsampling\" depth", default=1.0, dest="depth_multiplier", type=float) - parser.add_option("-v", "--variant_eval_file", help="file with output of VariantEval genotype concordance to compare this prediction to", default=None, dest="variant_eval_file") - parser.add_option("-o", "--oneline_stats_file", help="output single, tabular line of stats to this file", default=None, dest="oneline_stats") - - (options, args) = parser.parse_args() - - if not options.evaluate_genotypes: - print "Creating performance tables from genotypes file" - #if options.stats_filename == None: - # sys.exit("Must provide -s stats fliname option") - if options.genotype_filename == None: - sys.exit("Must provide -c genotype call filename option") - stats_filename = options.stats_filename if options.stats_filename != None else options.genotype_filename+".stats" - aggregate_stats(options.genotype_filename, options.max_loci, options.stats_filename, options.debug) - else: - print "Evaluating genotypes" - print "Depth multiplier:",options.depth_multiplier - if options.hist_filename == None: - sys.exit("Must provide -g histogram filename option") - if options.stats_filename == None: - sys.exit("Must provide -s stats fliname option") - stats_from_hist(options, options.hist_filename, options.stats_filename, options.variant_eval_file, options.depth_multiplier) - - - diff --git a/python/CoverageMeta.py b/python/CoverageMeta.py deleted file mode 100755 index d37da2e81..000000000 --- a/python/CoverageMeta.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from farm_commands import cmd - -#if __name__ == "__main__": -from optparse import OptionParser -class OptionParser (OptionParser): - - def check_required (self, opts): - for opt in opts: - option = self.get_option(opt) - - # Assumes the option's 'default' is set to None! - if getattr(self.values, option.dest) is None: - self.error("%s option not supplied" % option) - -def exists_and_non_zero(file): - return os.path.exists(file) and os.path.getsize(file) != 0 - -def process_bam(bam_file): - gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar " - output_basepath = os.path.splitext(bam_file)[0] - output_basefile = os.path.basename(output_basepath) - - print "Input BAM file:",bam_file - - using_hapmapchip = None - if False: - # Config for chr1 of pilot 2 people - ref = " -R /broad/1KG/reference/human_b36_both.fasta" - #interval_string = "1" - - # Assure existance of intervals file - intervals_file = output_basepath+".hapmap.intervals.chr1" - if not exists_and_non_zero(intervals_file): - Cmd = "awk -F' ' '{ if ($1 == \""+interval_string+"\") {print $1 \":\" $4} }' "+hapmapchip_file+" > "+intervals_file - cmd(Cmd, die_on_fail = True) - - # Setup hapmapchip GFFs; assure that hapmap chip file is here - using_hapmapchip = True - hapmapchip_file = output_basepath+".hapmapchip.gff" - if not exists_and_non_zero(hapmapchip_file): - sys.exit("Expected to find "+hapmapchip_file+" but it's not around") - - else: - # Config for pilot 3 Broad data - #ref = " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" - #intervals_file = "/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list" - - # Config for pilot 3 DCC data (12/16/09 r. 2375) - ref = " -R /broad/1KG/reference/human_b36_both.fasta" - intervals_file = "/humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list" - - # Assure that BAM index exists - bam_index_file = output_basepath+".bam.bai" - if not exists_and_non_zero(bam_index_file): - Cmd = "samtools index "+bam_file - cmd(Cmd, die_on_fail = True) - - # Run UnifiedGenotyper calls - vcf_outfile = output_basepath+".calls.vcf" - if not exists_and_non_zero(vcf_outfile): - print "Running UnifiedGenotyper" - Cmd = gatk_exe + ref + " -T UnifiedGenotyper"+\ - " -I "+bam_file+\ - " -varout "+vcf_outfile+\ - " -L "+intervals_file+\ - " -fmq0 -l INFO " - #" --genotype -lod 5" - - #+interval_string+\ - cmd(Cmd, die_on_fail=True) - - # Run VariantEval on UnifiedGenotyper calls - varianteval_outfile = output_basepath+".varianteval" - if not exists_and_non_zero(varianteval_outfile): - print "Running VariantEval with output to "+varianteval_outfile - Cmd = gatk_exe + ref + " -T VariantEval"+\ - " -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\ - " -B eval,VCF,"+ssg_outfile+\ - " -o "+varianteval_outfile+\ - " -L "+intervals_file+\ - " -G -V"+\ - " --evalContainsGenotypes"+\ - " -minPhredConfidenceScore 150"+\ - " -fmq0 -l INFO" - if using_hapmapchip: - Cmd = Cmd + " -hc "+hapmapchip_file - - cmd(Cmd, die_on_fail=True) - - # Run CoverageHist on BAM file - hist_outfile = output_basepath+".hapmap.coverage_hist" - if not exists_and_non_zero(hist_outfile): - print "Running CoverageHist on "+bam_file - Cmd = gatk_exe + ref + " -T DepthOfCoverage"+\ - " -I "+bam_file+\ - " -o "+hist_outfile+\ - " -L "+intervals_file+\ - " -histogram --suppressLocusPrinting"+\ - " -fmq0 -l INFO" - - cmd(Cmd, die_on_fail=True)#,"gsa", output_basepath) - - # Now do CoverageEval predictions if they haven't been done - oneline_stats_file = output_basepath+".oneline_stats" - if not exists_and_non_zero(oneline_stats_file): - Cmd = "CoverageEval.py -e"+\ - " -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/current/NA12878.chr1.full.stats"+\ - " -g "+hist_outfile+\ - " -v "+varianteval_outfile+\ - " -o "+oneline_stats_file - cmd(Cmd, die_on_fail=True) - -if __name__ == "__main__": - parser = OptionParser() - parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None) - parser.add_option("-a", "--all_bams", help="Process all BAMs in current directory", default=False, dest="all_bams", action="store_true") - - (options, args) = parser.parse_args() - - if not options.all_bams: - parser.check_required(("-b",)) - process_bam(options.bam_file) - else: - bams = [f for f in os.listdir(".") if f.endswith(".bam")] - print bams - for bam in bams: - process_bam(bam) - #print - - - - - - - - - - \ No newline at end of file diff --git a/python/DOCParameter.py b/python/DOCParameter.py deleted file mode 100755 index b25a7cfca..000000000 --- a/python/DOCParameter.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/util/bin/python - -import math, string, sys - -for input_file in sys.argv[1:]: - - FILENAME = input_file - - # Read the depth of coverage distribution from file - depth = []; count = [] - for line in open(FILENAME): - try: - int(line[0]) - split = string.split(line,' ') - if int(split[1]) != 0: - depth.append(int(split[0])) - count.append(int(split[1])) - except ValueError: pass - - # Calculate the mode - mode = depth[count.index(max(count))] - - # Find the index of maximum extent of 'good' data - idx = depth.index(mode); dist = 10**9 - while abs(count[idx] - count[0]) < dist: - dist = abs(count[idx] - count[0]) - idx += 1 - model_count = count[:idx + 1] - - # Calculate mean & variance of 'good' data - model_tot = sum(model_count); adder = 0 - for i in range(len(model_count)): adder += depth[i]*model_count[i] - mean = adder/float(model_tot) - - adder = 0 - for i in range(len(model_count)): adder += model_count[i]*(mean - depth[i])**2 - var = adder/float(model_tot) - stdev = var**0.5 - - # Output Thresholds - print FILENAME - print 'Mean Depth of Coverage: ' + str(mean) - print 'Plus One Std Dev: ' + str(mean + stdev) - print 'Plus Two Std Dev: ' + str(mean + 2*stdev) - print 'Plus Three Std Dev: ' + str(mean + 3*stdev) - print 'Plus Four Std Dev: ' + str(mean + 4*stdev) - print 'Plus Five Std Dev: ' + str(mean + 5*stdev) diff --git a/python/EvalMapping.py b/python/EvalMapping.py deleted file mode 100755 index 0137c4d5f..000000000 --- a/python/EvalMapping.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python - -# Evaluates mapping results produced my RunMapper.py by comparig -# the result to the original simulated reads in a sam file - -import getopt, sys, os, re -from operator import attrgetter, itemgetter -from subprocess import Popen, STDOUT, PIPE -from SAM import SAMIO -from pushback_file import pushback_file -from qltout import qltout_file -from aln_file import aln_file - - -#!/usr/bin/env python - -import string, sys - -class SAMRecordWrapper: - def __init__(self, rec): - #print 'Rec', rec - self.rec = rec - - def id(self): return self.rec.getQName() - def contig(self): return self.rec.getRname() - def pos(self): return self.rec.getPos() - def offset(self): return self.pos()-1 - def map_qual(self): return self.rec.getMapq() - - def __str__(self): return str(self.rec) - -def SAMIOWrapper( file ): - print 'SAMIOWrapper', file - return SAMIO( file, debugging=True, func=SAMRecordWrapper) - -class mapper_info: - def __init__(self, name, ext, file_class): - self.name = name.lower() - self.ext = ext - self.file_class = file_class - -mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file), - mapper_info("merlin", "qltout", qltout_file), - mapper_info("swmerlin", "qltout", qltout_file), - mapper_info("bwa", "sam", SAMIOWrapper), - mapper_info("bwa32", "sam", SAMIOWrapper), - mapper_info("maq", "aln.txt", aln_file) ] -mapper_defs = dict( [(m.name, m) for m in mapper_defs] ) - -def Comp_sam_map(comp_head, mapper, file_output, sam_indel): - - # sam_indel is a +/- number corresponding to the size of insertions / deletions - # in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always - # translate to the SAM's cigar, but the full value should be used - - file_output_head = comp_head+"."+mapper.name - if file_output: - # if non-false, output to filename in file_output_head - # if "" or False, leave output going to stdout - #saveout = sys.stdout - feval = open(file_output_head+".eval", "w") - #sys.stdout = feval - - # create a file with a one-line tabular output of stats - # (for merging into Excel / R input) - fevaltab = open(file_output_head+".tab", "w") - - # if maq file, convert aln.map to aln.txt - aln_txt_filename = file_output_head+"."+mapper.ext - if mapper.name == "maq" and not os.path.exists(aln_txt_filename): - maq_exe = "/seq/dirseq/maq-0.7.1/maq" - - # SHOULD BE THIS - #cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" - - cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" - - print >> sys.stderr, "Executing "+cmd_str - fout = open(aln_txt_filename, "w") - p = Popen( cmd_str , shell=True, stdout=fout) - - map_filename = comp_head+"."+mapper.name+"."+mapper.ext - print 'map_filename', map_filename - map_file = mapper.file_class(map_filename) - - SAM_filename = comp_head+".sam" - sam_file = SAMIO(SAM_filename, debugging=True) - - #sams = [s for s in sam_file] - print "Reading map file..." - maps = [q for q in map_file] - - print "Reading SAM file..." - sams = {} - if mapper.name in ["maq", 'bwa', 'bwa32']: - for s in sam_file: - sams[s.getQName()] = s - else: - for i, s in enumerate(sam_file): - sams[i] = s - - #maps = {} - #for m in map_file: - # maps[m.id()] = m - #sams = [s for s in sam_file] - - class smq_t: - def __init__(self, sam, map, qual): - self.sam = sam # SAM record (exists for all reads) - self.map = map # Mapping record, may be None - self.qual = qual # Quality of mapping - - print "Making SMQ records..." - SMQ = [] - for maprec in maps: - samrec = sams.get(maprec.id()) # get mapping otherwise None - SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) ) - - print "Sorting..." - SMQ.sort(key=attrgetter('qual'), reverse=True) - - print "Evaluating placements..." - placed = 0 - incorrectly_placed = 0 - correctly_placed = 0 - fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" ) - if len(SMQ): - last_qual = SMQ[0].qual # grab 1st qual - for smq in SMQ: - if smq.sam == None: - continue - - if smq.qual != last_qual: - total_reads = len(sams)#placed + not_placed - not_placed = len(sams) - placed - fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) - #print "Wrote all reads with qual >= "+str(last_qual) - last_qual = smq.qual - - placed += 1 - - # Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases - # inserted / deleted - e.g. +1, -2 - #cigar = smq.sam.getCigar() - #indelled_bases = 0 - #match = re.search(r'(\d+)I', cigar) - #if match: - # indelled_bases += int(match.groups()[0]) - #match = re.search(r'(\d+)D', cigar) - #if match: - # indelled_bases -= int(match.groups()[0]) - - # We consider a read properly aligned if - # - both start positions are the same OR - # - the end position of the mapping = end position of the samq - #if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos(): - - try: - map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq - # are generally at the beginning of an alignment spanning the anchor base - except AttributeError: - map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care - # if its sam.py interface doesn't have access to indel data since there are - # no indels in its alignmnets - - if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos(): - #print >> feval, "Indelled SAM bases: "+str(indelled_bases) - try: - print >> feval, "Indelled map bases: "+str(map_indel_bases) - except AttributeError: - pass - print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos()) - print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos()) - print >> feval, "SAM record:\n"+str(smq.sam) - print >> feval, "Mapping record:\n"+str(smq.map)+"\n" - incorrectly_placed += 1 - else: - correctly_placed +=1 - - - #indexed_maps = [None] * len(sams) - #for q in maps: - # indexed_maps[q.id()] = q - - #def f(s,q): - # if q == None: - # return s - # else: - # return False - - #missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) ) - - - #for miss in missing: - # print miss - #not_placed = len(missing) - #not_placed = len(sams) - len(maps) - total_reads = len(sams) #placed + not_placed - not_placed = len(sams) - placed - - print >> feval, "Total reads : "+str(total_reads) - print >> feval, "+-Placed : "+str(placed) - print >> feval, "| +-Correct : "+str(correctly_placed) - print >> feval, "| +-Incorrect: "+str(incorrectly_placed) - print >> feval, "+-Not placed : "+str(not_placed) - - fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) - print "Wrote all reads with all quals" - - #if file_output_head: - # sys.stdout = saveout - -def remove_escape_codes(s): - import re - return re.sub(r'\x1b\[(\d\d)?\w', "", s) - -def usage(): - print "EvalMapping.py\n" - print "Required arguments:\n" - print " -h HEAD All input and output files start with HEAD" - print " --OR--" - print " " - print " i.e. ls *.sam | EvalMapping.py -m MAPPER" - print - print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)" - print - print "Optional arguments:\n" - print - print " -o Output to screen instead of HEAD.MAPPER.eval" - print - print "Input files:" - print " HEAD.sam" - print - print "Result files expected for Merlin and ILT:" - print " HEAD.MAPPER.qltout" - print - print "Result files expected for MAQ:" - print - sys.exit(2) - -if __name__ == "__main__": - opts = None - try: - opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"]) - except getopt.GetoptError: - usage() - - comp_heads = [] - mapper_str = "" - file_output = True - for opt, arg in opts: - if opt in ("-h", "--head"): - comp_heads = [arg] - if opt in ("-m", "--mapper"): - mapper_str = arg - possible_mappers = mapper_defs.keys() - # Check it later now... - #if mapper_str not in possible_mappers: - # print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers) - # sys.exit(2) - if opt in ("-o", "--file-output"): - file_output = False - - # Check for required args - if (mapper_str == ""): - usage() - elif (mapper_str == "all"): - mappers = mapper_defs.values() - print mappers - else: - try: - mapper_str_items = mapper_str.split(",") - mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items] - except KeyError: - print "Don't know this mapper given in -m option: "+mapper - print "Try: ilt, merlin, maq, bwa" - sys.exit(1) - - # if we didn't already get a single file from -h - if len(comp_heads) == 0: - # and there's a list on STDIN, - if not sys.stdin.isatty(): # redirected from file or pipe - comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ] - comp_heads = [ f for f in comp_heads if len(f) > 0 ] - else: - # no single file AND no STDIN file list - usage() - - print "\nFile heads to evaluate:" - print "\n".join(comp_heads)+"\n" - - comp_heads.reverse() - for comp_head in comp_heads: #[:1] - for mapper in mappers: - # Guess the original size of simulated indel from filename - m = re.search(r'_den\.(\d+)_', comp_head) - if m: - if ".DELETION_" in comp_head: - indel_size = -1 * int(m.groups()[0]) - elif ".INSERTION_" in comp_head: - indel_size = int(m.groups()[0]) - else: - indel_size = 0 - else: - sys.exit("Couldn't guess simulated read indel size from filename") - - print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head - Comp_sam_map(comp_head, mapper, file_output, indel_size) - diff --git a/python/FastaQuals2Fastq.py b/python/FastaQuals2Fastq.py deleted file mode 100755 index 7bc929153..000000000 --- a/python/FastaQuals2Fastq.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -# Utility script to convert FASTA + QUALS file into FASTQ required for MAQ -# MAQ needs to convert fastq into bfq file using fastq2bfq in order to use -# this data - -import sys, fasta, os -from itertools import * - -def usage(): - print "Usage: fastq.py fasta_in quals_in fastq_out" - print " fasta_in: *.fasta file" - print " quals_in: *.quals.txt OR *.quala" - sys.exit() - -def output_fastq_record(file_handle, fasta, quals): - #print 'fasta, quals', fasta, quals - #for fasta, qual in zip(fastas, quals): - - qualsVector = map(int, quals.seq.split()) - - print >> fqout, "@"+fasta.id - print >> fqout, fasta.seq - print >> fqout, "+" - print >> fqout, "".join( map(lambda q: chr(q+33), qualsVector ) ) - -def qualStr2fasta( line ): - elts = line.split() - id = elts[0] - quals = ' '.join(elts[1:]) - return fasta.fasta_record(id, quals) - -if __name__ == "__main__": - if len(sys.argv[1:]) != 3: - usage() - else: - fasta_in_file, quals_in_file, fastq_out_file = sys.argv[1:] - - print "Quals input file: "+quals_in_file - print "Fasta input file: "+fasta_in_file - - if os.path.splitext(quals_in_file)[1] == ".txt": - qual_factory = imap( qualStr2fasta, file(quals_in_file) ) - else: - if os.path.splitext(quals_in_file)[1] == ".quala": - # Create fasta factory from the quala file and treat it - # as if it was a fasta file and pull the numbers - qual_factory = fasta.fasta_file(quals_in_file, cleanup=False) - else: - print "Qual input file should be a *.quals.txt or *.quala file" - - print "Fastq ouptut file: "+fastq_out_file - fqout = file(fastq_out_file, "w") - - for qual_record, fasta_record in izip_longest(qual_factory, fasta.fasta_file(fasta_in_file)): - if qual_record == None or fasta_record == None: - # We ran out of one record type, so warn the user! - print "WARNING: Different number of quals and fastas in input files!" - sys.exit(1) - #print 'qual_record', qual_record - #print 'fasta_record', fasta_record - output_fastq_record(fqout, fasta_record, qual_record) - - - diff --git a/python/FlatFileTable.py b/python/FlatFileTable.py deleted file mode 100644 index dbb725673..000000000 --- a/python/FlatFileTable.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python - -import sys, itertools - -def record_generator(filename, sep="\t", skip_n_lines=0, skip_until_regex_line=""): - """Given a file with field headers on the first line and records on subsequent lines, -generates a dictionary for each line keyed by the header fields""" - fin = open(filename) - - if skip_n_lines > 0: - for i in range(skip_n_lines): # Skip a number of lines - fin.readline() - - found_regex = False - if skip_until_regex_line != "": - import re - regex_line = re.compile(skip_until_regex_line) - for line in fin: - match = regex_line.search(line) - if match: - found_regex = line - break - if not found_regex: - print "Warning: Regex "+skip_until_regex_line+" not found in FlatFileTable:record_generator" - - if found_regex: - header = found_regex.rstrip().split(sep) # Parse header - else: - header = fin.readline().rstrip().split(sep) # Pull off header - - for line in fin: # - fields = line.rstrip().split(sep) - record = dict(itertools.izip(header, fields)) - yield record - -def record_matches_values(record, match_field_values): - for match_field, match_values in match_field_values: - if record[match_field] not in match_values: - return False - return True diff --git a/python/Geli2GFF.py b/python/Geli2GFF.py deleted file mode 100755 index e9911e740..000000000 --- a/python/Geli2GFF.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python - -# Converts Geli files (genotype likelihood in picard) to GFF format for GATK - -import sys, os, subprocess - -Index2AffyChr = [] -Index2AffyChr.append("chrM") -for i in range(1,23): - Index2AffyChr.append("chr"+str(i)) -Index2AffyChr.append("chrX") -Index2AffyChr.append("chrY") - -if __name__ == "__main__": - if len(sys.argv) < 3: - print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY" - sys.exit() - else: - infile = sys.argv[1] - outdir = sys.argv[2] - - output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff" - outfile = open(output_file, "w") - - cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile - pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout - - pipe.readline() - for line in pipe: - if line[0] not in ('@', '#'): - fields = line.split("\t") - #print fields - try: - contig = fields[0] #Index2AffyChr[int(fields[0])] - except KeyError, ValueError: - print "skipping "+fields[0] - continue # Skip entries not in chr 0 through 24 - - print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t" - - - - - - - - - - diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py deleted file mode 100755 index 352ff6801..000000000 --- a/python/Gelis2PopSNPs.py +++ /dev/null @@ -1,170 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -import string -import re -import glob -import picard_utils -import itertools - -gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" -ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" - -def geli2dbsnpFile(geli): - root, flowcellDotlane, ext = picard_utils.splitPath(geli) - if ext == ".calls": - return None - else: - return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' - -def SingleSampleGenotyperCmd(bam, geli, use2bp): - naid = os.path.split(bam)[1].split(".")[0] - metrics = geli + '.metrics' - gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar' - hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff' - hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip]) - if not os.path.exists(hapmapChip): - print '*** warning, no hapmap chip resuls for', naid - hapmapChipStr = '' - - targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list' - cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr - return cmd - -def bams2geli(bams): - def call1(bam): - geli = os.path.splitext(bam)[0] + '.geli' - jobid = 0 - if OPTIONS.useSSG: - if not os.path.exists(geli + '.calls'): - cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b) - else: - if not os.path.exists(geli): - cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling()) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry ) - return geli, jobid - calls = map(call1, bams) - return map(lambda x: x[0], calls), map(lambda x: x[1], calls) - -def gelis2gelisText( gelis ): - def geli2geliText( maybeGeli ): - if os.path.splitext(maybeGeli)[1] == ".calls" : - return maybeGeli - else: - return os.path.split(maybeGeli)[1] + '.calls' - - return map( geli2geliText, gelis) - - -def main(): - global OPTIONS, ROOT - - usage = "usage: %prog lanes.list nIndividuals [options]" - parser = OptionParser(usage=usage) - parser.add_option("-q", "--farmQueue", dest="farmQueue", - type="string", default=None, - help="Farm queue to submit jobs to. Leave blank for local processing") - parser.add_option("-l", "--lod", dest="lod", - type="float", default=5, - help="minimum lod for calling a variant") - parser.add_option("-k", "--column", dest="column", - type="int", default=1, - help="Column in the file with the bam or geli file path") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("", "--ssg", dest="useSSG", - action='store_true', default=False, - help="If provided, we'll use the GATK SSG for genotyping") - parser.add_option("", "--ssg2b", dest="useSSG2b", - action='store_true', default=False, - help="If provided, we'll use 2bp enabled GATK SSG ") - parser.add_option("-o", "--output", dest="output", - type="string", default='/dev/stdout', - help="x") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 2: - parser.error("incorrect number of arguments: " + str(args)) - lines = [line.strip().split() for line in open(args[0])] - nIndividuals = int(args[1]) - - outputFile = open(OPTIONS.output, 'w') - print >> outputFile, '#', ' '.join(sys.argv) - - data = map( lambda x: x[OPTIONS.column-1], lines ) - if os.path.splitext(data[0])[1] == '.bam': - gelis, jobids = bams2geli(data) - if filter(lambda x: x <> 0, jobids) <> []: - # there's still work to do - sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids)) - # TODO: Should add a wait here for all farm jobs to finish... - print 'gelis', gelis - print 'jobids', jobids - else: - gelis = map( lambda x: x[OPTIONS.column-1], lines ) - jobids = [None] * len(gelis) - - print 'Geli files' - print gelis - - for geli, jobid in zip(gelis, jobids): - dbsnpFile = geli2dbsnpFile(geli) - if dbsnpFile == None and not os.path.exists(dbsnpFile): - dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod) - if jobid == 0: jobid = None - farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) - - # TODO: Should add a wait here for all farm jobs to finish... - - # read in the dbSNP tracks - nTotalSnps = 0 - nNovelSnps = 0 - for geli in gelis: - root, flowcellDotlane, ext = picard_utils.splitPath(geli) - #dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches' - dbsnp_matches = geli2dbsnpFile(geli) - print dbsnp_matches - if os.path.exists(dbsnp_matches): - TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches) - nTotalSnps += int(TOTAL_SNPS) - nNovelSnps += int(NOVEL_SNPS) - print >> outputFile, '# DATA: ', flowcellDotlane, TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP, dbsnp_matches - print >> outputFile, '# DATA: TOTAL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY', nTotalSnps - print >> outputFile, '# DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps - print >> outputFile, '# DATA: AVERAGE DBSNP RATE ACROSS LANES %.2f' % (100.0 * float(nTotalSnps - nNovelSnps) / (max(nTotalSnps, 1))) - - # convert the geli's to text - jobid = None - variantsOut = gelis2gelisText( gelis ) - for geli, variantOut in zip(gelis, variantsOut): - name = os.path.split(geli)[1] - if not os.path.exists(variantOut): - cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) ) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - - cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) ) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) - - sortedCallFile = 'all.sorted.calls' - cmd = ("~/dev/GenomeAnalysisTK/trunk/perl/sortByRef.pl -k 1 tmp.calls ~/work/humanref/Homo_sapiens_assembly18.fasta.fai > %s" % ( sortedCallFile ) ) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) - - if not OPTIONS.dry: - sortedCalls = [line.split() for line in open(sortedCallFile)] - aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls) - - print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom' - - for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ): - if snp == None: continue # ignore bad calls - #print snp - #sharedCalls = list(sharedCallsGroup) - #genotype = list(sharedCalls[0][5]) - print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes()) - outputFile.close() - - -if __name__ == "__main__": - main() diff --git a/python/LogRegression.py b/python/LogRegression.py deleted file mode 100755 index 49b94b7ef..000000000 --- a/python/LogRegression.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python - -import subprocess, time, sys - -dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT") - -#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work" -#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables" -fileroot = sys.argv[1] -covar_counts_root = fileroot+".covariate_counts" -parameter_root = fileroot+".log_reg" -recal_table = fileroot+".recalibration_table" - -# Run all R process to do regression and wait for completion -kids = [] -for dinuc in dinucs: - kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc])) - #kids.append(subprocess.Popen(["sleep", "8"])) - -while any(kid.poll() is None for kid in kids): - time.sleep(0.25) - -fout = file(recal_table, "w") - -fout.write("dinuc_v.1\t") -for p in range(5): - for q in range(5): - fout.write("%d,%d\t" % (p,q)) -fout.write("\n") - -for dinuc in dinucs: - dinin = open(parameter_root+"."+dinuc+".parameters") - #dinin.readline() - params = [] - for line in dinin: - line.rstrip("\n") - params.extend(map(float, line.split())) - fout.write(dinuc+"\t") - fout.write("\t".join(map(str, params))) - fout.write("\n") - - - - - - - - diff --git a/python/LogisticRegressionByReadGroup.py b/python/LogisticRegressionByReadGroup.py deleted file mode 100755 index f4389ea7f..000000000 --- a/python/LogisticRegressionByReadGroup.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python - -import os,sys - -def exit(msg,errorlevel): - "Exit the program with the specified message and error code." - print msg - sys.exit(errorlevel) - -def open_source_file(source_filename): - "Open the source file with the given name. Make sure it's readable." - if not os.access(source_filename,os.R_OK): - exit("Unable to read covariate counts file '" + sys.argv[1] + "'",1) - if not source_filename.endswith('.csv'): - exit("Source file is in incorrect format. Must be csv.") - return open(source_filename,'r') - -def read_header(source_file): - "Read the header from the given source file. Do basic validation." - header = source_file.readline().split(',') - if header[0] != 'rg' or header[1] != 'dn': - exit("Input file is in invalid format. First two columns should be ,",1) - return header - -def create_intermediate_file(source_file,header,read_group,dinuc): - "Create an intermediate file for a particular read group / dinuc from a given source file" - base = source_file.name[:source_file.name.rfind('.csv')] - intermediate_filename = "%s.%s.%s.csv" % (base,read_group,dinuc) - intermediate_file = open(intermediate_filename,"w") - intermediate_file.write(','.join(header[2:])) - return intermediate_file - -def open_target_file(target_filename): - "Open a target file and write out the header." - target_file = open(target_filename,'w') - target_file.write("rg\tdinuc\t") - for p in range(5): - for q in range(5): - target_file.write("%d,%d\t" % (p,q)) - target_file.write("\n") - return target_file - -def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script): - "Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc." - base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group - regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc)) - result = os.system(regression_command) - if result != 0: - exit('Unable to run linear regression; command was %s, error code was %d' % (regression_command, result),1) - parameters_filename = '.'.join((base,dinuc,'parameters')) - if not os.access(parameters_filename,os.R_OK): - exit("Unable to read output of R from file " + parameters_filename) - parameters_file = open(parameters_filename,'r') - parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ') - target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n') - # optionally remove the input and output files. For now, keep them around for debugging purposes. - # os.remove('.'.join((base,dinuc,'csv'))) - # os.remove('.'.join((base,dinuc,'parameters'))) - -def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script): - "Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results." - source_file = open_source_file(source_filename) - target_file = open_target_file(target_filename) - - header = read_header(source_file) - - intermediate_file = None - read_group = None - dinuc = None - - for data_line in source_file: - data_line = data_line.strip() - if len(data_line) == 0: - continue - data = data_line.split(',') - if read_group != data[0] or dinuc != data[1]: - if intermediate_file: - intermediate_file.close() - process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script) - read_group,dinuc = data[0:2] - intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc) - intermediate_file.write(','.join(data[2:])+'\n') - - if intermediate_file: - intermediate_file.close() - process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script) - - source_file.close() - target_file.close() - -if __name__ == "__main__": - # set up sensible defaults for the locations of R and the logistic regression script - R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" - logistic_regression_script="resources/logistic_regression.R" - - if len(sys.argv) < 3: - exit("Usage: logistic_regression ",1) - - compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script) diff --git a/python/MergeBAMBatch.py b/python/MergeBAMBatch.py deleted file mode 100755 index e2936a84a..000000000 --- a/python/MergeBAMBatch.py +++ /dev/null @@ -1,78 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import ValidateGATK -import picard_utils -from MergeBAMsUtils import * - -def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop): - sourcePairs = [[source, source] for source in allSources] - return groupSources(sourcePairs, NAID2Pop, merged_filename_base) - -if __name__ == "__main__": - usage = "usage: %prog files.list [options]" - parser = OptionParser(usage=usage) - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-d", "--dir", dest="output_dir", - type="string", default="./", - help="Output directory") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=False, - help="Ignores already written files, if present") - parser.add_option("-s", "--useSamtools", dest="useSamtools", - action='store_true', default=False, - help="If present, uses samtools to perform the merge") - parser.add_option("-m", "--mergeBin", dest="mergeBin", - type="string", default=None, - help="Path to merge binary") - parser.add_option("-n", "--naIDPops", dest="NAIDS2POP", - type="string", default=None, - help="Path to file contains NAID POP names. If provided, input files will be merged by population") - parser.add_option("-p", "--pop", dest="onlyPop", - type="string", default=None, - help="If provided, only this population will be processed") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - directory = OPTIONS.output_dir - - if not os.path.exists(directory): - os.mkdir(directory) - - NAID2Pop = None - if OPTIONS.NAIDS2POP <> None: - NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP) - - for line in open(args[0]): - s = line.split() - if ( s <> [] and s[0] <> '#' ): - merged_filename_base = s[0] - allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] ) - print 'Merging info:' - for spec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop): - if OPTIONS.onlyPop <> None and spec.group() <> OPTIONS.onlyPop: - continue - - spec.setPath(directory) - spec.pprint() - - jobid = None - if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()): - output = spec.getMergedBase() - cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - - if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()): - jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry) - diff --git a/python/MergeBAMsUtils.py b/python/MergeBAMsUtils.py deleted file mode 100755 index 039f23d8d..000000000 --- a/python/MergeBAMsUtils.py +++ /dev/null @@ -1,210 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import ValidateGATK -import picard_utils -import operator - -MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar' -bam_ext = 'bam' -bam_index_ext = 'bai' - -def readNAIdMap(NAIdFile): - m = dict() - for data in [line.split() for line in open(NAIdFile)]: - naid, pop = data[0:2] - print naid, ' => ', pop - assert naid not in m - m[naid] = pop - print 'Read NAID->population map' - print 'Contains', len(m), 'id -> population mappings' - print 'Distinct populations:', picard_utils.unique(m.values()) - return m - -_abbrevs = [ - (1<<50L, 'P'), - (1<<40L, 'T'), - (1<<30L, 'G'), - (1<<20L, 'M'), - (1<<10L, 'k'), - (1, '') - ] - -def greek(size): - """Return a string representing the greek/metric suffix of a size""" - for factor, suffix in _abbrevs: - if size > factor: - break - return '%.1f%s' % (float(size)/factor, suffix) - -class MergeFilesSpec: - def __init__(self, sources, group, merged_filename_base, path = '' ): - self.sourceFiles = sources - self.groupName = group - self.merged_filename_base = merged_filename_base - self.path = '' - - def __str__(self): - return 'MergeFilesSpec: ' + self.group() - - def group(self): - return self.groupName - - def sources(self): - return self.sourceFiles - - def filename(self): - if self.merged_filename_base <> None: - return self.merged_filename_base + '.'.join([self.group()]) - else: - return self.group() - - def pprint(self): - print '--------------------------------------------------------------------------------' - print ' Population: ', self.group() - print ' Merged filename: ', self.getMergedBAM() - print ' N sources: ', len(self.sources()) - for source in sorted(self.sources()): - print ' Source: ', source - print ' Sizes: ', self.sourceSizes(humanReadable=True) - print ' Est. merged size: ', greek(reduce(operator.__add__, self.sourceSizes(), 0)) - - def setPath(self, path): - self.path = path - - def getMergedBase(self): - return os.path.join(self.path, self.filename()) - - def getMergedBAM(self): - return self.getMergedBase() + '.' + bam_ext - - def getMergedBAMIndex(self): - return self.getMergedBase() + '.' + bam_ext + '.' + bam_index_ext - - def sourceSizes(self, humanReadable=False): - sizes = map( os.path.getsize, self.sources() ) - if humanReadable: - sizes = map(greek, sizes) - return sizes - - def mergeCmd(self, mergeBin = None, MSD = True, useSamtools = False): - if mergeBin == None: - mergeBin = MERGE_BIN - - return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD, useSamtools = useSamtools) - - def getIndexCmd(self): - return "samtools index " + self.getMergedBAM() - -# Very general-purpose operation that returns merge specs given two lists of pairs: -# The first list, sources, contains superkey / sourceFile pairs. -# The second list, groups, contains key / group pairs. -# -# This function walks over all source pairs, and for each superkey, it -# looks for any key within groups contained within superkey. If it finds it, -# it associates sourceFile with the merge group in groups. -# -# The system requires that superkey match one (and only) one key in groups. It also -# requires that each group string be unique. The system can handle groups provided -# as a list of pairs of a dictionary. -# -# The function returns a list of MergeFileSpecs, one for each group with -# at least one associated sourceFile. -# -def groupSources(sources, groups, merged_filename_base): - if groups == None: - return [MergeFilesSpec(map( lambda x: x[1], sources), '', merged_filename_base)] - else: - specs = dict() - if type(groups) == list: groups = dict(groups) - - for superkey, sourceFile in sources: - spec = None - for key, group in groups.iteritems(): - #print 'Examining', superkey, key, group - if superkey.find(key) <> -1: - if group in specs: - spec = specs[group] - else: - spec = MergeFilesSpec([], group, merged_filename_base) - specs[group] = spec - print 'Mapping', group, key, superkey, sourceFile - spec.sourceFiles.append(sourceFile) - if spec == None: - sys.exit('File contains an unknown superkey: ' + superkey) - v = specs.values() - v.sort(key = MergeFilesSpec.group) - return v - -import unittest -class TestMergeBAMsUtils(unittest.TestCase): - def setUp(self): - import cStringIO - groupsString = """NA10846 ceu CEPH1 -NA10847 ceu CEPH1 -NA12144 ceu CEPH1 -NA12145 ceu CEPH1 -NA12146 yri CEPH1 -NA12239 yri CEPH1 -NA07029 ceu CEPH1 -NA07019 ceu CEPH1 -NA06994 ceu CEPH1 -NA07000 ceu CEPH1 -NA07022 ceu CEPH1 -NA07056 ceu CEPH1 -NA07048 ceu CEPH1 -NA06991 ceu CEPH1 -NA07034 ceu CEPH1 -""" - lanesString = """NA10846 30GA9AAXX 1 Paired CEPH 30GA9AAXX.1.observed_genotypes.geli -NA10846 30GA9AAXX 6 Paired CEPH 30GA9AAXX.6.observed_genotypes.geli -NA10847 30GA9AAXX 7 Paired CEPH 30GA9AAXX.7.observed_genotypes.geli -NA12146 30JLTAAXX 2 Paired CEPH 30JLTAAXX.2.observed_genotypes.geli -NA12239 30PNVAAXX 1 Paired CEPH 30PNVAAXX.1.observed_genotypes.geli -NA12144 30PYMAAXX 1 Paired CEPH 30PYMAAXX.1.observed_genotypes.geli -NA12146 30PYMAAXX 7 Paired CEPH 30PYMAAXX.7.observed_genotypes.geli -""" - - self.laneIDCounts = dict([["NA10846", 2], ["NA10847", 1], ["NA12146", 2], ["NA12239", 1], ["NA12144", 1]]) - - pops = [line.strip() for line in cStringIO.StringIO(groupsString)] - lanes = [line.strip() for line in cStringIO.StringIO(lanesString)] - - print pops - print lanes - - self.ids2pop = [line.split()[0:2] for line in pops] - self.ids2gelis = [[line.split()[0], line.split()[5]] for line in lanes] - self.ids2ids = dict([[line.split()[0]] * 2 for line in lanes]) - - def testPopGroups(self): - specs = groupSources(self.ids2gelis, self.ids2pop, 'foo') - print 'Specs', specs - self.assertEqual(len(specs), 2) - self.assertEqual(specs[0].group(), 'ceu') - self.assertEqual(specs[1].group(), 'yri') - - ceu = specs[0] - yri = specs[1] - #print ceu.sources() - self.assertEqual(len(ceu.sources()), 4) - self.assertEqual(len(yri.sources()), 3) - self.assertEqual(ceu.getMergedBAM(), 'foo.ceu.bam') - self.assertEqual(ceu.getMergedBAMIndex(), 'foo.ceu.bam.bai') - self.assertEqual(yri.getMergedBAM(), 'foo.yri.bam') - self.assertEqual(yri.getMergedBAMIndex(), 'foo.yri.bam.bai') - - def testIDGroups(self): - specs = groupSources(self.ids2gelis, self.ids2ids, 'foo') - self.assertEqual(len(specs), 5) - for spec in specs: - print 'Spec', spec - self.assertEqual(len(spec.sources()), self.laneIDCounts[spec.group()]) - self.assertEqual(spec.getMergedBAM(), 'foo.' + spec.group() + '.bam') - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/python/MergeBamsByKey.py b/python/MergeBamsByKey.py deleted file mode 100755 index 8732bcc0f..000000000 --- a/python/MergeBamsByKey.py +++ /dev/null @@ -1,82 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -import picard_utils -from MergeBAMsUtils import * - -def splitSourcesByKeys( bams, keys ): - keyPairs = [[key, key] for key in keys] - keybamPairs = zip(keys, bams) - return groupSources(keybamPairs, keyPairs, None) - -if __name__ == "__main__": - usage = """usage: %prog bams.list [options] -Merges BAM files by keys from a file of a list of bams. - -bams.list is a whitespace separated file. One column (--keyCol arg) is the key, and another -column (--bamCol) is a path to a bam file. This program will group the bam files -by key and spawn merge and index jobs to merge all of the files sharing the same key together""" - - parser = OptionParser(usage=usage) - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-d", "--dir", dest="output_dir", - type="string", default="./", - help="Output directory") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=False, - help="Ignores already written files, if present") - parser.add_option("-m", "--mergeBin", dest="mergeBin", - type="string", default=None, - help="Path to merge binary") - parser.add_option("", "--MSD", dest="MSD", - action='store_true', default=False, - help="Merge sequence dictionaries?") - parser.add_option("", "--keyCol", dest="keyCol", - type=int, default=1, - help="Column in the list file holding the key") - parser.add_option("", "--bamCol", dest="bamCol", - type=int, default=2, - help="Column in the list file holding the bam file path") - parser.add_option("-l", "--link", dest="link", - action='store_true', default=False, - help="If true, program will soft link single bam files that don't need merging") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - directory = OPTIONS.output_dir - - if not os.path.exists(directory): - os.mkdir(directory) - - bamsList = [line.strip().split() for line in open(args[0])] - keys = map( lambda x: x[OPTIONS.keyCol-1], bamsList ) - bams = map( lambda x: x[OPTIONS.bamCol-1], bamsList ) - - print 'Merging info:' - for info in bamsList: print info - for spec in splitSourcesByKeys(bams, keys): - spec.setPath(directory) - spec.pprint() - - jobid = None - if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()): - output = spec.getMergedBase() + '.stdout' - if len(spec.sources()) == 1 and OPTIONS.link: - cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM() - else: - cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD) - print cmd - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry) - - if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()): - #pass - jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry) - diff --git a/python/MergeEvalMapTabs.py b/python/MergeEvalMapTabs.py deleted file mode 100755 index 06af40298..000000000 --- a/python/MergeEvalMapTabs.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python - -# merges .tab files produced by EvalMapping.py into a tabular format - -#from __future__ import print_function # python 2.6 / 3.0 print as a function -import os,sys,re -from rpy2 import * - -def ordered_eval_files(aligner, mutation_type, file_ext): - tab_files = [f for f in os.listdir(".") if f.endswith("."+aligner+"."+file_ext) and (mutation_type in f or "None_den." in f)] - - tabs = [] - for f in tab_files: - num = int(re.search(r'den\.(\d\d?)_', f).groups()[0]) - if "None_den" in f: - num = 0 - tabs.append( (num,f) ) - tabs.sort() - return tabs - -def anchor_hist(filename): - fin = open(filename) - offsets = [] - read_poses = [] - - for line in fin: - id = None - fields = line.split() - if len(fields) > 0 and ".read." in fields[0]: - id = line.split()[0] - read_poses.append(id.split(".")[-1]) - offsets.append(id.split(":")[1].split(".")[0]) - - #if "read name :" in line: - # id = line.split()[3] - - import rpy2.robjects as robj - if len(read_poses): - print len(read_poses) - return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False) - else: - print 0 - return robj.IntVector([]) - -def estimateCPUTime(filename): - times = [] - if os.path.exists(filename): - regex = re.compile('CPU time\s*:\s+([\d\.]+) sec') - #print ('Filename', filename) - def extractTime1(line): - sobj = regex.search(line) - if sobj <> None: - time = float(sobj.group(1)) - #print (line, sobj, sobj.groups(), time) - return time - else: - return None - times = [time for time in map(extractTime1, open(filename)) if time <> None] - #print('times', times) - - return max(times) - -def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0): - filebase = os.path.splitext(filename)[0] - cpuTime = estimateCPUTime(filebase + '.stdout') - - lines = open(filename).readlines() - data = None - if len(lines) > 1: - da_line = lines[0] - for line in lines: - line = line.rstrip() - fields = line.split("\t") - try: - qual_cutoff = int(fields[0]) - except: - qual_cutoff = 1000 - if qual_cutoff <= min_qual_cutoff: - break - else: - da_line = line - - #print ("%2d\t" % num_var, file=sys.stderr), - data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime] - print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime) - return data - -def plot_anchor_hists(): - import rpy2.robjects as robj - #robj.r.par( mfrow = robj.RVector([2,2]) ) - #robj.r('X11(width=24, height=15)') - robj.r('png("lotto_histo.png", width=1500, height=900)') - robj.r('par(mfrow=c(4,6 ), cin=c(3,3))') - - for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)): - print aligner, ">"+str(cutoff) - - num_file = ordered_eval_files(aligner, mut_type, file_ext="eval") - #print (*num_file, sep="\n") - for num, file in num_file: - - h = anchor_hist( file ) - title = "{aligner} {num} bp insert".format(aligner=aligner, num=num) - robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90])) - - robj.r("dev.off()") - #raw_input("Press any key to exit...") - - -def analyzeData( data, mut_type ): - import rpy2.robjects as robj - # print name / data name - aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')] - - def selectData( key ): - def keepDataP( data ): - return data[0] == key and data[1] == mut_type - return filter( keyDataP, data ) - - def placeRatePlot(): - robj.r('png("x.png", width=1500, height=900)') - x = robj.FloatVector(mutDensity) - y = robj.FloatVector(placeRate) - robj.r.plot(x, y, xlab="Mutation density / size", col="blue") - robj.r("dev.off()") - - return 0 - -if __name__ == "__main__": - - if len(sys.argv) < 2: - print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)" - sys.exit(1) - else: - mut_type = sys.argv[1] - - if False: - plot_anchor_hists() - else: - data = [] - for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)): - name = aligner + ">" + str(cutoff) - #print (aligner, ">"+str(cutoff)) - - num_file = ordered_eval_files(aligner, mut_type, file_ext="tab") - #num_file = ordered_eval_files(aligner, mut_type, file_ext="eval") - for num, file in num_file: - datum = print_eval_tables( name, mut_type, num, file, cutoff ) - if datum <> None: - data.append(datum) - #print - - analyzeData( data, mut_type ) - diff --git a/python/RecalQual.py b/python/RecalQual.py deleted file mode 100755 index bf41c377c..000000000 --- a/python/RecalQual.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python - -# Executable files. Please update these to match the installed locations of your tools. -samtools_exe='/seq/dirseq/samtools/current/samtools' -java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java' -R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" - -# Any special site-specific arguments to pass the JVM. -jvm_args='-ea' - -# Which platforms should the calibration tool be run over? -platforms=['illumina'] - -# Where to put the output created as part of recalibration. -# If editing, please end this variable with a trailing slash. -output_root = './' - -# Location of the resource files distributed with the recalibration tool. -# If editing, please end this variable with a trailing slash. -resources='recalibratorResources/' -#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/' - -import getopt,glob,os,sys -import LogisticRegressionByReadGroup - -# if the provided resources path is relative, convert it to a path -# relative to the calling script rather than the cwd. -if not os.path.isabs(resources): - application_path = os.path.dirname(sys.argv[0]) - resources = os.path.abspath(application_path) + '/' + resources - -# Where does the reference live? -reference_base = resources + 'Homo_sapiens_assembly18' -reference = reference_base + '.fasta' -reference_dict = reference_base + '.dict' -reference_fai = reference_base + '.fasta.fai' - -# Where does DBSNP live? -dbsnp = resources + 'dbsnp_129_hg18.rod' - -# Where are the application files required to run the recalibration? -#gatk = resources + 'gatk/GenomeAnalysisTK.jar' -gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar' - -logistic_regression_script = resources + 'logistic_regression.R' -empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' - -# Assemble the platform list into command-line arguments. -platform_args = ' '.join(['-pl %s' % platform for platform in platforms]) - -def output_file_needs_update(output_file, input_files = []): - if reprocessFiles: # we are always reprocessing files - return True - if not os.path.exists(output_file): # the file doesn't exist - return True - else: - return False - -def executeCommand(cmd): - if not dryRun: - return os.system(cmd) - else: - print ' => Would execute', cmd - return 0 - -def exit(msg,errorcode): - print msg - sys.exit(errorcode) - -def check_input_file_available(filename,description): - if not os.access(filename,os.R_OK): - exit('Unable to access %s %s' % (description,filename),1) - -def graph_file(graph_script,graph_data): - 'Graph the given data using the given script. Leave the data in the output directory.' - check_input_file_available(graph_script,'%s R graphing script' % graph_script) - check_input_file_available(graph_data,'%s graphing data' % graph_data) - result = executeCommand(' '.join((R_exe,graph_script,graph_data))) - if result != 0: - exit('Unable to graph data: %s' % graph_data,1) - -def recalibrate(): - 'Recalibrate the given bam file' - # generate the covariates - print 'generating covariates' - linear_regression_results = output_dir + 'linear_regression_results.out' - if output_file_needs_update(linear_regression_results): - generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) - returncode = executeCommand(generate_covariates) - if returncode != 0: - exit('Unable to generate covariates',1) - - # compute the logistic regression - if not dryRun: - print 'computing the logistic regression' - LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script) - else: - print 'Logistic recalibration data files already generated', linear_regression_results - - if output_file_needs_update(calibrated_bam): - # apply the logistic regression, writing the output data to calibrated_bam - print 'applying the correction to the reads' - apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam)) - returncode = executeCommand(apply_logistic_regression) - if returncode != 0: - exit('Unable to apply logistic regression',1) - else: - print 'Recalibrated BAM file already generated', calibrated_bam - - # index the calibrated bam - if output_file_needs_update(calibrated_bam + '.bai'): - print 'indexing the calibrated bam' - index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam)) - returncode = executeCommand(index_calibrated_bamfile) - if returncode != 0: - exit('Unable to index calibrated bamfile',1) - else: - print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai' - - print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam - -def evaluate(): - 'Evaluate recalibration results.' - print 'Evaluating recalibration results' - - recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out' - if output_file_needs_update(recalibrated_regression_results): - # regenerate the covariates - regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) - print 'regenerating covariates' - returncode = executeCommand(regenerate_covariates) - if returncode != 0: - exit('Unable to regenerate covariates',1) - - print 'graphing initial results' - for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'): - graph_file(empirical_vs_reported_grapher,filename) - print 'graphing final results' - for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'): - graph_file(empirical_vs_reported_grapher,filename) - else: - print 'Evaluated files already generated', recalibrated_regression_results - -def usage(): - exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] ',1) - -# Try to parse the given command-line arguments. -try: - opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot=']) -except getopt.GetoptError, err: - usage() - -# An input and an output file is required. Fail if left unspecified. -if len(args) < 2: - usage() - -# Determine whether to evaluate / recalibrate. -recalibrate_requested = False -evaluate_requested = False -reprocessFiles = False -dryRun = False -for opt,arg in opts: - if opt == '--recalibrate': - recalibrate_requested = True - if opt == '--evaluate': - evaluate_requested = True - if opt == '--reprocess': - reprocessFiles = True - if opt == '--dry': - dryRun = True - if opt == '--outputRoot': - output_root = arg - -# Default to 'recalibrate' unless the user specified only evaluate. -do_recalibration = not (evaluate_requested and not recalibrate_requested) -# Only evaluate if the user specifically requested evaluation. -do_evaluation = evaluate_requested - -print 'Looking for supplemental files in ' + resources - -# check that the input bam file exists, and that the bam is indexed. -bam = args[0] -bam_index = bam + '.bai' - -check_input_file_available(bam,'reads file') -check_input_file_available(bam_index,'reads index file') - -# parse the user's calibration output file requirements -calibrated_bam = args[1] -calibrated_bam_index = calibrated_bam + '.bai' - -# check that the fasta and supporting files are available -check_input_file_available(reference,'reference file') -check_input_file_available(reference_dict,'reference dictionary') -check_input_file_available(reference_fai,'reference index file') - -# check that the dbsnp is available -check_input_file_available(dbsnp,'dbsnp file') - -# sanity check that the software is available -check_input_file_available(samtools_exe,'samtools') -check_input_file_available(java_exe,'java') -check_input_file_available(R_exe,'R') -check_input_file_available(gatk,'Genome Analysis Toolkit') -check_input_file_available(logistic_regression_script,'logistic regression script') - -# make an output directory for temporary files -output_dir=output_root+'output.' + bam[bam.rfind('/')+1:bam.rfind('.bam')] + '/' -if not os.path.isdir(output_dir): - os.mkdir(output_dir) -if not os.path.isdir(output_dir): - exit('Unable to create output directory ' + output_dir,1) - -# assemble the required program arguments -gatk_base_cmdline = ' '.join((java_exe,jvm_args,'-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO')) - -if do_recalibration: - recalibrate() - -if do_evaluation: - evaluate() diff --git a/python/SAM.py b/python/SAM.py deleted file mode 100644 index cfeb3c008..000000000 --- a/python/SAM.py +++ /dev/null @@ -1,237 +0,0 @@ -import string -import operator -from pushback_file import pushback_file - -# Type TAG Req? Default Description -SAMHeaderFields = \ - [['HD', 'VN', True, '0.1.0', 'File format version'], \ - ['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \ - ['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \ - ['SQ', 'LN', True, None, 'Sequence length'], \ - ['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \ - ['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \ - ['SQ', 'UR', False, None, 'URI of the sequence'], \ - ['SQ', 'SP', False, None, 'Species'], \ - ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \ - ['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \ - ['RG', 'LB', False, None, 'Library'], \ - ['RG', 'DS', False, None, 'Description'], \ - ['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \ - ['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \ - ['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \ - ['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \ - ['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \ - ['PG', 'ID', True, 'ProgramA','Program name'], \ - ['PG', 'VN', False, None, 'Program version'], \ - ['PG', 'CL', False, None, 'Command line']] - -def SAMHeaderEncode( type, tag ): - return type + '.' + tag - -SAMHeaderDict = dict() -for record in SAMHeaderFields: - type, tag, req, default, desc = record - SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc] - -# ----------------------------------------------------------------------------------------------- -# -# A SAM header is a potentially complex object, so we just punt on it. The only really required -# outputs that *must* be user specified are the SQ: SN and SQ: LN fields. -# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields -# data structure (defined above). All required fields are initialized to their default values -# -# ----------------------------------------------------------------------------------------------- -class SAMHeader(dict): - def __init__(self, seqName, seqLen, **keys): - # setup initial header - self.fields = dict() - for record in SAMHeaderFields: - type, tag, req, default, desc = record - self.setField( type, tag, default ) - - self.setField('SQ', 'SN', seqName ) - self.setField('SQ', 'LN', seqLen ) - - # add keyword arguments - for key, value in keys.iteritems(): - type, tag = key.split('.') - self.setField( type, tag, value ) - - def isReq( self, type, tag ): - return SAMHeaderDict[SAMHeaderEncode(type,tag)][0] - - def setField( self, type, tag, value ): - #print 'Setting', type, tag, value - self.fields[SAMHeaderEncode(type, tag)] = value - - def getField( self, type, tag ): - return self.fields[SAMHeaderEncode(type, tag)] - - def __str__( self ): - types = ['HD', 'SQ', 'RG'] - - def formatType( type ): - s = '@' + type + '\t' - for record in SAMHeaderFields: - qtype, tag, req, default, desc = record - if type == qtype and self.isReq(type, tag): - value = self.getField(type, tag) - if value == None: - raise Error('Unset required SAM header ' + type + ' ' + tag) - s += tag + ':' + str(value) + '\t' - return s - return string.join(map( formatType, types ),'\n') - -SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair -SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1 -SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped -SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1 -SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand) -SAM_MATESTRAND = 0x0020 # strand of the mate 1 -SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2 -SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2 -SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records) - -SAM_FLAGS = { - SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair', - SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1', - SAM_UNMAPPED : 'the query sequence itself is unmapped', - SAM_MATEUNMAPPED : 'the mate is unmapped 1', - SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' , - SAM_MATESTRAND : 'strand of the mate 1' , - SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' , - SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2', - SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)' - } - -def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): - r = SAMRecord() - r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize ) - return r - -def SAMRecordFromString( str ): - r = SAMRecord() - r.setValuesFromString( str ) - return r - -def SAMFlagValue( flags, testFlag ): - return testFlag & flags - -def SAMFlagIsSet( flags, testFlag ): - return SAMFlagValue(flags, testFlag) <> 0 - -def SAMFlagsDescs( flags ): - def keepMe(p): - flagKey, flagDesc = p - return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc] - return sorted(map( keepMe, SAM_FLAGS.iteritems() )) - -# ----------------------------------------------------------------------------------------------- -# -# This is really the meat of the SAM I/O system. -# -# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list -# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion -# of int quals to ASCII encoded quals. -# -# ----------------------------------------------------------------------------------------------- -class SAMRecord: - def __init__( self, vals = []): - self.vals = [] - - def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): - self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \ - cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ] - - def setValuesFromString( self, line ): - #print 'line is', line - formats = [str, int, str, int, int, str, str, int, int, str, str] - self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] ) - - def formatFlags( self, flags ): - b = reduce( operator.__or__, flags, 0 ) - #print 'FormatFlags', flags, b - return b - - def getQName(self): return self.vals[0] - def getFlags(self): return self.vals[1] - def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...) - def getPos(self): return self.vals[3] - def getMapq(self): return self.vals[4] - def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I" - def getSeq(self): return self.vals[9] - def getQuals(self): return self.fromASCII33(self.vals[10]) - - def toASCII33( self, quals ): - return string.join( map( lambda q: chr(q+33), quals ), '' ) - - def fromASCII33( self, qualStr ): - return map( lambda q: ord(q)-33, qualStr) - - def __str__( self ): - return string.join( map( str, self.vals ), '\t') - -# ----------------------------------------------------------------------------------------------- -# -# Wrapper class for reading and writing records to a SAM file -# -# ----------------------------------------------------------------------------------------------- -class SAMIO: - def __init__( self, fileName, header = None, debugging = False, func = None ): - self.header = header - self.debugging = debugging - - if func == None: - func = lambda x: x - self.func = func - - if self.header == None: - self.fileHandle = pushback_file(fileName, "r") - self.readHeader() - else: - self.fileHandle = file(fileName, "w") - self.writeHeader() - - def close(self): - self.fileHandle.close() - - def readHeader(self): - self.header = "" - while True: - line = self.fileHandle.readline() - if line.startswith("@"): - self.header += line - else: - self.fileHandle.pushback(line) - if self.debugging: - pass #print str(self.header) - return True - - return self.header - - def RecordGenerator( self ): - for line in self.fileHandle: - #print 'Reading line', line - line = line.rstrip("\n") - yield self.func(SAMRecordFromString(line)) - raise StopIteration - return - - def __iter__(self): - return self.RecordGenerator() - - def writeRecord( self, recordIterator ): - for record in recordIterator: - self.writeRecord(record) - - def writeRecord( self, record ): - if self.debugging: - print record - print >> self.fileHandle, record - return True - - def writeHeader( self ): - if self.debugging: - print str(self.header) - print >> self.fileHandle, str(self.header) - return True diff --git a/python/SamWalk.py b/python/SamWalk.py deleted file mode 100755 index 5d3f71a60..000000000 --- a/python/SamWalk.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/util/bin/python -# -# -# This file really needs to be cleaned up and functions need to be improved -# -# (1) utility operations should be moved to a utility file -# (2) Need to add support for windows paired reads, information about where -# reads come from (which file), and n-base pair loci -# (3) Move to C++ based SAM/BAM i/o system -# (4) Figure out a way to get the reference to stream, and use a automatically pickled version -# (5) Add capability to walk over all reference bases, regardless of coverage. Currently -# we only walk over covered bases. -# -# -from optparse import OptionParser -import os -import sys -import random -import string -import os.path -import heapq -from itertools import * - -from SAM import * -import SimpleSAM -#import SimulateReads # utility functions should be imported - -# Global variable containing command line arguments -OPTIONS = None - -# -# Trivial class that added pushback support to a stream -# -class peakStream(object): - """A simple wrapper around a stream that adds unget operation""" - def __init__( self, stream ): - self.ungot = [] - self.s = stream - - def unget( self, elt ): - """Put elt at the front of the underlying stream, making the next get call return it""" - - #print 'unget', self.ungot - self.ungot.append(elt) - #print ' -> unget', self.ungot - - # we hold the next() operation - def __iter__( self ): return self - - def next(self): - """For python iterator support""" - #print 'ungot', self.ungot - if self.ungot <> []: - elt = self.ungot.pop(0) - else: - elt = self.s.next() - - #print ' -> ungot', self.ungot - #print ' elt', elt - return elt - -# -# Returns a single, combined stream of SAM records in chromosome order from a list of -# input streams, each in reference order -# -def rawReadStream( ref, inputs ): - """Returns a single iterable that returns records in reference order - from each of the input streams""" - def includePriorities( stream ): - return imap( lambda x: (x.getPos(), x), stream ) - def prunePriorities( stream ): - return imap( lambda p: SimpleSAM.MappedReadFromSamRecord(ref, p[1]), stream ) - - with_priorities = map( includePriorities, inputs ) - return prunePriorities( heapq.merge( *with_priorities ) ) - -# -# Just wraps the raw read stream objects in a list as they come out -# -def readStream( ref, inputs ): - """Returns a single iterable that returns SAM records in reference order - from each of the input streams""" - rs = rawReadStream( ref, inputs ) - return imap( lambda x: [x], rs ) - -# -# More complex stream object. Takes a list of input streams and creates a stream -# returning successive loci covered by the reads in the combined input stream. -# -class lociStream(): - """A streaming data structure that returns reads spanning each covered loci in - the input reads, offsets into them - where the bases are equivalent, and the position of the locus. - """ - def __init__( self, ref, inputs ): - self.ref = ref - self.rs = peakStream(rawReadStream( ref, inputs )) - self.window = [] - - self.chr = None - self.pos = None - - def __iter__(self): - return self - - def pushRead( self, read ): - self.window.append(read) - - def cleanWindow( self, chr, pos ): - """Walk over the window of reads, deleting those who no longer cover the current pos""" - if OPTIONS.debug: - print 'cleanWindow start:', chr, pos, self.window - - def keepReadP( read ): - return read.chr == chr and pos >= read.start and pos <= read.end - self.window = filter( keepReadP, self.window ) - - if OPTIONS.debug: - print 'cleanWindow stop:', chr, pos, self.window - - def expandWindow( self, chr, pos ): - """Keep reading from the read stream until we've added all reads from the stream covering pos""" - if OPTIONS.debug: - print 'expandWindow start:', chr, pos, self.window - for read in self.rs: - #print 'read', read, pos - if read.chr == chr and read.start <= pos and read.end >= pos: - self.pushRead(read) - else: - self.rs.unget( read ) - #self.rs = chain( [read], self.rs ) - break - if OPTIONS.debug: - print 'expandWindow stop:', chr, pos, self.window - - - # - # This is the workhorse routine. It constructs a window of reads covering the - # next locus in the genome, and returns the reference, the contig, its position - # in the genome, along with a vector of offsets into the list of reads that denote - # the equivalent base among all the reads and reference. - # - def next(self): - if self.pos <> None: - self.cleanWindow(self.chr, self.pos) # consume reads not covering pos - self.expandWindow(self.chr, self.pos) # add reads to window covering pos - - if self.window == []: - # the window is empty, we need to jump to the first pos of the first read in the stream: - nextRead = self.rs.next() - self.pushRead( nextRead ) - self.chr = nextRead.chr - self.pos = nextRead.start - return self.next() - else: - # at this point, window contains all reads covering the pos, we need to return them - # and the offsets into each read for this loci - def calcOffset( read ): - offset = self.pos - read.start - return offset - - offsets = map(calcOffset, self.window) - currPos = self.pos - self.pos += 1 # we are going to try to get the next position - return self.ref, self.chr, currPos, offsets, self.window - -# -# Reference reader -# -def readRef(referenceFasta): - ref = {} - - header = None - seq = '' - for line in open(referenceFasta): - if line[0] == '>': - if header <> None: - ref[header] = seq.lower() - seq = '' - header = line[1:].strip() - else: - seq += line.strip() - - ref[header] = seq.lower() - #print ref - return ref - -# -# Main() procedure -# -def main(): - global OPTIONS, ROOT - - # ------------------------------------------------------------ - # - # Setup command line options - # - # ------------------------------------------------------------ - usage = "usage: %prog [options] Walker [sam or bam file or file list]+" - parser = OptionParser(usage=usage) - parser.add_option("-r", "--ref", dest="reference", - type="string", default=None, - help="Reference DNA seqeunce in FASTA format") - parser.add_option("-m", "--maxRecords", dest="maxRecords", - type=int, default=None, - help="Max number of SAM records to process") - parser.add_option("-q", "--quiet", dest="quiet", - action='store_true', default=False, - help="Be quiet when generating output") - parser.add_option("-d", "--debug", dest="debug", - action='store_true', default=False, - help="Verbose debugging output?") - - (OPTIONS, args) = parser.parse_args() - print 'args', args - if len(args) < 2: - parser.error("Incorrect number of arguments") - - - # ------------------------------------------------------------ - # - # Dynamically load the walker class (the first cmd line arg) - # and initialize a walker class object - # - # - # load walkers from standard location - # - # ------------------------------------------------------------ - walkerName = args[0] - walkerModule = __import__(walkerName, globals(), locals(), [], -1) - walkerClass = walkerModule.__dict__[walkerName] - walker = walkerClass() - - print walkerName - print walkerModule - print walkerClass - print walker - - print '------------------------------------------------------------' - print 'program:', sys.argv[0] - print '' - print ' ref: ', OPTIONS.reference - print ' walker: ', walker - if walker.hasOption('name'): - print ' -> ', walker.getOption('name') - if walker.hasOption('desc'): - print ' -> ', walker.getOption('desc') - print '------------------------------------------------------------' - - - # ------------------------------------------------------------ - # - # Initialize the engine - # - # ------------------------------------------------------------ - - # read the reference - refs = readRef(OPTIONS.reference) - - # create the low-level SAMIO streams - files = args[1:] - inputs = map( lambda file: SAMIO( file, debugging=OPTIONS.debug ), files ) - - # build the higher level stream object, either by record or by loci - if walker.isRecordWalker(): - stream = readStream( refs, inputs ) - if walker.isLociWalker(): - stream = lociStream( refs, inputs ) - - # ------------------------------------------------------------ - # - # Move the walker object over the stream - # - # For each element in the record or loci stream, invoke the walker - # object on it. Use filter, map, and reduce to construct the sum - # - # ------------------------------------------------------------ - sum = walker.reduceDefault - counter = 0 - for elt in stream: - counter += 1 - #print 'processing elt', counter, elt, OPTIONS.maxRecords - if OPTIONS.maxRecords <> None: - if (counter * 10) % OPTIONS.maxRecords == 0: - print ' => ', (counter * 100.0) / OPTIONS.maxRecords, '% done' - - if counter > OPTIONS.maxRecords: - break - if walker.filterfunc( *elt ): - x = walker.mapfunc( *elt ) - sum = walker.reducefunc( x, sum ) - print 'Result is', sum - -if __name__ == "__main__": - if True: - import cProfile, pstats - cProfile.run('main()', 'prof') - stats = pstats.Stats('prof') - stats.sort_stats('time') - stats.print_stats(20) - else: - main() - - diff --git a/python/SamWalkTest.py b/python/SamWalkTest.py deleted file mode 100755 index fc2412d85..000000000 --- a/python/SamWalkTest.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/util/bin/python - -#from SamWalk import * -from SimpleSAM import * -import Walker - -class SamWalkTest(object, Walker.Walker): - def __init__(self): - Walker.Walker.__init__( self, 'byRecord', {}, name = 'SamWalkTest' ) - - def filterfunc( self, read ): - """Return true if you want to keep the read for mapping""" - return True - #return datum.getPos() % 2 == 0 - - def mapfunc( self, read ): - """Do something to the read, and return a result for reduction""" - #print datum - return 1 - #return len(datum.getSeq()) - #return datum.getSeq() + '/' - #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq()) - - #reduceDefault = '' - def reducefunc( self, x, sum ): - """Take the result of mapping, and previous reduce results in sum, and combine them""" - #print x - return x + sum - - - diff --git a/python/SimpleSAM.py b/python/SimpleSAM.py deleted file mode 100644 index d5e673b1d..000000000 --- a/python/SimpleSAM.py +++ /dev/null @@ -1,69 +0,0 @@ -from SAM import * - -def read2align( refSeq, readSeq, chr, pos, cigar ): - """Converts the read bases, a reference sequence, and a cigar string into a pair of - alignments strings for refSeq and readSeq. The alignment string has base equivalents - between the reference and the read bases""" - # - # only works for non-indel containing reads right now! - # - contig = refSeq[chr] - refAlign = contig[pos:pos+len(readSeq)] - readAlign = readSeq - return refAlign, readAlign - -def MappedReadFromSamRecord(ref, r): - """Converts SameRead into a SimpleSamRecord object""" - # - # This is a temp. function while we wait for Ted's C++ SAM/BAM io system to come online - # - refAlign, readAlign = read2align(ref, r.getSeq(), r.getRname(), r.getPos(), r.getCigar) - return MappedRead( r.getQName(), r.getFlags(), r.getRname(), r.getPos(), r.getMapq(), - r.getCigar(), r.getSeq(), refAlign, readAlign, r.getQuals() ) - -class MappedRead(object): - """Higher level object representing a mapped read. - - Attempts to be more accessible than a raw SAM record, doing a lot of processing on the record w.r.t. - the reference to make analysis of the reads easier. This is the fundamental data structure - used to represent reads through the walker engine""" - - def __init__( self, qname, flags, chr, pos, mapq, cigar, readBases, refAlign, readAlign, qualsByBase ): - # basic information about the read - self.qname = qname - self.mapq = mapq - self.cigar = cigar - self.bases = readBases # actual bases of reads, unaligned, may be fw or reverse - self.quals = qualsByBase # as a vector of floats, by base position - - # dealing with position information, more than a standard record Hasbro - self.chr = chr - self.start = pos - self.readlen = len(readBases) - self.end = pos + len(refAlign) - 1 - - # print 'refAlign', refAlign, len(refAlign) - - # dealing with stand - self.flags = flags - self.isForwardStrand = SAMFlagIsSet(flags, SAM_QUERYSTRAND) - self.isReverseStrand = not SAMFlagIsSet(flags, SAM_QUERYSTRAND) - - self.isPrimary = SAMFlagIsSet(flags, SAM_NOTPRIMARY) - self.isUnmapped = SAMFlagIsSet(flags, SAM_UNMAPPED) - self.isPaired = SAMFlagIsSet(flags, SAM_SEQPAIRED) - self.isMapPaired = SAMFlagIsSet(flags, SAM_MAPPAIRED) - self.isFirstReadInPair = SAMFlagIsSet(flags, SAM_ISFIRSTREAD) - self.isSecondReadInPair = SAMFlagIsSet(flags, SAM_ISSECONDREAD) - - #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATEUNMAPPED) - #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATESTRAND) - - # everything will be reverse complemented already - self.refAlign = refAlign # - self.readAlign = readAlign # all should be forward strand - - def __str__(self): - return '' % (self.chr, self.start, self.end) - - __repr__ = __str__ \ No newline at end of file diff --git a/python/SimulateReads.py b/python/SimulateReads.py deleted file mode 100755 index 19bd11b40..000000000 --- a/python/SimulateReads.py +++ /dev/null @@ -1,754 +0,0 @@ -#!/util/bin/python - -from optparse import OptionParser -import os -import sys -#import set -import random -import string - -from SAM import * - -def all(iterable): - for element in iterable: - if not element: - return False - return True - -def any(iterable): - for element in iterable: - if element: - return True - return False - -OPTIONS = None - -bases = set(['a', 't', 'g', 'c']) - -class Mutation: - """Representation of a change from one sequence to another""" - - SNP = "SNP" - INSERTION = "Insertion" - DELETION = "Deletion" - TYPES = set([SNP, INSERTION, DELETION]) - - CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' } - - def __init__( self, contig, pos, type, original, mutated ): - # example: chr20 123 SNP "a" "t" <- a to t at base 123 on chr20 - # example: chr20 123 INSERTION "" "atg" <- insertion of 3 bases starting at pos 123 - # example: chr20 123 DELETION "atg" "" <- deletion of 3 bases starting at pos 123 - self.contig = contig # contig of the mutation - self._pos = pos # position of the change in contig coordinates - self._type = type # one of the TYPES - assert(self._type in Mutation.TYPES) - self.original = original # String of the original genotype - self.mutated = mutated # String of the mutated genotype - - def pos(self, offset=0): return self._pos - offset - def type(self): return self._type - def __len__(self): - return max( len(self.mutated), len(self.original) ) - - def mutString( self ): - if self._type == Mutation.SNP: - return 'P:%s->%s' % (self.original, self.mutated) - elif self._type == Mutation.INSERTION: - return 'I:%s' % (self.mutated) - elif self._type == Mutation.DELETION: - return 'D:%s' % (self.original) - else: - raise Exception('Unexpected mutation type ' + self._type) - - def CIGARChar( self ): - return Mutation.CIGARChars[self.type()] - - def __str__( self ): - return '%s:%d %s' % (self.contig, self._pos, self.mutString()) - - def __repr__( self ): - return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString()) - - def execute( self, offset, refseq, mutseq ): - p = self.pos(offset) - if self.type() == Mutation.SNP: - mutseq[p] = self.mutated - elif self.type() == Mutation.INSERTION: - refseq[p:p] = string.join(['-'] * len(self),'') - mutseq[p:p] = self.mutated - elif self.type() == Mutation.DELETION: - mutseq[p:p+len(self)] = string.join(['-'] * len(self), '') - refseq.extend(self.mutated) - mutseq.extend(self.mutated) - - #print refseq, mutseq - return refseq, mutseq - -# --------------------------------------------------------------------------- -# -# Code for dealing with CIGAR strings -# -# --------------------------------------------------------------------------- -def flatten(l): - out = [] - for item in l: - if isinstance(item, (list, tuple)): - out.extend(flatten(item)) - else: - out.append(item) - return out - -def mutationsCIGAR( readStart, alignStart, readLen, mutations ): - if len(mutations) == 1 and mutations[0].type() <> Mutation.SNP: - mut = mutations[0] - internalPos = mut.pos() - alignStart - - leftLen = max(internalPos - readStart,0) - mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut)) - rightLen = readLen - leftLen - mutLen - #print mut, readStart, alignStart, internalPos, leftLen, mutLen, rightLen - #print - - l = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ] - l = filter( lambda x: x[0] > 0, l ) - return string.join(map(str, flatten(l)), '') - - if not all( map( lambda mut: mut.type() == Mutation.SNP, mutations ) ): - # bail - raise Exception('Currently only supports multiple SNPs CIGARS') - - return str(readLen) + 'M' - -# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ): -# sortedMutations = sorted(mutations, key=Mutation.pos) -# CIGAR = [] -# pos = readStart -# -# for mutation in sortedMutations: -# internalMutationStart = mutation.pos() - refStart -# if internalMutationStart >= readStart and internalMutationStart < readStop -# delta = mutation.pos() - pos -# if not OPTIONS.quiet: -# print 'mutationsCIGAR', pos, CIGAR, delta, mutation -# if mutation.type() <> Mutation.SNP and delta > 0: -# CIGAR.append([delta, 'M']) -# pos = mutation.pos() -# -# if mutation.type == Mutation.INSERTION: -# CIGAR.append(['I', len(mutation)]) -# if mutation.type == Mutation.DELETION: -# CIGAR.append(['D', len(mutation)]) -# -# delta = refSeqPos + refLen - pos -# if not OPTIONS.quiet: -# print 'mutationsCIGAR', pos, CIGAR, delta, mutation -# if delta > 0: -# CIGAR.append([delta, 'M']) -# -# s = string.join(map( str, flatten(CIGAR) ), '') -# print 'CIGAR:', CIGAR, s -# return s - -# --------------------------------------------------------------------------- -# -# mutating the reference -# -# --------------------------------------------------------------------------- -def executeMutations( mutations, seqIndex, refseq, mutseq ): - for mutation in mutations: - internalSite = mutation.pos() - seqIndex - refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq ) - return refseq, mutseq - -def isBadSequenceForSim(seq): - if any( map( lambda b: b not in bases, seq.tostring().lower() ) ): - return True - else: - return False - - -def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad): - #print 'mutateReference' - - # start and stop - refStartIndex = mutSite - nBasesToPad + 1 - internalStartIndex = nBasesToPad - 1 - - if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE': - refStopIndex = mutSite + nBasesToPad - else: - refStopIndex = mutSite + nBasesToPad - 1 - - refsub = ref[refStartIndex : refStopIndex] - mutseq = refsub.tomutable() - refsub = refsub.tomutable() - mutations = [] - - def success(): - return True, mutations, refsub, mutseq - def fail(): - return False, [], None, None - - if refStartIndex < 0: - return fail() - if isBadSequenceForSim(refsub): - #print 'rejecting seq', refsub.tostring() - #print map( lambda b: b not in bases, refsub.tostring().lower() ) - return fail() - - if not OPTIONS.quiet: - print ref - print refsub - - otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite]) - # print 'otherSites', otherSites - for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)): - internalSite = site - refStartIndex - if mutType == 'NONE' or OPTIONS.mutationDensity < 1: - pass - elif mutType == 'SNP': - mutBase = ref[site].lower() - otherBases = bases - set(mutBase) - otherBase = random.choice(list(otherBases)) - assert mutBase <> otherBase - if not OPTIONS.quiet: - print mutBase, ' => ', otherBase - mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) ) - elif mutType == 'INSERTION': - inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '') - mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq ) - mutations.append( mutation ) - elif mutType == 'DELETION': - inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '') - mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize]) - mutations.append( mutation ) - else: - raise Exception('Unexpected mutation type', mutType) - - # process the mutations - refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq ) - - return success() - -# version 1.0 -- prototype -# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad): -# #print 'mutateReference' -# refStartIndex = mutSite - nBasesToPad + 1 -# refStopIndex = mutSite + nBasesToPad -# internalStartIndex = nBasesToPad - 1 -# -# if refStartIndex < 0: -# return False, None, None -# -# refsub = ref[refStartIndex : refStopIndex] -# print ref -# print refsub -# -# if mutType == 'SNP' or mutType == 'None': -# #print ' In IF' -# mutBase = ref[mutSite] -# bases = set(['A', 'T', 'G', 'C']) -# if mutBase in bases: -# otherBases = bases - set(mutBase) -# otherBase = random.choice(list(otherBases)) -# assert mutBase <> otherBase -# mutseq = refsub.tomutable() -# -# if mutType == 'SNP': -# print mutBase, ' => ', otherBase -# mutseq[internalStartIndex] = otherBase -# -# print refsub -# print mutseq -# align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-')) -# align.add_sequence("ref", refsub.tostring()) -# align.add_sequence("mut", mutseq.tostring()) -# print str(align) -# -# return True, refsub, mutseq -# -# return False, None, None - -# --------------------------------------------------------------------------- -# -# sample reads from alignments -# -# --------------------------------------------------------------------------- -def accumulateBases( mutSeq, start, readLen ): - count = 0 - for stop in range(start, len(mutSeq)): - if notDash(mutSeq[stop]): - count += 1 - #print stop, count - if count == readLen: - break - - stop += 1 - read = string.join(filter( notDash, mutSeq[start:stop] ), '') - - return read, stop - -# doesn't support paired reads -# -# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations): -# #print 'REF:', refSeq.tostring() -# #print 'MUT:', mutSeq.tostring() -# #print '' -# -# # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation -# # right in the middle -# lastGoodStartSite = readLen - 1 -# if not OPTIONS.quiet: -# print refSeq.tostring(), 'ref' -# print mutSeq.tostring(), 'mut' -# -# # we can potentially start at any site in mutSeq -# nPotentialStarts = len(mutSeq) - readLen + 1 -# potentialStarts = range(nPotentialStarts) -# -# if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts: -# if OPTIONS.uniqueReadsOnly: -# starts = potentialStarts -# else: -# starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts)) -# else: -# starts = random.sample(potentialStarts, nPotentialStarts) -# -# #print 'potentialStarts', potentialStarts -# #print 'Starts', starts -# -# def sampleRead( start ): -# if mutSeq[start] <> '-': -# read, stop = accumulateBases( mutSeq, start, readLen ) -# remaining = len(mutSeq) - stop -# refForRead = refSeq[start:stop] -# #print 'XXXX', start, stop, refForRead, read -# cigar = mutationsCIGAR( alignStart, readLen, mutations ) -# if not OPTIONS.quiet: -# leftPads = string.join(start * ['-'], '') -# rightPads = string.join(remaining * ['-'], '') -# print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar -# return filter(notDash, read), alignStart + start, cigar -# else: -# return False, False, False -# -# reads = map( sampleRead, starts ) -# return [read for read in reads if read[0] <> False] - -class refSite: - def __init__( self, refSeq, mutSeq, alignStart, mutations ): - self.refSeq = refSeq - self.mutSeq = mutSeq - self.alignStart = alignStart - self.mutations = mutations - -def sample1Read( refSite, start, readLen, reverseComplementP = False ): - if refSite.mutSeq[start] <> '-': - read, stop = accumulateBases( refSite.mutSeq, start, readLen ) - - mutSubSeq = refSite.mutSeq[start:stop] - if reverseComplementP: - s = Seq( read, IUPAC.unambiguous_dna ) - #print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq - read = s.reverse_complement().tostring() - mutSubSeq.complement() - - remaining = len(refSite.mutSeq) - stop - refForRead = refSite.refSeq[start:stop] - #print 'XXXX', start, stop, refForRead, read - cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations ) - leftPads = string.join(start * ['-'], '') - rightPads = string.join(remaining * ['-'], '') - ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')' - return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr - else: - return False, None, None, None, None - -def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2): - #print 'REF:', refSeq.tostring() - #print 'MUT:', mutSeq.tostring() - #print '' - - # pairedSite, mutations2, refSeq2, mutSeq2 can all be None - - refSite1 = refSite( refSeq, mutSeq, alignStart, mutations ) - refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 ) - - #print refSite1.alignStart, refSite2.alignStart, pairedOffset - - # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation - # right in the middle - lastGoodStartSite = readLen - 1 - if not OPTIONS.quiet: - if OPTIONS.generatePairedEnds: - r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1] - print r.tostring(), 'ref' - print r.complement().tostring(), 'ref, complement' - else: - print refSeq.tostring(), 'ref' - print mutSeq.tostring(), 'mut' - - # we can potentially start at any site in mutSeq - nPotentialStarts = len(mutSeq) - readLen + 1 - potentialStarts = range(nPotentialStarts) - #print 'potentialStarts', potentialStarts - - if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts: - if OPTIONS.uniqueReadsOnly: - starts = potentialStarts - else: - starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts)) - else: - starts = random.sample(potentialStarts, int(nReads)) - - #print 'Starts', starts - - def sampleRead( start ): - good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen ) - - if OPTIONS.generatePairedEnds: - good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True ) - if not OPTIONS.quiet: - offsetDashes = string.join(['-'] * (pairedOffset), '') - print - print ppstr1 + offsetDashes, pos1, cigar1 - print offsetDashes + ppstr2, pos2, cigar2 - return [(read1, pos1, cigar1), (read2, pos2, cigar2)] - else: - if not OPTIONS.quiet: - print ppstr1, pos1, cigar1 - return [(read1, pos1, cigar1), (None, 0, None)] - - reads = map( sampleRead, starts ) - return [read for read in reads if read[0][0] <> None] - -def notDash( c ): - return c <> '-' - -def fakeQuals( seq ): - return len(seq) * [30] - #return range(len(seq)) - -def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ): - rname = fastaID - mapq = 40 - quals = fakeQuals( read ) - - qname = '%s:%d.read.%d' % (fastaID, siteID, readID) - - if not OPTIONS.generatePairedEnds: - # not in paired mode - flags = [] - record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals ) - record2 = None - else: - flags = [SAM_SEQPAIRED, SAM_MAPPAIRED] - insertSize = pos2 - pos - len(read) - record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize ) - record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize ) - - #print 'read1, read2', record1.getSeq(), record2.getSeq() - - return [record1, record2] - -# --------------------------------------------------------------------------- -# -# paired end reads -# -# --------------------------------------------------------------------------- -def pairedReadSite( ref, leftMutSite, readLen ): - pairedOffset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev))) - - def fail(): return False, None - - if pairedOffset < 0: - return fail() - - pairedSite = pairedOffset + leftMutSite + readLen - #print 'pairedSite', pairedOffset, leftMutSite, pairedSite - - if pairedSite + readLen >= len(ref): - return fail() - refsub = ref[pairedSite - readLen:pairedSite + readLen + 1] - if isBadSequenceForSim( refsub ): - return fail() - - return True, pairedSite - - - -# --------------------------------------------------------------------------- -# -# build allowed regions and sampling starts from region -# -# --------------------------------------------------------------------------- -def parseSamplingRange( arg, ref, readLen ): - # returns a list of the allowed regions to sample in the reference - - def one(x): - elts = x.split() - #print 'elts', elts - if len(elts) > 0: - elts1 = elts[0].split(':') - #print 'elts1', elts1 - return map( int, elts1 ) - else: - return False - - if arg <> None: - try: - return [ one(arg) ] - except: - # try to read it as a file - return filter( None, map( one, open(arg).readlines() ) ) - else: - return [[0, len(ref.seq) - readLen]] - -def sampleStartSites( sampleRanges, nsites ): - print 'sampleStartSites', sampleRanges, nsites - - # build a set of potential sites - if len(sampleRanges) > 1: - potentialSites = set() - for start, end in sampleRanges: - #print 'potentialSites', start, end, potentialSites - potentialSites = potentialSites.union(set(xrange(start, end))) - else: - start, end = sampleRanges[0] - potentialSites = xrange(start, end) - - #print 'potentialSites', potentialSites - - # choose sites from potentials - if len(potentialSites) <= nsites: - # tile every site - sites = potentialSites - else: - # we need to sample from the potential sites - sites = random.sample( potentialSites, nsites ) - - # print 'Sites', sites - print 'Number of potential start sites:', len(potentialSites) - print 'Number of start sites that will be generated:', len(sites) - return sites - -def readRef(referenceFasta): - handle = open(referenceFasta) - for seq_record in SeqIO.parse(handle, "fasta"): - print seq_record.id - print seq_record.name - #print repr(seq_record.seq) - print len(seq_record.seq) - yield seq_record - handle.close() - -import os.path -def outputFilename(): - root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0] - if OPTIONS.generatePairedEnds: - pairedP = 'Yes' - else: - pairedP = 'No' - - params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \ - ['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]] - filename = root + '__' + string.join(map( lambda p: string.join(map(str, p),'.'), params), '_') - root = os.path.join(OPTIONS.outputPrefix, filename) - return root, root + '.sam' - -def main(): - global OPTIONS, ROOT - - usage = "usage: %prog [options]" - parser = OptionParser(usage=usage) - parser.add_option("-r", "--ref", dest="reference", - type="string", default=None, - help="Reference DNA seqeunce in FASTA format") - parser.add_option("-o", "--outputPrefix", dest="outputPrefix", - type="string", default='', - help="Output Prefix SAM file") - parser.add_option("-c", "--coverage", dest="coverage", - type="string", default='1', - help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.") - parser.add_option("-n", "--nsites", dest="nSites", - type=int, default=1, - help="Number of sites to generate mutations in the reference") - parser.add_option("-l", "--readlen", dest="readLen", - type=int, default=36, - help="Length of reads to generate") - parser.add_option("-s", "--seed", dest="seed", - type=int, default=123456789, - help="Random number seed. If 0, then system clock is used") - parser.add_option("-t", "--muttype", dest="mutationType", - type="string", default='None', - help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion") - parser.add_option("-e", "--mutdensity", dest="mutationDensity", - type=int, default=1, - help="Number of mutations to introduce from the reference") - parser.add_option("-x", "--range", dest="range", - type="string", default=None, - help="Sampling range restriction, in the form of x:y without a space") - parser.add_option("-u", "--unique", dest="uniqueReadsOnly", - action='store_true', default=True, - help="Assumes that the user wants unique reads generated. If the coverage is greater than the number of unique reads at the site, the region is just tiled") - parser.add_option("-i", "--indelsize", dest="indelSize", - type=int, default=0, - help="Size in bp of the insertion or deletion to introduce") - - # Paired end options - parser.add_option("", "--paired", dest="generatePairedEnds", - action='store_true', default=False, - help="Should we generate paired end reads?") - parser.add_option("", "--insertSize", dest="insertSize", - type=int, default=280, - help="Size in bp of the paired library insertion") - parser.add_option("", "--insertDev", dest="insertDev", - type=int, default=5, - help="Standard deviation in the size in bp of the paired library insertion") - - parser.add_option("-q", "--quiet", dest="quiet", - action='store_true', default=False, - help="Be quiet when generating output") - parser.add_option("-d", "--debug", dest="debug", - action='store_true', default=False, - help="Verbose debugging output?") - (OPTIONS, args) = parser.parse_args() - if len(args) != 0: - parser.error("incorrect number of arguments") - - root, outputSAM = outputFilename() - - if OPTIONS.seed == 0: - random.seed(None) - else: - random.seed(OPTIONS.seed) - - OPTIONS.mutationType = OPTIONS.mutationType.upper() - - print '------------------------------------------------------------' - print 'program:', sys.argv[0] - print '' - print ' ref: ', OPTIONS.reference - print ' output: ', outputSAM - print '' - print ' mutation type: ', OPTIONS.mutationType - print ' mutation density: ', OPTIONS.mutationDensity - print ' indel size: ', OPTIONS.indelSize - print ' readLen: ', OPTIONS.readLen - print ' nSites: ', OPTIONS.nSites - print ' coverage: ', OPTIONS.coverage - print ' range restriction: ', OPTIONS.range - print '' - print ' paired ends?: ', OPTIONS.generatePairedEnds - print ' insert size: ', OPTIONS.insertSize - print ' insert stddev: ', OPTIONS.insertDev - print '' - print ' Debugging?: ', OPTIONS.debug - print '------------------------------------------------------------' - - if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1: - raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType) - - readLen = OPTIONS.readLen - fastaRecords = [seq for seq in readRef(OPTIONS.reference)] - header = SAMHeader( fastaRecords[0].id, len(fastaRecords[0].seq) ) - SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug ) - - mutationsout = open(root + '.mutations.txt', 'w') - qualsout = open(root + '.quals.txt', 'w') - refout = open(root + '.ref', 'w') - - fastaout = open(root + '.fasta', 'w') - if OPTIONS.generatePairedEnds: - fastaout2 = open(root + '.pairs.fasta', 'w') - qualsout2 = open(root + '.pairs.quals.txt', 'w') - - counter = 0 - refLen = 0 - for ref in fastaRecords: - refLen = len(ref.seq) - - # write the crazy ref file info needed by samtools - print >> refout, ref.id + '\t' + str(refLen) - - sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen ) - sites = sampleStartSites( sampleRanges, OPTIONS.nSites ) - - for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ): - #print 'Sampling site:', mutSite, refLen - #mutSite = readLen-1 - nSitesSelected = max(int(round(len(sites)/100.0)), 1) - #print siteCounter, len(sites), nSitesSelected) - if siteCounter % nSitesSelected == 0: - print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites)) - - good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen) - - pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None - if good and OPTIONS.generatePairedEnds: - if OPTIONS.mutationType == 'SNP': - pairedMutType = 'SNP' - else: - pairedMutType = 'NONE' - - good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen ) - print 'pairedReadSite', good, pairedSite, good - if good: - good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen) - - #print 'Good', good, mutations, refSeq, mutSeq - if good: - print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t') - #print 'Mutations', mutations - - refSeqPos = mutSite - readLen + 1 - readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2) - for i in range(len(readPairs)): - counter += 1 - - read1, read2 = readPairs[i] - seq, pos, cigar = read1 - seq2, pos2, cigar2 = read2 - - # write out the sam and fasta files - def write1( record, recordi ): - if record <> None: - SAMout.writeRecord( record ) - sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')] - #print sequences - - if recordi % 2 == 0: - localFastaOut, localQualsOut = fastaout, qualsout - else: - localFastaOut, localQualsOut = fastaout2, qualsout2 - - SeqIO.write(sequences, localFastaOut, "fasta") - print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ') - - #print i, read1, read2 - records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 ) - map( write1, records, range( len(records) ) ) - - SAMout.close() - qualsout.close() - fastaout.close() - refout.close() - mutationsout.close() - - if OPTIONS.generatePairedEnds: - qualsout2.close() - fastaout2.close() - -# for record in SAMIO( outputSAM ): -# print record - -if __name__ == "__main__": - import Bio - from Bio import SeqIO - from Bio.Seq import Seq - from Bio.SeqRecord import SeqRecord - from Bio.Alphabet import IUPAC, Gapped - main() - - diff --git a/python/SpawnMapperJobs.py b/python/SpawnMapperJobs.py deleted file mode 100755 index ca3045585..000000000 --- a/python/SpawnMapperJobs.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python - -import getopt, sys, os, string -from farm_commands import * - -#FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py" -FastaQuals2Fastq_exe = "FastaQuals2Fastq.py" - -def isFastaB(filename): - """Is the file a fastb file already?""" - #print os.path.splitext(filename) - return os.path.splitext(filename)[1] == '.fastb' - -def readListOfLanes( listFile ): - """Simply reads a list of files to process from a file""" - lines = map( string.split, map( string.strip, open(listFile).readlines() ) ) - return map( lambda x: x[0], lines ) - #return map( lambda x: x[0], lines ), map( lambda x: x[1], lines ) - - -def run_swmerlin(input_file, input_head, farm=""): - run_merlin(input_file, input_head, farm, sw=True) - -def run_merlin(input_file, input_head, farm="", sw=False): - "sw = Merlin Smith-Waterman option" - if isFastaB(input_file): - input_fastb = input_file - else: - input_fastb = input_head+".fastb" - if regenExistingFiles or not os.path.exists(input_fastb): - cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands) - if sw: - output_head = input_head+".swmerlin" - else: - output_head = input_head+".merlin" - cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head - if sw: - cmd_str += " SW=True" - cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) - -USE_BATCH = False - -def run_ilt(input_file, input_head, farm=""): - #print 'isFastaB', input_file, isFastaB(input_file) - if isFastaB(input_file): - input_fastb = input_file - else: - input_fastb = input_head+".fastb" - if regenExistingFiles or not os.path.exists(input_fastb): - cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands) - - output_head = input_head+".ilt" - - if USE_BATCH: - cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head - cmd(cmd_str, False, input_head, just_print_commands=justPrintCommands) - else: - cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head - cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) - -def run_BWA32(input_file, input_head, farm=""): - run_BWA(input_file, input_head, farm, 32) - -def run_BWA(fasta, input_head, farm="", seed='inf'): - output_head = input_head+".bwa" - quals = input_head+".quals.txt" - fastq = input_head+".fastq" - if regenExistingFiles or not os.path.exists(fastq) : - cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq - cmd(cmd_str) - if seed <> 'inf': - output_head += str(seed) - output_sam = output_head + ".sam" - cmd_str = "bwahuman samse "+str(seed)+" "+fastq+" "+output_sam - cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) - -def run_MAQ(input_fasta, head, farm=""): - maq_exe = "/seq/dirseq/maq-0.7.1/maq" - bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa" - - fasta = input_fasta - quals = head+".quals.txt" - fastq = head+".fastq" - if regenExistingFiles or not os.path.exists(fastq) : - cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq - cmd(cmd_str) - - bfq = head+".bfq" - if regenExistingFiles or not os.path.exists(bfq): - cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq, just_print_commands=justPrintCommands ) - - out_head = head+".maq" - maq_out = out_head+".out.aln.map" - cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq - cmd(cmd_str, farm, out_head, just_print_commands=justPrintCommands) - -def usage(): - print "Required arguments:" - print " -i Input FASTA head (*.fasta, *.qualb)" - print " OR" - print " -d Directory to grab all FASTA files from" - print " OR" - print " -l List of FASTA/FASTB files to process" - print - print "Optional arguments:" - print " -f QUEUE Farm jobs to QUEUE on LSF" - print - print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)" - print - print " -x Don't execute commands, just print them" - print - print " -w Output files to current directory (strip path from input file/dir/list" - print - - -def get_all_fasta_files(fasta_dir): - files = os.listdir(fasta_dir) - if not fasta_dir.endswith("/"): fasta_dir += "/" - fasta_files = [fasta_dir+f for f in files if f.endswith(".fasta") and os.path.getsize(fasta_dir+f) > 0] - #print fasta_files - return fasta_files - -justPrintCommands = False -regenExistingFiles = False - -if __name__ == "__main__": - opts = None - try: - opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw:r", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "regenExistingFiles", "outputInWorkingDirectory"]) - except getopt.GetoptError: - print sys.argv - usage() - sys.exit(2) - - input_head = "" - fasta_dir = "" - mapper_str = "all" - farm_sub = False - listOfLanes = None - outputInWorkingDirectory = False - - for opt, arg in opts: - print opt, arg - if opt in ("-i", "--input"): - input_head = arg - if opt in ("-l", "--listOfLanes"): - listOfLanes = arg - if opt in ("-d", "--fasta_dir"): - fasta_dir = arg - if opt in ("-f", "--farm"): - farm_sub = arg - if opt in ("-r", "--regenExistingFiles"): - regenExistingFiles = True - if opt in ("-m", "--mapper"): - mapper_str = arg - if opt in ("-x", "--dontexe"): - justPrintCommands = True - if opt in ("-w", "--outputInWorkingDirectory"): - outputInWorkingDirectory = True - - if (input_head == "") and (fasta_dir == "") and (listOfLanes == None): - print input_head, fasta_dir, listOfLanes - usage() - sys.exit(2) - - # Select function(s) for mapper - mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ, "bwa":run_BWA, "bwa32":run_BWA32} - if mapper_str.lower() == "all": - mapper_list = mapper_func_list.values() - else: - mapper_list = [mapper_func_list.get(mapper_str.lower())] - if mapper_list == [None]: - sys.exit("Don't know of mapper argument: "+mapper_str) - - if input_head: - input_heads = [None] - input_files = [input_head + 'fasta'] - elif listOfLanes <> None: - input_files = readListOfLanes(listOfLanes) - input_heads = [None] * len(input_files) - else: - input_files = [file for file in get_all_fasta_files(fasta_dir)] - input_heads = [None] * len(input_files) - - for input_file, input_head in zip(input_files, input_heads): - if input_head == None: - file_head = os.path.splitext(input_file)[0] - if outputInWorkingDirectory: - file_head = os.path.split(file_head)[1] - else: - file_head = input_head - for mapper in mapper_list: - mapper( input_file, file_head, farm=farm_sub ) - print diff --git a/python/SpawnValidationJobs.py b/python/SpawnValidationJobs.py deleted file mode 100755 index 4b59cfde5..000000000 --- a/python/SpawnValidationJobs.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python - -import getopt, sys, os, string -from farm_commands import * - -def picardCMD(name, **keywords ): - cmd = name - for key, value in keywords.iteritems(): - cmd += ' ' + key + "=" + str(value) - return cmd - -def spawnValidationJob( input_file, output_head, farm, maxErrors): - validate_exe = "ValidateSAM" - output_file = output_head + '.stdout' - - if regenExistingFiles or not os.path.exists(output_file): - cmd_str = picardCMD( validate_exe, I=input_file, M=maxErrors ) - if farm == "": - cmd_str += " > " + output_file - cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) - -def usage(): - print "Required arguments:" - print " -d Directory to grab all sam/bam files from" - print - print "Optional arguments:" - print " -f QUEUE Farm jobs to QUEUE on LSF" - print - print " -m MAXERRORS Maximum number of errors to detect before aborting" - print - - -def get_all_sam_files(dir): - files = [] - - for dirpath, dirnames, filenames in os.walk(dir): - for filename in filenames: - base, ext = os.path.splitext(filename) - if ext.lower() in ['.sam', '.bam']: - files.append( os.path.join( dirpath, filename ) ) - #print filename, base, ext - - return files - -def output_filename( input_file ): - parts = filter(lambda x: x.strip() <> '', input_file.split("/")) - print parts - return ".".join(parts) + ".validation" - -justPrintCommands = False -regenExistingFiles = False - -if __name__ == "__main__": - opts = None - try: - opts, args = getopt.getopt(sys.argv[1:], "d:f:m:r", ["dir","farm","maxErrors", "regenExistingFiles"]) - except getopt.GetoptError: - print sys.argv - usage() - sys.exit(2) - - dir = "" - mapper_str = "all" - farm_sub = False - maxErrors = 1000 - - for opt, arg in opts: - print opt, arg - if opt in ("-d", "--dir"): - dir = arg - if opt in ("-f", "--farm"): - farm_sub = arg - if opt in ("-m", "--maxErrors"): - maxErrors = arg - if opt in ("-r", "--regenExistingFiles"): - regenExistingFiles = True - if dir == "": - usage() - sys.exit(2) - - input_files = get_all_sam_files(dir) - print 'Processing files: N=', len(input_files) - for input_file in input_files: - print ' ->', input_file - - for input_file in input_files: - output_file = output_filename( input_file ) - print input_file, "=>", output_file - spawnValidationJob( input_file, output_file, farm_sub, maxErrors ) - - - \ No newline at end of file diff --git a/python/StressTestGATK.py b/python/StressTestGATK.py deleted file mode 100755 index 9c7e0e41d..000000000 --- a/python/StressTestGATK.py +++ /dev/null @@ -1,66 +0,0 @@ -import farm_commands -import os.path -import sys -import getopt - -defaultCommands = ['CountReads', 'Pileup'] - -def usage(): - print "Optional arguments:" - print " -f QUEUE Farm jobs to QUEUE on LSF" - print " -c cmd1,cmd2 Walkers to execute, otherwise", ' '.join(defaultCommands) - print " -e Ignore existing files", ' '.join(defaultCommands) - -if __name__ == "__main__": - opts = None - try: - opts, args = getopt.getopt(sys.argv[1:], "f:c:a:e", ["farm", "commands", "args", "ignoreExistingFiles"]) - except getopt.GetoptError: - print sys.argv - usage() - sys.exit(2) - - farm_sub = False - commandsList = defaultCommands - ignoreExistingFiles = False - extraArgs = '' - - for opt, arg in opts: - if opt in ("-f", "--farm"): - farm_sub = arg - if opt in ("-c", "--commands"): - commandsList = arg.split(',') - if opt in ("-e", "--ignoreExistingFiles"): - ignoreExistingFiles = True - if opt in ("-a", "--args"): - extraArgs = arg - - directory = args[1] - for line in open(args[0]): - lineParts = line.split() - lane = lineParts[0].strip() - - ref = '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta' - if ( len(lineParts) > 1 ): - ref = lineParts[1].strip() - - if not os.path.exists(lane): - print 'Input SAM/BAM file: "', lane, '" does not exist, skipping...' - continue - - head, lane_filename = os.path.split(lane) - filebase = os.path.splitext(lane_filename)[0] - - # make the pileup - # 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam - # 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam - - # convert the fasta - for analysis in commandsList: - output = os.path.join(directory, filebase + '.' + analysis + '.output') - if ignoreExistingFiles or not os.path.exists(output): - cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + lane + " -R " + ref + " -o " + output + " -l INFO " + extraArgs - print cmd - farm_commands.cmd(cmd, farm_sub, output) - - diff --git a/python/ValidateGATK.py b/python/ValidateGATK.py deleted file mode 100755 index 443393af7..000000000 --- a/python/ValidateGATK.py +++ /dev/null @@ -1,201 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -import string - -defaultCommands = ['CountReads', 'Pileup'] - -def indexBAM(reads, queue=None): - cmd = "samtools index " + reads - farm_commands.cmd(cmd, queue, None, just_print_commands=OPTIONS.justPrint) - -def readIntervalFile(file): - def notHeaderP(line): - return line[0][0] <> '@' - lines = filter(notHeaderP, map( string.split, [line for line in open(file)])) - - byContig = {} - for entry in lines: - elt = [int(entry[1]), int(entry[2])] - if entry[0] in byContig: - byContig[entry[0]].append(elt) - else: - byContig[entry[0]] = [elt] - - #print byContig - return byContig - -def isIntervalFile(filename): - return os.path.splitext(filename)[1] == '.interval_list' - -def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile): - print 'Intervals', intervalMap - - def maybeWrite1(line, offsetMap, out): - debug = False - parts = line.split() - contig = parts[0] - pos = int(parts[1]) - if debug: print 'pileup', contig, pos, line, - - if contig in intervalMap: - intervals = intervalMap[contig] - - while (offsetMap[contig] < len(intervals)): - offset = offsetMap[contig] - #print intervals[offset] - curStart, curStop = intervals[offset][0:2] - if debug: print contig, pos, curStart, curStop - if pos >= curStart: - if pos <= curStop: - #print line, - print >> out, line, - break - else: - if debug: print 'Progressing', contig, pos, curStart, curStop - offsetMap[contig] += 1 - else: - break - return offsetMap - - def allDone( offsetMap, intervalMap ): - def oneDone( contig ): - return offsetMap[contig] >= len(intervalMap[contig]) - return all( map(oneDone, offsetMap.iterkeys()) ) - - i = 0 - offsetMap = dict([(key, 0) for key in intervalMap.keys()]) - out = open(filteredPileupFile, 'w') - for line in open(unfilteredPileupFile): - offsetMap = maybeWrite1(line, offsetMap, out) - i += 1 - if i % 10000 == 0 and allDone(offsetMap, intervalMap): - break - out.close() - - - -def main(): - global OPTIONS, ROOT - - usage = "usage: %prog [options]" - parser = OptionParser(usage=usage) - parser.add_option("-f", "--file", dest="validateInfoFile", - type="string", default=None, - help="File name of the validation set") - parser.add_option("-d", "--dataDir", dest="dataDir", - type="string", default=None, - help="Directory to the data files") - parser.add_option("-o", "--outputDir", dest="outputDir", - type="string", default=None, - help="Directory to put the output") - parser.add_option("-q", "--farmQueue", dest="farmQueue", - type="string", default=None, - help="Farm queue to submit jobs to. Leave blank for local processing") - parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=False, - help="Ignores the existing files") - parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles", - action='store_true', default=False, - help="If provided, all intermediate files (BAM and pileups) will be regenerated") - parser.add_option("-n", "--dontValidate", dest="dontValidate", - action='store_true', default=False, - help="If provided, won't try to validate, just make sure the validation data files are ready") - parser.add_option("-g", "--gatk", dest="gatkPath", - type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar", - help="Path to the gatk") - parser.add_option("-x", "--args", dest="extraArgs", - type="string", default="", - help="Extra args to pass to the GATK") - parser.add_option("-p", "--justPrint", dest="justPrint", - action='store_true', default=False, - help="Don't actually run GATK, just setup data files") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 0: - parser.error("incorrect number of arguments") - - if OPTIONS.validateInfoFile == None: - parser.error("f option is required") - if OPTIONS.dataDir == None: - parser.error("d option is required") - if OPTIONS.outputDir == None: - parser.error("o option is required") - - if not os.path.exists(OPTIONS.outputDir): - os.mkdir(OPTIONS.outputDir) - if not os.path.exists(OPTIONS.dataDir): - os.mkdir(OPTIONS.dataDir) - - for line in open(OPTIONS.validateInfoFile): - if line.strip() == "" or line[0] == '#': - # comment line - continue - - # line is of the format: - # - originalReads, ref, region = line.split() - - originalReads = os.path.expanduser(originalReads) - ref = os.path.expanduser(ref) - - if not os.path.exists(originalReads) or not os.path.exists(ref): - print 'Input files do not exist!', originalReads, ref - sys.exit(1) - - head, read_filename = os.path.split(originalReads) - filebase = os.path.splitext(read_filename)[0] - - reads = os.path.join(OPTIONS.dataDir, read_filename) - readsIndex = reads + '.bai' - subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam') - pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup') - validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output') - - #print 'reads', reads - if not os.path.exists(reads) or OPTIONS.rebuildAllFiles: - farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads, just_print_commands=OPTIONS.justPrint) - - if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles: - indexBAM(reads) - - if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles: - if region == '*' or isIntervalFile(region): - farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM, just_print_commands=OPTIONS.justPrint) - else: - cmd = "samtools view -b " + reads + " " + region + " > " + subBAM - farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint) - - subBAMIndex = subBAM+'.bai' - if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles: - if region == '*' or isIntervalFile(region): - farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex, just_print_commands=OPTIONS.justPrint) - else: - indexBAM(subBAM) - - if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles: - if isIntervalFile(region): - filteredPileup = pileup + ".unfiltered" - if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles: - cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup - #farm_commands.cmd(cmd, None, None) - filterByInterval(readIntervalFile(region), filteredPileup, pileup) - else: - cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup - farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint) - - if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles): - print validationOutput, 'does not exist' - analysis = "ValidatingPileup" - cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs - - if isIntervalFile(region): - cmd += " --intervals " + region - - cmd += " -B pileup,SAMPileup," + pileup - print cmd - farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint) - -if __name__ == "__main__": - main() diff --git a/python/WalkLociTest.py b/python/WalkLociTest.py deleted file mode 100755 index 1423cac26..000000000 --- a/python/WalkLociTest.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/util/bin/python - -import Walker -from SimpleSAM import * - -class WalkLociTest(object, Walker.Walker): - def __init__(self): - Walker.Walker.__init__( self, 'byLoci', {}, name = 'WalkLociTest' ) - self.debug = False - - def filterfunc( self, *data ): - return True - #return datum.getPos() % 2 == 0 - - def mapfunc( self, ref, chrom, pos, offsets, reads ): - - if self.debug: - print '------------------------------' - print ' locus', chrom, pos - print ' offsets = ', offsets - print ' reads = ', reads - - # - # This is the heart of the test walker. You've got the reference, contig, - # pos, offsets, and reads. The line below generate pile-ups - # - def getField(field): - return map( lambda r, o: r.__getattribute__(field)[o], reads, offsets ) - - def toASCII33( quals ): - return ''.join( map( lambda q: chr(q+33), quals )) - - if False: - # actually do some useful work - refbase = ref[chrom][pos-1] - bases = ''.join(getField('bases')) - quals = toASCII33(getField('quals')) - print chrom, pos, refbase, len(reads), all(map(lambda b: b == refbase, bases)), bases, quals - - if self.debug: - for offset, read in zip( offsets, reads ): - if self.debug: - print ' offset, read', offset, read - print read.bases[offset], read.quals[offset] - - #print datum - return 1 - #return len(datum.getSeq()) - #return datum.getSeq() + '/' - #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq()) - - #reduceDefault = '' - def reducefunc( self, x, sum ): - #print x - return x + sum - - - diff --git a/python/Walker.py b/python/Walker.py deleted file mode 100755 index 7be6150d5..000000000 --- a/python/Walker.py +++ /dev/null @@ -1,88 +0,0 @@ -class Walker: - """Represents a functional walking object to be applied to SAM records. - - Actually useful programs will inherit from this walker object to implement - one or more of the blank operators provided by this module. Effectly, this - engine applies a walker object in a standard order to each record in a SAM/BAM - file: - - walker = MyWalker(args) - samRecords = parseReadsInFiles( files ): - results = [] - - foreach record in files: - if walker.filterfunc( record ): - x = walker.mapfunc( record ) - results.append(x) - reduced = reduce( walker.reducefunc, results, walker.reduceDefault ) - - - where: - - filterfunc( data ) -> returns true or false if the data should be - included or excluded from mapping - - mapfunc( data ) -> applied to each record, locus, or pair of records, depending - on the walker type - - reducefunc( x, sum ) -> combines partial results x with the accumulating - results sum across all the data - - """ - def __init__(self, walkerType, options, name = None, desc = None ): - self.options = options - self.setOption( 'walkerType', walkerType ) - self.setOption( 'name', name ) - self.setOption( 'desc', desc ) - - print self.options - - reduceDefault = 0 - - # - # walker types - # - def isRecordWalker( self ): - return self.getOption('walkerType') == 'byRecord' - def isPairedRecordWalker( self ): - return self.getOption('walkerType') == 'byRecordPairs' - def isLociWalker( self ): - return self.getOption('walkerType') == 'byLoci' - - # - # Option processing - # - def getOption( self, flag ): - if self.hasOption( flag ): - return self.options[flag] - else: - return None - - def setOption( self, flag, value ): - self.options[flag] = value - - def hasOption(self, flag): - return flag in self.options - - def optionEnabled(self, flag): - return flag in self.options and self.options[flag] <> False - - def optionDisabled(self, flag): - return not self.optionEnabled(flag) - - # - # Default functions - # - def filterfunc( self, datum ): - return True - - def mapfunc( self, datum ): - return datum - - def reducefunc( self, mapResult, sum ): - return sum + 1 - -if __name__ == "__main__": - main() - - diff --git a/python/aln_file.nocvs.py b/python/aln_file.nocvs.py deleted file mode 100755 index 57d8e318d..000000000 --- a/python/aln_file.nocvs.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python - -import string, sys - -class aln_record: - "Stores one record of data from a MAQ .aln.txt file" - field_names = ( - "read name", - "chromosome", - "position", - "strand", - "insert size from the outer coorniates of a pair", - "paired flag", - "mapping quality", - "single-end mapping quality", - "alternative mapping quality", - "number of mismatches of the best hit", - "sum of qualities of mismatched bases of the best hit", - "number of 0-mismatch hits of the first 24bp", - "number of 1-mismatch hits of the first 24bp on the reference", - "length of the read", - "read sequence", - "sequence quality" - ) - max_field_name_len = max(map(len, field_names)) - - def __init__(self, obj, parse = True): - self.fields = [] - if type(obj) == str: - self.setValuesFromString( obj, parse ); - else: - raise TypeError("aln_record did not recognize type: "+str(type(obj))) - - def setValuesFromString( self, line, parse = True ): - if parse: - formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str] - self.fields = map( lambda f, v: f(v), formats, line.split() ) - else: - self.fields = line.split() - - def __str__(self): - s = "" - for n,v in zip(aln_record.field_names, self.fields): - s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v)) - return s - - #return string.join( map( str, self.fields ), ' ') - - def id(self): return self.fields[0] - def contig(self): return self.fields[1] - - def offset(self): return self.fields[2]-1 - def pos(self): return self.fields[2] - - # Quality of read mapping (only maq gives this field) - def map_qual(self): return self.fields[6] - - #def offset_end(self): return self.fields[8] - #def pos_end(self): return self.fields[8]+1 - - #def linear_start(self): return self.fields[9] - - -class aln_file: - def __init__(self, filename, parse=True): - self.filename = filename - self.parse = parse - self.faln = open(self.filename) - - def RecordGenerator(self): - for line in self.faln: - yield aln_record(line, self.parse) - raise StopIteration - - def __iter__(self): - return self.RecordGenerator() - -if __name__ == "__main__": - if len(sys.argv) != 2: - print "To test aln_file class:\naln_file.py ALN_FILE" - else: - count = 0 - for aln in aln_file(sys.argv[1]): - print aln - count += 1 - #if count > 5: - # break diff --git a/python/aln_file.py b/python/aln_file.py deleted file mode 100755 index 57d8e318d..000000000 --- a/python/aln_file.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python - -import string, sys - -class aln_record: - "Stores one record of data from a MAQ .aln.txt file" - field_names = ( - "read name", - "chromosome", - "position", - "strand", - "insert size from the outer coorniates of a pair", - "paired flag", - "mapping quality", - "single-end mapping quality", - "alternative mapping quality", - "number of mismatches of the best hit", - "sum of qualities of mismatched bases of the best hit", - "number of 0-mismatch hits of the first 24bp", - "number of 1-mismatch hits of the first 24bp on the reference", - "length of the read", - "read sequence", - "sequence quality" - ) - max_field_name_len = max(map(len, field_names)) - - def __init__(self, obj, parse = True): - self.fields = [] - if type(obj) == str: - self.setValuesFromString( obj, parse ); - else: - raise TypeError("aln_record did not recognize type: "+str(type(obj))) - - def setValuesFromString( self, line, parse = True ): - if parse: - formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str] - self.fields = map( lambda f, v: f(v), formats, line.split() ) - else: - self.fields = line.split() - - def __str__(self): - s = "" - for n,v in zip(aln_record.field_names, self.fields): - s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v)) - return s - - #return string.join( map( str, self.fields ), ' ') - - def id(self): return self.fields[0] - def contig(self): return self.fields[1] - - def offset(self): return self.fields[2]-1 - def pos(self): return self.fields[2] - - # Quality of read mapping (only maq gives this field) - def map_qual(self): return self.fields[6] - - #def offset_end(self): return self.fields[8] - #def pos_end(self): return self.fields[8]+1 - - #def linear_start(self): return self.fields[9] - - -class aln_file: - def __init__(self, filename, parse=True): - self.filename = filename - self.parse = parse - self.faln = open(self.filename) - - def RecordGenerator(self): - for line in self.faln: - yield aln_record(line, self.parse) - raise StopIteration - - def __iter__(self): - return self.RecordGenerator() - -if __name__ == "__main__": - if len(sys.argv) != 2: - print "To test aln_file class:\naln_file.py ALN_FILE" - else: - count = 0 - for aln in aln_file(sys.argv[1]): - print aln - count += 1 - #if count > 5: - # break diff --git a/python/callingProgress.py b/python/callingProgress.py deleted file mode 100755 index 6b03c5cee..000000000 --- a/python/callingProgress.py +++ /dev/null @@ -1,37 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator - -if __name__ == "__main__": - usage = "usage: %prog files" - parser = OptionParser(usage=usage) - parser.add_option("-s", "--size", dest="regionSize", - type="int", default=None, help="") - - (OPTIONS, args) = parser.parse_args() - - def extract(line): - s = line.split() - t = s[0].split(":") - if len(t) == 2: - chr, pos = t - return chr, int(pos) - else: - return None, None - - for file in args: - chr, lastPos, startPos, calledBases, progress = None, None, None, None, None - lastLine = None - for line in open(file): - if startPos == None: - chr, startPos = extract(line) - lastLine = line - if lastLine <> None and startPos <> None: - lastPos = extract(lastLine)[1] - calledBases = lastPos - startPos - progress = "%.2f" % (float(calledBases) / OPTIONS.regionSize * 100) - print file, chr, startPos, lastPos, calledBases, progress \ No newline at end of file diff --git a/python/compSNPCalls.py b/python/compSNPCalls.py deleted file mode 100644 index 63f857e3e..000000000 --- a/python/compSNPCalls.py +++ /dev/null @@ -1,591 +0,0 @@ -#!/usr/bin/env python - -import sys, string -import os -import re -from itertools import * -from optparse import OptionParser -from memo import DiskMemoize, time_func - -class ref_genome: - """Reads reference genome in FASTA format into a dict""" - - def __init__(self, ref_genome_file): - ref_genome.chr_offset = [[] for i in range(45)] - chr_id = 0 - seq = "" - for line in open(ref_genome_file): - if line.startswith(">"): - print line[1:], - if line.startswith(">chrM"): # skip first > line - continue - ref_genome.chr_offset[chr_id] = seq - chr_id += 1 - seq = " " # make it 1 indexed instead of 0 indexed - #if chr_id > 2: - # break - else: - seq += line.rstrip().upper() - ref_genome.chr_offset[chr_id] = seq - - def __getitem__(self, key): - return ref_genome.chr_offset[key] - -AffyChr2Index = dict() -for i in range(1,23): - AffyChr2Index[str(i)] = i -AffyChr2Index['MT'] = 0 -AffyChr2Index['X'] = 23 -AffyChr2Index['Y'] = 24 - -class GenotypeCall: - #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - def __init__( self, chr, pos, genotype, snpP, lod ): - self.chr = chr - self.pos = int(pos) - self._site = chr + ':' + str(self.pos) - self._isSNP = snpP - self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position - self.lod = lod - - def refbase(self): - return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos] - - def __hash__(self): - return hash(self._site) - - def __eq__(self, other): - return self._site == other._site - - def site(self): return self._site - def isSNP(self): return self._isSNP - - def ref_het_hom(self): - if self.genotype[0] <> self.genotype[2]: - return 1 # het(erozygous non-ref) - else: - # homozygous something - if self.genotype[0] == self.refbase: - return 0 # ref - else: - return 2 # hom(ozygous non-ref) - - def isHET(self): return self.genotype[0] <> self.genotype[2] - def isHOM(self): return self.genotype[0] == self.genotype[2] - - def __str__(self): - return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod) - -MAQGenotypeEncoding = { - 'A' : ['A', 'A'], - 'C' : ['C', 'C'], - 'T' : ['T', 'T'], - 'G' : ['G', 'G'], - "M" : ['A', 'C'], - 'K' : ['G', 'T'], - 'Y' : ['C', 'T'], - 'R' : ['A', 'G'], - 'W' : ['A', 'T'], - 'S' : ['C', 'G'], - 'D' : ['A', 'G', 'T'], - 'B' : ['C', 'G', 'T'], - 'H' : ['A', 'C', 'T'], - 'V' : ['A', 'C', 'G'], - 'N' : ['A', 'C', 'G', 'T'] } - -MAQ2STDChr = dict() -for i in range(1,23): - MAQ2STDChr['chr'+str(i)] = str(i) -MAQ2STDChr['chrM'] = 'MT' -MAQ2STDChr['chrX'] = 'X' -MAQ2STDChr['chrY'] = 'Y' - -def convertMAQChr(maqChr): - #print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr] - if maqChr in MAQ2STDChr: - return MAQ2STDChr[maqChr] - else: - return '?' - -def convertMAQGenotype( oneBaseCode ): - return MAQGenotypeEncoding[oneBaseCode] - -def internalReadSNPFile( parse1, filename ): - result = [] - snps_extracted = 0 - for snp in imap( parse1, open(filename) ): - if snp: - result.append(snp) - snps_extracted += 1 - if snps_extracted > OPTIONS.debug_lines: - break - - print len(result),"genotypes extracted" - return result - -def snpMAP( snps ): - #d = dict( map( lambda x: [x.site(), x], snps ) ) - d = dict() - for snp in snps: - d[snp.site()] = snp#d - - #print 'snps', snps, d - return d - -def overlappingSites( snps1, snps2 ): - map1 = snpMAP(snps1) - map2 = snpMAP(snps2) - shared = set(map1.keys()) & set(map2.keys()) - print 'Number of snp1 records', len(map1) - print 'Number of snp2 records', len(map2) - print 'Number of shared sites', len(shared) - print "\n".join(map(str,snps1)) - return shared - -def readMAQSNPs(filename): - # Each line consists of: - # chromosome - # position - # reference base - # consensus base - # Phred-like consensus quality - # read depth - # the average number of hits of reads covering this position - # the highest mapping quality of the reads covering the position - # the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total) - # the second best call - # log likelihood ratio of the second best and the third best call - # and the third best call. - # - # Also, note that: - # - # What do those "S", "M" and so on mean in the cns2snp output? - # They are IUB codes for heterozygotes. Briefly: - # - # M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T - def read1(line): - formats = [str, int, str, str, int, int] - vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) - alignQual = vals[4] - if alignQual >= (10*OPTIONS.lod): - return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 ) - else: - #print 'Filtering', alignQual, vals - return False - - return internalReadSNPFile( read1, filename ) - -OPTIONS = None - -def MerlinChr( index ): - if index == 0: - return 'MT' - elif index == 23: - return 'X' - elif index == 24: - return 'Y' - else: - return str(index) - -def readMerlinSNPs(filename): - # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40 - # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20 - # chr:pos ref genotype bestVsRef bestVsNextBest class ... - def read1(line): - formats = [lambda x: x.split(':'), str, sorted, float, float, str] - vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) - bestVsRef, bestVsNext = vals[3:5] - isSNP = vals[5].find('-SNP') <> -1 - if bestVsRef >= OPTIONS.lod and isSNP: - return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef ) - else: - return False - - return internalReadSNPFile( read1, filename ) - -def readSNPfile( filename, format ): - formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs } - if format.lower() in formats: - return list(formats[format.lower()](filename)) - else: - raise Exception('Unknown SNP file format ' + format) - -def readAffyFile(filename): - # chrom position genotype probe_set_id dbsnp_id - # 1 84647761 TC SNP_A-1780419 rs6576700 - # 5 156323558 GG SNP_A-1780418 rs17054099 - def read1(line): - formats = [str, int, sorted, str, str] - vals = map( lambda f, x: f(x), formats, line.split() ) - - try: - chr = str(int(vals[0])) - except: - chr = convertMAQChr(vals[0]) - #print 'CHR', chr, vals[0] - return GenotypeCall( chr, vals[1], vals[2], False, 100 ) - - file = open(filename) - file.readline() # skip header - #affyData = map( read1, file ) - affyData = [] - for index, line in enumerate(file): - affyData.append(read1(line)) - if index > OPTIONS.debug_lines: - break - if index % 10000 == 0: - print index - # Give a chance to use list before creating dictionary - return affyData - - #print "1111111" - #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) ) - -def equalSNPs( snp1, snp2 ): - return snp1.genotype == snp2.genotype - -# def concordance( truthSet, testSet, includeVector = None ): -# # calculates a bunch of useful stats about the two -# # data genotype call sets above -# -# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']] -# counts = dict(states) -# -# def incShared(state, equalP ): -# counts[state] += 1 -# if equalP: -# counts[state + '-equal'] += 1 -# -# nTestSites = 0 -# for i, testSNP in izip( count(), testSet ): -# if includeVector <> None and not includeVector[i]: -# # we are skiping this site -# continue -# -# nTestSites += 1 -# -# if testSNP.isSNP(): -# counts['test-snp'] += 1 -# -# #print testSNP.site() -# if testSNP.site() in truthSet: -# truth = truthSet[testSNP.site()] -# eql = equalSNPs( testSNP, truth ) -# -# incShared( 'shared', eql ) -# if testSNP.isSNP(): -# if truth.isHOM(): incShared( 'hom-snp', eql ) -# else: incShared( 'het-snp', eql ) -# else: -# if truth.isHOM(): incShared( 'hom-ref', eql ) -# else: incShared( 'het-ref', eql ) -# -# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0: -# #print nTestSites, nSharedSites, nEqualSites -# print nTestSites, counts -# -# counts['tested'] = nTestedSites -# -# return counts - -class ConcordanceData: - def __init__(self, name, file1count, file2count): - self.name = name - self.nFile1Sites = file1count # num sites in file 1 - self.nFile2Sites = file2count # num sites in file 1 - self.nSharedSites = 0 # num SNP pairs that map to same position on the genome - self.nEqualSites = 0 # num SNPs pars with the same genotype - - def inc( self, truthSNP, testSNP ): - self.nSharedSites += 1 - if equalSNPs( testSNP, truthSNP ): # if the genotypes are equal - self.nEqualSites += 1 - - def rate(self): - return (100.0 * self.nEqualSites) / max(self.nSharedSites,1) - - def __str__(self): - return '%d %d %.2f' % ( self.nSharedSites, self.nEqualSites, self.rate() ) - -def concordance( truthSet, testSet, sharedSites = None ): - # calculates a bunch of useful stats about the two - # data genotype call sets above - # - # The 2 calls in main work like this: - # affy, snp1, snp1_snp2_shared - # affy, snp2, snp1_snp2_shared - - nTestSites = 0 - - # Now for each of the calls to concordance, we generate 3 sets: - # - allData: all SNP1 sites that are also in Affy - allData = ConcordanceData('all', len(truthSet), len(testSet)) - # - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy - sharedData = ConcordanceData('shared', len(truthSet), len(testSet)) - # - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy - uniqueData = ConcordanceData('unique', len(truthSet), len(testSet)) - for i, testSNP in izip( count(), testSet ): - nTestSites += 1 - if testSNP.site() in truthSet: - truthSNP = truthSet[testSNP.site()] - - allData.inc( truthSNP, testSNP ) - if sharedSites <> None: - if testSNP.site() in sharedSites: - sharedData.inc( truthSNP, testSNP ) - else: - uniqueData.inc( truthSNP, testSNP ) - - if OPTIONS.verbose and nTestSites % 100000 == 0: - #print nTestSites, nSharedSites, nEqualSites - print nTestSites, allData, sharedData, uniqueData - - return nTestSites, allData, sharedData, uniqueData - -# def concordance( truthSet, testSet, includeVector = None ): -# # calculates a bunch of useful stats about the two -# # data genotype call sets above -# -# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']] -# counts = dict(states) -# -# nTestSites = 0 -# nSharedSites = 0 -# nEqualSites = 0 -# for i, testSNP in izip( count(), testSet ): -# nTestSites += 1 -# #print testSNP.site() -# if testSNP.site() in truthSet: -# nSharedSites += 1 -# if equalSNPs( testSNP, truthSet[testSNP.site()] ): -# nEqualSites += 1 -# #else: -# # print '~', testSNP, truthSet[testSNP.site()] -# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0: -# #print nTestSites, nSharedSites, nEqualSites -# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites -# -# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites] - - -def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ): - nTestSites, allData, sharedData, uniqueData = results - - def print1(data): - print '------------------------------------------------------------' - print 'Concordance results', data.name, 'sites' - print ' -> Genotype file1', filename1 - print ' -> Genotype file2', filename2 - print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites) - print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites) - print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites)) - print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites)) - print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites) - print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()) - - print1(allData) - if hasSharedSites: - print1( sharedData ) # shared between SNP1 and SNP2 - print1( uniqueData ) # unique to SNP1 or to SNP2 only - -def dump_shared_snps( affys, snp_list1, snp_list2 ): - print len(affys), len(snp_list1), len(snp_list2) - snps1 = snpMAP(snp_list1) - snps2 = snpMAP(snp_list2) - snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites) - snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites) - affy_sites = set(affys.keys()); print "Affys:",len(affy_sites) - snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites - snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites - print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites) - print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites) - - fsnp = open ("snp.tab","w") - print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase" - for site in snp1and2_affy_sites: - snp1 = snps1[site] - snp2 = snps2[site] - affy = affys[site] - - print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod), - try: - snp1div2 = snp1.lod / snp2.lod - except ZeroDivisionError: - snp1div2 = 1000 - print >>fsnp, "%5.2f" % snp1div2, - print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype, - print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)), - - # Calculte meta_lod from the two lods - if snp1.genotype == snp2.genotype: - meta_lod = snp1.lod + snp2.lod - meta_genotype = snp1.genotype - else: - if snp1.lod > snp2.lod: - meta_lod = snp1.lod - meta_genotype = snp1.genotype - else: - meta_lod = snp2.lod - meta_genotype = snp2.genotype - meta_inc = meta_genotype != affy.genotype - print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc), - print >>fsnp, affy.ref_het_hom(), - print >>fsnp, affy.refbase() - -def intersection_union_snps( affy, snps1, snps2 ): - map1 = snpMAP(snps1) - map2 = snpMAP(snps2) - shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys()) - nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites] - shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys()) - - #shared = [] - #for site in shared_sites: - # shared.append((map1[site], map2[site])) - #print "Shared:",len( shared ) - #shared_x = [s[0].lod for s in shared] - #shared_y = [s[1].lod for s in shared] - - both_corr = [] - snp1_corr = [] - snp2_corr = [] - neither_corr = [] - # given two bools telling whether snp1 and snp2 are correct, - # return the correct object - # - # snp2 incorrect, snp2 correct - truth_list = [[neither_corr, snp2_corr], # snp1 incorrect - [snp1_corr, both_corr]] # snp1 correct - - for site in shared_affy_sites: - snp1_true = equalSNPs(map1[site], affy[site]) - snp2_true = equalSNPs(map2[site], affy[site]) - truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) ) - - print "Beginning plot..." - import rpy2.robjects as robj - robj.r('X11(width=15, height=15)') - XY_MAX = 25 - plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) - plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) - robj.r.plot([0, XY_MAX], [0, XY_MAX], \ - xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \ - ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \ - main = "Shared SNP site LODs", \ - xlim = robj.FloatVector([0,XY_MAX]), \ - ylim = robj.FloatVector([0,min(XY_MAX,25)]), \ - pch = 19, \ - cex = 0.0) - for xy, color in plots: - print "X" - robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3) - print "Color:",color - print "Len:",len(xy) - #print "\n".join(["%25s %25s" % pt for pt in xy]) - - #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared]) - #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3) - raw_input("Press enter to continue") - - return - - ss1 = set(snps1) - ss2 = set(snps2) - print "Shared:",len( ss1.intersection(ss2) ) - print "Snp1 only:",len( ss1.difference(ss2) ) - print "Snp2 only:",len( ss2.difference(ss1) ) - print "Snp1 total:",len( ss1 ) - print "Snp2 total:",len( ss2 ) - -def count_het_sites(snp_list): - hets = 0 - for snp in snp_list: - if snp.isHET(): - hets += 1 - print hets,"hets,",len(snp_list),"total,", - print "%.1f" % (float(hets)/len(snp_list)*100) - -def main(argv): - global OPTIONS, ROOT - - usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2" - parser = OptionParser(usage=usage) - parser.add_option("-t", "--truth", dest="affy", - type="string", default=None, - help="Affy truth file") - parser.add_option("-1", "--snp1", dest="snp1", - type="string", default=None, - help="Affy truth file") - parser.add_option("-2", "--snp2", dest="snp2", - type="string", default=None, - help="Affy truth file") - parser.add_option("", "--f1", dest="format1", - type="string", default=None, - help="File type of snpfile1") - parser.add_option("", "--f2", dest="format2", - type="string", default=None, - help="File type of snpfile2") - parser.add_option("-l", "--lod", dest="lod", - type="float", default=5.0, - help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ") - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="Verbose output") - parser.add_option("-d", "--debug_lines", dest="debug_lines", - type='float', default=sys.maxint, - help="Number of input data lines to process for debugging") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 0: - parser.error("incorrect number of arguments") - - if OPTIONS.affy == None: - parser.error("No affy data specified") - - if OPTIONS.format1 == None: - parser.error("First format cannot be none") - - if OPTIONS.snp2 <> None and OPTIONS.format2 == None: - parser.error("snp2 file given but format was not specified") - - # Load reference genome - #ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") - #sys.exit() - - if 1: - print "Reading Affy truth data..." - #readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] ) - readAffyFile2 = time_func(readAffyFile) - affy_list = readAffyFile2( filename=OPTIONS.affy ) - #count_het_sites(affy_list) - #sys.exit() - affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) ) - print 'Read affy truth data:' - print ' -> number of genotyped loci', len(affy) - - #readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] ) - readSNPfile2 = time_func(readSNPfile) - print "Reading SNPs 1 file..." - snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 ) - if OPTIONS.snp2 <> None: - print "Reading SNPs 2 file..." - snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 ) - - dump_shared_snps( affy, snps1, snps2 ) - #intersection_union_snps( affy, snps1, snps2 ) - #sys.exit() - - sharedSites = None - if OPTIONS.snp2 <> None: - sharedSites = overlappingSites( snps1, snps2 ) - results1 = concordance( affy, snps1, sharedSites ) - printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True ) - results2 = concordance( affy, snps2, sharedSites ) - printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True ) - - else: - results = concordance( affy, snps1 ) - printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites ) - -if __name__ == "__main__": - main(sys.argv) diff --git a/python/create_venn_evals.py b/python/create_venn_evals.py deleted file mode 100755 index e5bc1bad1..000000000 --- a/python/create_venn_evals.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output - -import sys, re, os, farm_commands - -sting_dir = os.environ.get("STING_DIR") -if sting_dir == None: - print "You need to set the environment variable STING_DIR to point to your Sting/GATK build" - sys.exit() - -if len(sys.argv) < 2: - print "Usage: PROG CONCORDANCE_FILE" - sys.exit() - -class callset_data: - def __init__(self, snps, in_dbSNP, titv): - self.snps = snps - self.in_dbSNP = in_dbSNP - self.titv = titv - def inc_snps(self): - self.snps += 1 - def inc_dbsnps(self): - self.in_dbSNP += 1 - def __str__(self): - return "SNPs: %10d in_dbSNP: %d TiTv: %d" % (self.snps, self.in_dbSNP, self.titv) - -sets = dict() -concordance_filename = os.path.abspath(sys.argv[1]) -for line in file(concordance_filename): - fields = line.split("\t") - filter = 0 - if not re.search('HybridSelectionVariantFilter', line): - match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically - if match: - my_set = match.groups()[0] - - if sets.has_key(my_set): - sets[my_set].inc_snps() - if fields[2] != ".": - sets[my_set].inc_dbsnps() - else: - sets[my_set] = callset_data(1, 0, 0) - -print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()]) - -print "Splitting concordance file\n" -splits_dir = concordance_filename+".splits" - -for vennSet in sets.keys(): - print vennSet - output_head = splits_dir+"_"+vennSet - vcf_file = splits_dir+"_"+vennSet+".vcf" - varianteval_file = splits_dir+"_"+vennSet+".varianteval" - - if not os.path.exists(varianteval_file): - fout = open(vcf_file, "w") - for line in file(concordance_filename): - if line.startswith("#"): - fout.write(line) - else: - match = re.search('Venn='+vennSet, line) - if match: - fout.write(line) - - varianteval_out = os.path.basename(concordance_filename)+".vcf" - farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta " - +"-T VariantEval " - +"-B eval,VCF,"+vcf_file+" " - +"-o "+varianteval_file+" " - +"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod", - "gsa") - - - - - - - - - diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py deleted file mode 100755 index 0343744b7..000000000 --- a/python/easyRecalQuals.py +++ /dev/null @@ -1,80 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from gatkConfigParser import * -import glob - -if __name__ == "__main__": - usage = """usage: %prog [-c config.cfg]* input.bam output.bam""" - - parser = OptionParser(usage=usage) - parser.add_option("-A", "--args", dest="args", - type="string", default="", - help="arguments to GATK") - parser.add_option("-m", "--mode", dest="RecalMode", - type="string", default="SEQUENTIAL", - help="Mode argument to provide to table recalibrator") - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - parser.add_option("-c", "--config", dest="configs", - action="append", type="string", default=[], - help="Configuration file") - parser.add_option("-d", "--dir", dest="scratchDir", - type="string", default="./", - help="Output directory") - parser.add_option("-w", "--wait", dest="initialWaitID", - type="string", default=None, - help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", - action='store_true', default=False, - help="Ignores already written files, if present") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 2: - parser.error("incorrect number of arguments") - - config = gatkConfigParser(OPTIONS.configs) - inputBAM = args[0] - outputBAM = args[1] - rootname = os.path.split(os.path.splitext(outputBAM)[0])[1] - - covariateRoot = os.path.join(OPTIONS.scratchDir, rootname) - covariateInitial = covariateRoot + '.init' - initDataFile = covariateInitial + '.recal_data.csv' - covariateRecal = covariateRoot + '.recal' - recalDataFile = covariateRecal + '.recal_data.csv' - - if not os.path.exists(OPTIONS.scratchDir): - os.mkdir(OPTIONS.scratchDir) - - def covariateCmd(bam, outputDir): - add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir) - return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add - - def recalibrateCmd(inputBAM, dataFile, outputBAM): - return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode) - - def runCovariateCmd(inputBAM, dataFile, dir, jobid): - if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile): - cmd = covariateCmd(inputBAM, dir) - return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - - # - # Actually do some work here - # - jobid = OPTIONS.initialWaitID - if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): - jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid) - - if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM): - cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - - jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid) - diff --git a/python/makeIGVCategory.py b/python/makeIGVCategory.py deleted file mode 100755 index 6d6bde30c..000000000 --- a/python/makeIGVCategory.py +++ /dev/null @@ -1,24 +0,0 @@ -import os.path -from optparse import OptionParser - -def main(): - global OPTIONS - usage = "usage: %prog [options] files" - parser = OptionParser(usage=usage) - parser.add_option("-c", "--category", dest="category", - type='string', default="Anonymous", - help="If provided, catagory name") - - (OPTIONS, args) = parser.parse_args() - if len(args) == 0: - parser.error("incorrect number of arguments") - - print '' % OPTIONS.category - for arg in args: - path = os.path.abspath(arg) - name = os.path.basename(arg) - print ' ' % (name, path) - print '' - -if __name__ == "__main__": - main() diff --git a/python/mergeVCFs.py b/python/mergeVCFs.py deleted file mode 100755 index 7fb35f8a3..000000000 --- a/python/mergeVCFs.py +++ /dev/null @@ -1,107 +0,0 @@ -import os.path -import sys -from optparse import OptionParser -from vcfReader import * -from itertools import * -import faiReader - -def main(): - global OPTIONS - usage = "usage: %prog [options] file1 ... fileN" - parser = OptionParser(usage=usage) - parser.add_option("-f", "--f", dest="fai", - type='string', default=None, - help="FAI file defining the sort order of the VCF") - parser.add_option("-o", "--o", dest="output", - type='string', default=None, - help="if provided, output will go here instead of stdout") - parser.add_option("-a", "--assumeSorted", dest="assumeSorted", - action='store_true', default=False, - help="If provided, this assumes the input VCF files are themselves sorted, enabling a simple efficent merge") - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="If provided, verbose progress will be enabled") - - (OPTIONS, args) = parser.parse_args() - if len(args) == 0: - parser.error("Requires at least 1 VCF to merge") - - order = None - if OPTIONS.fai <> None: - if OPTIONS.verbose: print 'reading FAI', OPTIONS.fai - - order = faiReader.readFAIContigOrdering(OPTIONS.fai) - #print 'Order', order - - if OPTIONS.output != None: - out = open(OPTIONS.output,'w') - else: - out = sys.stdout - - if OPTIONS.assumeSorted: - mergeSort(out, args, order) - else: - memSort(out, args, order) - -def cmpVCFRecords(order, r1, r2): - if order <> None: - c1 = order[str(r1.getChrom())] - c2 = order[str(r2.getChrom())] - orderCmp = cmp(c1, c2) - if orderCmp <> 0: - return orderCmp - return cmp(r1.getPos(), r2.getPos()) - -def mergeSort(out, args, order): - #print 'MergeSort', args, order - header = None - - orderMap = [] - for file in args: - #print file - openedFile = open(file) - for header, record, counter in lines2VCF(openedFile, extendedOutput = True, decodeAll = False): - orderMap.append([record, file]) - break - openedFile.close() - - #print orderMap - sortedOrderMap = sorted(orderMap, key=lambda x: x[0], cmp = lambda r1, r2: cmpVCFRecords(order, r1, r2)) - #print sortedOrderMap - - for headerLine in header: print >> out, headerLine - i = 0 - n = len(sortedOrderMap) - for file in map( lambda x: x[1], sortedOrderMap): - if OPTIONS.verbose: - i += 1 - print 'Processing', file, ':', i, 'of', n - - for record in lines2VCF(open(file), extendedOutput = False, decodeAll = False): - print >> out, record.format() - -def memSort(args, order): - header = None - records = [] - for file in args: - #print file - for header, record, counter in lines2VCF(open(file), extendedOutput = True, decodeAll = False): - records.append(record) - - records.sort(lambda r1, r2: cmpVCFRecords(order, r1, r2)) - for line in formatVCF(header, records): - #pass - print >> out, line - -PROFILE = False -if __name__ == "__main__": - if PROFILE: - import cProfile - cProfile.run('main()', 'fooprof') - import pstats - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - p.sort_stats('time').print_stats(10) - p.sort_stats('time', 'cum').print_stats(.5, 'init') - else: - main() \ No newline at end of file diff --git a/python/pilot2CallingPipeline.py b/python/pilot2CallingPipeline.py deleted file mode 100755 index 6b02c62ad..000000000 --- a/python/pilot2CallingPipeline.py +++ /dev/null @@ -1,332 +0,0 @@ -import farm_commands -import os.path -import sys -from optparse import OptionParser -from datetime import date -import glob -import operator -import faiReader -import math -import shutil - -GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta ' -GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta ' -GATK = GATK_STABLE - -class CallTarget: - def __init__(self, name, hetero, minQ = 50, depth = 120, truthGFFName = None, variantEvalArgs = '', otherVCF = None): - self.name = name - self.hetero = hetero - self.minQ = minQ - self.depth = depth - if truthGFFName <> None: - self.truthGFFName = truthGFFName - else: - self.truthGFFName = name - self.variantEvalArgs = variantEvalArgs - self.otherVCF = otherVCF - self.unionVCF = None - if otherVCF != None: - self.unionVCF = self.name + '.gatk.glftrio.union.filtered.vcf' - self.listFile = None - self.releaseVCF = None - -CEU_HET = 0.79e-3 -YRI_HET = 1.0e-3 - -targets = [ - CallTarget('NA12878', CEU_HET), - CallTarget('NA12891', CEU_HET), - CallTarget('NA12892', CEU_HET), - CallTarget('NA19238', YRI_HET), - CallTarget('NA19239', YRI_HET), - CallTarget('NA19240', YRI_HET), - - CallTarget('ceu.trio', CEU_HET, 50, 360, 'NA12878', '--sampleName NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf'), - CallTarget('yri.trio', YRI_HET, 50, 360, 'NA19240', '--sampleName NA19240', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/YRI_1kg_pilot2.vcf'), - -# CallTarget('NA19240.alltechs.solid_original', YRI_HET, 50, 120, 'NA19240'), -# CallTarget('NA12878.alltechs.solid_original', CEU_HET, 50, 120, 'NA12878'), - - CallTarget('NA19240.alltechs.solid_recal', YRI_HET, 50, 120, 'NA19240'), - CallTarget('NA12878.alltechs.solid_recal', CEU_HET, 50, 120, 'NA12878'), -] - -sets = ['Intersection', 'filteredInBoth', 'gatk', 'gatk-filteredInOther', 'glftrio', 'glftrio-filteredInOther', 'Intersection', ['gatk-unique', 'gatk.*'], ['glftrio-unique', 'glftrio.*']] - -def main(): - global OPTIONS - usage = "usage: %prog stage [options]" - parser = OptionParser(usage=usage) -# parser.add_option("-q", "--farm", dest="farmQueue", -# type="string", default=None, -# help="Farm queue to send processing jobs to") - parser.add_option("", "--dry", dest="dry", - action='store_true', default=False, - help="If provided, nothing actually gets run, just a dry run") - parser.add_option("-s", "--sample", dest="useSample", - type='string', default=None, - help="If provided, only run pipeline for this sample") - parser.add_option("-c", "--minQ", dest="minQ", - type='float', default=None, - help="If provided, will actually use this Q threshold for calls") - parser.add_option("-d", "--dir", dest="dir", - type='string', default="", - help="If provided, this is the root where files are read and written") - parser.add_option("-q", "--farm", dest="farmQueue", - type="string", default=None, - help="Farm queue to send processing jobs to") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 1: - parser.error("incorrect number of arguments") - - stages = args[0].split(",") - - allJobs = [] - for callTarget in targets: - if callTarget.name == OPTIONS.useSample or OPTIONS.useSample == None: - lastJobs = None - - if OPTIONS.minQ != None: - callTarget.minQ = OPTIONS.minQ - - for stage in stages: - print 'STAGE', stage - target = callTarget.name - callTarget.listFile = os.path.join("lists", target + ".list") - unfilteredVCFBaseName = target + '.gatk.ug.vcf' - unfilteredVCF = os.path.join(OPTIONS.dir, unfilteredVCFBaseName) - filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf') - - tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCFBaseName + ".scatter") - if not os.path.exists(tmpdir): - os.makedirs(tmpdir) - - if callTarget.unionVCF != None: - callTarget.releaseVCF = os.path.join(OPTIONS.dir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf") - - print 'Heading into stage', stage, lastJobs - - newJobs = [] - if stage == 'CALL': - newJobs = callSNPs(target, callTarget, callTarget.listFile, tmpdir) - - if stage == 'MERGE': - newJobs = mergeSNPs(target, lastJobs, unfilteredVCF, tmpdir) - - if stage == 'FILTER': - newJobs = filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF) - - if stage == 'UNION': - newJobs = unionSNPs(callTarget, lastJobs, filteredVCF ) - - if stage == 'SELECT_INTERSECT': - if ( callTarget.unionVCF != None ): - # we are one fo the release targets - newJobs = finalize1KGRelease(callTarget, lastJobs, os.path.join(OPTIONS.dir, callTarget.unionVCF ), callTarget.releaseVCF) - - if stage == 'EVAL': - newJobs = evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF) - - if stage == 'RELEASE': - dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2' - subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y") - releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF ) - - if stage == 'CLEAN': - shutil.rmtree("intermediates", True) - for file in [filteredVCF, unfilteredVCF]: - if os.path.exists(file): os.remove(file) - - print 'New jobs' - for job in newJobs: - print ' ', job - allJobs.append(newJobs) - if newJobs != []: - lastJobs = newJobs - - print 'EXECUTING JOBS' - farm_commands.executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - -hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX'] -b36 = [str(i) for i in range(1,23)] + ['X'] - -def autosomePlusX(name): - return name in b36 or name in hg18 - -def partition(faiRecords, maybeChrom, n): -# 1 247249719 3 60 61 -# 2 242951149 251370554 60 61 - chromAndSize = [[x[0], int(x[1])] for x in faiRecords if autosomePlusX(x[0])] - #print chromAndSize - if maybeChrom <> None: - chromAndSize = filter(lambda x: x[0] == maybeChrom, chromAndSize) - - def r(chrom, chrStart, chrEnd): - outputVCF = 'calls.chr%s.good.%s.vcf' % (chrom, chrStart) - L = '-L %s:%d-%d' % (chrom, chrStart, chrEnd) - return outputVCF, chrom, chrStart, chrEnd, L - - for chrom, size in chromAndSize: - #print 'SIZE', chrom, size - if n == 1: - yield r(chrom, 1, size) - else: - idealBp = float(size-1) / n - chrEnd = None - chrStart = 1 - for i in range(1, n): - chrEnd = 1 + int(round(idealBp * (i + 1))) - #print chrom, chrStart, chrEnd, idealBp - result = r(chrom, chrStart, chrEnd) - chrStart = chrEnd + 1 - yield result - if chrEnd <> size: - raise Exception('X') - -def callSNPs(target, callTarget, listFile, tmpdir): - extras = "-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e" % (callTarget.hetero) - fai = "/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai" - NWaysParallel = 5 - - bins = partition(faiReader.readFAI(fai), None, NWaysParallel) - jobs = list() - for outputVCF, chrom, chrStart, chrEnd, L in bins: - outputVCF = os.path.join(tmpdir, outputVCF) - #print outputVCF, L, os.path.exists(outputVCF) - #if not os.path.exists(outputVCF): - print 'Enqueuing job for', outputVCF, L - cmd = 'java -Xmx2048m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar ' + \ - '-T UnifiedGenotyper -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -mrl 500000 ' + \ - '-I %s -confidence %d %s -varout %s -vf VCF -L %s -gm JOINT_ESTIMATE' % (listFile, callTarget.minQ, extras, outputVCF, L) - #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - jobs.append(farm_commands.FarmJob(cmd, jobName = "CALL_%s_c%s_s%s" % (target, str(chrom), str(chrStart)))) - - return jobs - -def mergeSNPs(target, lastJobs, snpFile, tmpdir): - #print 'LastJobs = ', lastJobs - cmd = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai %s/*.vcf > %s" % (tmpdir, snpFile) - return [farm_commands.FarmJob(cmd, jobName = "MERGE_%s" % (target), dependencies = lastJobs)] - -def filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF): - target = callTarget.name - #expression = "AB > 0.75 || DP > %s" % depth - expression1 = ['GATK_STANDARD', "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % callTarget.depth] - expression2 = ['HARD_TO_VALIDATE', "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)"] - cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s ' % (unfilteredVCF, filteredVCF) - - for name, exp in [expression1, expression2]: - cmd += '--filterName %s --filterExpression "%s" ' % ( name, exp ) - - #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - return [farm_commands.FarmJob(cmd, jobName = "FILTER_%s" % (target), dependencies = lastJobs)] - -def evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF): - evalRoot = os.path.join(OPTIONS.dir, "eval") - if not os.path.exists(evalRoot): - os.makedirs(evalRoot) - - def eval1(vcfFullPath, namePostfix = "", args = ""): - vcf = os.path.split(vcfFullPath)[1] - out = os.path.join(OPTIONS.dir, "eval", vcf + namePostfix + ".eval") - cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcfFullPath, out, '\;'.join(map(str, b36)) ) - hapmap3 = os.path.join("../../hapmap3Genotypes", callTarget.truthGFFName + ".b36.gff") - if os.path.exists(hapmap3): - cmd += " -B hapmap-chip,GFF,%s %s %s" % (hapmap3, callTarget.variantEvalArgs, args) - #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - return farm_commands.FarmJob(cmd, jobName = "EVAL_%s_%s" % (callTarget.name, namePostfix), dependencies = lastJobs) - - jobs = [] - jobs.append(eval1(filteredVCF)) - jobs.append(eval1(unfilteredVCF)) - if callTarget.unionVCF != None: - jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), ".union")) - for set in sets: - if type(set) == list: - name, selector = set - else: - name, selector = set, set - jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), "." + name, " -vcfInfoSelector set=\"" + selector + "\"")) - - if callTarget.releaseVCF != None: - jobs.append(eval1(callTarget.releaseVCF, ".release")) - - - return jobs - -def unionSNPs(callTarget, lastJobs, filteredVCF ): - if callTarget.otherVCF != None: - cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, callTarget.otherVCF, os.path.join(OPTIONS.dir, callTarget.unionVCF) ) - return [farm_commands.FarmJob(cmd, jobName = "UNION_%s" % (callTarget.name), dependencies = lastJobs)] - else: - return [] -# jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - -def finalize1KGRelease(callTarget, lastJobs, inputVCF, outputVCF): - commands = [] - lastJob = lastJobs - -# subdir = os.path.join(OPTIONS.dir, "1kg_release") -# if not os.path.exists(subdir): -# os.makedirs(subdir) - - # first, filter out the intersection - cmd = GATK + '-T VCFSelect -B variant,VCF,%s -o %s -match "(set eq \'Intersection\' || set eq \'filteredInBoth\')" ' % (inputVCF, outputVCF) - lastJob = farm_commands.FarmJob(cmd, jobName = "SELECT_%s" % (callTarget.name), dependencies = lastJob) - commands.append(lastJob) - - # call variant annotator to annotate all of the calls - # annotatedVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.vcf") -# cmd = GATK + '-T VariantAnnotator -I %s -standard -B variant,VCF,%s -vcf %s -L 1:1-1,000,000 ' % (callTarget.listFile, intersectVCF, annotatedVCF) -# lastJob = farm_commands.FarmJob(cmd, jobName = "ANNOTATE_%s" % (callTarget.name), dependencies = lastJob) -# commands.append(lastJob) -# -# # filter the calls -# filteredVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf") -# commands = commands + filterSNPs(callTarget, lastJob, annotatedVCF, filteredVCF) -# - return commands - -def releaseSNPs(dir, callTarget, filteredVCF ): - if not os.path.exists(dir): - os.makedirs(dir) - - print 'Copying files into ', dir, filteredVCF, callTarget.unionVCF, callTarget.releaseVCF - if not OPTIONS.dry: shutil.copy(filteredVCF, dir) - if callTarget.unionVCF != None: - if not OPTIONS.dry: shutil.copy(os.path.join(OPTIONS.dir, callTarget.unionVCF), dir) - if callTarget.releaseVCF != None: - if not OPTIONS.dry: shutil.copy(callTarget.releaseVCF, dir) - -if __name__ == "__main__": - main() - - -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A -# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered - - -# if ( $1 == 3.1 ) then -# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf -# endif -# -# if ( $1 == 4 ) then -# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only ) -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf -# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf -# end -# endif -# -# if ( $1 == 5 ) then -# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710 -# -# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf ) -# echo $file -# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710 -# end -# -# endif diff --git a/python/qltout.py b/python/qltout.py deleted file mode 100755 index 4964d64cf..000000000 --- a/python/qltout.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -import string - -class align_block: - def __init__(self, indel_count, length, mismatches): - self.indel_count = int(indel_count) - self.length = int(length) - self.mismatches = int(mismatches) - def __str__(self): - return string.join( map( str, [self.indel_count, self.length, self.mismatches]), ' ') - -class qltout_record: - - def __init__(self, obj): - self.fields = [] - self.align_blocks = [] - if type(obj) == str: - self.setValuesFromString( obj ); - else: - raise TypeError("qltout_record did not recognize type: "+str(type(obj))) - - def setValuesFromString( self, line ): - formats = [str, int, int, int, int, int, int, int, int, int, int] - - split_line = line.split() - self.fields = map( lambda f, v: f(v), formats, split_line[:11] ) - - next_align_block = 11 - while next_align_block < len(split_line): - self.align_blocks.append( align_block(*split_line[next_align_block:next_align_block+3]) ) - next_align_block += 3 - #print self.__str__() - - def indelled_bases(self): - return sum([ab.indel_count for ab in self.align_blocks]) - - def __str__(self): - return string.join( map( str, self.fields ), ' ')+' '+string.join( map( str, self.align_blocks ), ' ') - - def id(self): return self.fields[1] - def contig(self): return self.fields[6] - - def offset(self): return self.fields[7] - def pos(self): return self.fields[7]+1 - - def offset_end(self): return self.fields[8] - def pos_end(self): return self.fields[8]+1 - - def linear_start(self): return self.fields[9] - def map_qual(self): return 100 # This only exists in maq files - # for ILT and Merlin, we assume the same confidence for all mappings - - -class qltout_file: - def __init__(self, filename): - self.filename = filename - self.fqlt = open(self.filename) - - def RecordGenerator(self): - - for line in self.fqlt: - #fields = line.rstrip("\n").split("\t") - if not line.startswith("QUERY"): - continue - yield qltout_record(line) #fields) - raise StopIteration - - def __iter__(self): - return self.RecordGenerator() - - diff --git a/python/samtooltest.sh b/python/samtooltest.sh deleted file mode 100755 index 0c23d9f16..000000000 --- a/python/samtooltest.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/tcsh - -setenv refroot ~/work/refmut/ -setenv refname test2 -setenv ref "${refroot}/${refname}" -setenv cov 100 -setenv sites 1 -setenv x '' # '-x 6696745:6708910' - -# testing samtools -./SimulateReads.py --ref ${ref}.fasta --coverage ${cov} -n ${sites} -l 10 -t None ${x} - -# running samtools -samtools import ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.ref ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.sam samtest.bam - -samtools sort samtest.bam samtestSorted - -samtools index samtestSorted.bam - -samtools pileup -f ${ref}.fasta samtestSorted.bam diff --git a/python/snpSelector.py b/python/snpSelector.py deleted file mode 100755 index 9e5596f46..000000000 --- a/python/snpSelector.py +++ /dev/null @@ -1,728 +0,0 @@ -import os.path -import sys -from optparse import OptionParser -from vcfReader import * -#import pylab -import operator -from itertools import * -import math -import random - -DEBUG = False - -class Range: - ANY = '*' - def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False): - self.left = left - self.right = right - self.leftOpen = leftOpen - self.rightOpen = rightOpen - - def __str__(self): - leftB, rightB = '[', ']' - if self.leftOpen: leftB = '(' - if self.rightOpen: rightB = ')' - return '%s%s, %s%s' % (leftB, self.left, self.right, rightB) - - __repr__ = __str__ - - def dashedString(self): - return str(self).replace(", ", "-") - - def contains(self, v): - def test(r, op, open): - return r == Range.ANY or op(v, r) or (not open and v == r) - return test(self.left, operator.__gt__, self.leftOpen) and test(self.right, operator.__lt__, self.rightOpen) - -class CallCovariate: - def __init__(self, feature, featureRange, qualRange, FPRate = None): - self.feature = feature - - self.featureRange = featureRange - - self.qualRange = qualRange - self.FPRate = FPRate - - def containsVariant(self, call): - inFeature = self.featureRange.contains(call.getField(self.feature)) - inQual = self.qualRange.contains(call.getQual()) - #print 'inFeature, inQual', inFeature, inQual - return inFeature and inQual - - def getFPRate(self): return self.FPRate - def getFeature(self): return self.feature - - def getCovariateField(self): return self.getFeature() + '_RQ' - - def __str__(self): return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange) - -class RecalibratedCall: - def __init__(self, call, features): - self.call = call - self.features = dict([[feature, None] for feature in features]) - - def recalFeature( self, feature, FPRate ): - assert self.features[feature] == None, "Feature " + feature + ' has value ' + str(self.features[feature]) + ' for call ' + str(self.call) # not reassigning values - assert FPRate <= 1 and FPRate >= 0 - self.features[feature] = FPRate - - def getFeature( self, feature, missingValue = None, phredScaleValue = False ): - v = self.features[feature] - if v == None: - return missingValue - elif phredScaleValue: - return phredScale(v) - else: - return v - - def jointFPErrorRate(self): - #print self.features - logTPRates = [math.log10(1-r) for r in self.features.itervalues() if r <> None] - logJointTPRate = reduce(lambda x, y: x + y, logTPRates, 0) - logJointTPRate = min(logJointTPRate, 1e-3 / 3) # approximation from het of 0.001 - jointTPRate = math.pow(10, logJointTPRate) - #print logTPRates - #print logJointTPRate, jointTPRate - return 1 - jointTPRate - - def featureStringList(self): - return ','.join(map(lambda feature: '%s=Q%d' % (feature, self.getFeature(feature, '*', True)), self.features.iterkeys())) - - def __str__(self): - return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate())) - -def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, maxQScore = 10000000, mustBeVariant = False, skip = None ): - if filter == None: - filter = not OPTIONS.unfiltered - - f = open(file) - header, columnNames, lines = readVCFHeader(f) - - nLowQual = 0 - counter = 0 - def parseVariant(args): - global nLowQual, counter - header1, VCF, counter = args - counter += 1 - #print counter, skip, counter % skip - if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant - #print 'filtering', VCF - return None - elif VCF.getQual() <= minQScore or VCF.getQual() > maxQScore: - #print 'filtering', VCF.getQual() - #nLowQual += 1 - return None - elif skip <> None and counter % skip <> 0: - #print 'skipping' - return None - elif random.random() <= downsampleFraction: - #print 'keeping', VCF - return VCF - else: - return None - - variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords))) - if nLowQual > 0: - print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore) - return header, variants - -def selectVariants( variants, selector = None ): - if selector <> None: - return filter(selector, variants) - else: - return variants - -def titv(variants): - ti = len(filter(VCFRecord.isTransition, variants)) - tv = len(variants) - ti - titv = ti / (1.0*max(tv,1)) - - return titv - -def dbSNPRate(variants): - inDBSNP = len(filter(VCFRecord.isKnown, variants)) - return float(inDBSNP) / max(len(variants),1) - -def gaussian(x, mu, sigma): - constant = 1 / math.sqrt(2 * math.pi * sigma**2) - exponent = -1 * ( x - mu )**2 / (2 * sigma**2) - return constant * math.exp(exponent) - -# if target = T, and FP calls have ti/tv = 0.5, we want to know how many FP calls -# there are in N calls with ti/tv of X. -# -def titvFPRateEstimate(variants, target): - titvRatio = titv(variants) - - # f <- function(To,T) { (To - T) / (1/2 - T) + 0.001 } - def theoreticalCalc(): - if titvRatio >= target: - FPRate = 0 - else: - FPRate = (titvRatio - target) / (0.5 - target) - FPRate = min(max(FPRate, 0), 1) - TPRate = max(min(1 - FPRate, 1 - dephredScale(OPTIONS.maxQScoreForCovariate)), dephredScale(OPTIONS.maxQScoreForCovariate)) - if DEBUG: print 'FPRate', FPRate, titvRatio, target - assert FPRate >= 0 and FPRate <= 1 - return TPRate - - # gaussian model - def gaussianModel(): - LEFT_HANDED = True - sigma = 1 # old value is 5 - constant = 1 / math.sqrt(2 * math.pi * sigma**2) - exponent = -1 * ( titvRatio - target )**2 / (2 * sigma**2) - TPRate = gaussian(titvRatio, target, sigma) / gaussian(target, target, sigma) - if LEFT_HANDED and titvRatio >= target: - TPRate = 1 - TPRate -= dephredScale(OPTIONS.maxQScoreForCovariate) - if DEBUG: print 'TPRate', TPRate, constant, exponent, dephredScale(OPTIONS.maxQScoreForCovariate) - return TPRate - - FPRate = 1 - theoreticalCalc() - #FPRate = 1 - gaussianModel() - nVariants = len(variants) - - if DEBUG: print ':::', nVariants, titvRatio, target, FPRate - - return titvRatio, FPRate - -def phredScale(errorRate): - return -10 * math.log10(max(errorRate, 1e-10)) - -def dephredScale(qscore): - return math.pow(10, float(qscore) / -10) - -def frange6(*args): - """A float range generator.""" - start = 0.0 - step = 1.0 - - l = len(args) - if l == 1: - end = args[0] - elif l == 2: - start, end = args - elif l == 3: - start, end, step = args - if step == 0.0: - raise ValueError, "step must not be zero" - else: - raise TypeError, "frange expects 1-3 arguments, got %d" % l - - v = start - while True: - if (step > 0 and v >= end) or (step < 0 and v <= end): - raise StopIteration - yield v - v += step - -def compareFieldValues( v1, v2 ): - if type(v1) <> type(v2): - #print 'Different types', type(v1), type(v2) - c = cmp(type(v1), type(v2)) - else: - c = cmp(v1, v2) - #print 'Comparing %s %s = %s' % (v1, v2, c) - return c - -def calculateBins(variants, field, minValue, maxValue, partitions): - values = map(lambda x: x.getField(field), variants) - return calculateBinsForValues(values, field, minValue, maxValue, partitions) - -def calculateBinsForValues(values, field, minValue, maxValue, partitions): - sortedValues = sorted(values) - captureFieldRangeForPrinting(field, sortedValues) - - targetBinSize = len(values) / (1.0*partitions) - #print sortedValues - uniqBins = groupby(sortedValues) - binsAndSizes = map(lambda x: [x[0], len(list(x[1]))], uniqBins) - #print 'BINS AND SIZES', binsAndSizes - - def bin2Break(bin): return [bin[0], bin[0], bin[1]] - bins = [bin2Break(binsAndSizes[0])] - for bin in binsAndSizes[1:]: - #print ' Breaks', bins - #print ' current bin', bin - curLeft = bins[-1][0] - curSize = bin[1] - prevSize = bins[-1][2] - #print curSize, prevSize - if curSize + prevSize > targetBinSize or (not isNumber(curLeft) and isNumber(bin[0])): - #print ' => appending', bin2Break(bin) - bins.append(bin2Break(bin)) - else: - bins[-1][1] = bin[0] - bins[-1][2] += curSize - - #print 'Returning ', bins - #sys.exit(1) - return bins - -def fieldRange(variants, field): - values = map(lambda v: v.getField(field), variants) - minValue = min(values) - maxValue = max(values) - #rangeValue = maxValue - minValue - bins = calculateBins(variants, field, minValue, maxValue, OPTIONS.partitions) - validateBins(bins) - return minValue, maxValue, bins - -def validateBins(bins): - #print 'Bins are', bins - for left1, right1, count1 in bins: - for left2, right2, count2 in bins: - def contains2(x): - return left2 < x and x < right2 - - if left1 <> left2 and right1 <> right2: - if None in [left1, left2, right1, right2]: - pass # we're ok - elif contains2(left1) or contains2(right2): - raise Exception("Bad bins", left1, right1, left2, right2) - -def printFieldQualHeader(): - more = "" - if TRUTH_CALLS <> None: - more = CallCmp.HEADER - def p(stream): - if stream <> None: - print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more - p(sys.stdout) - p(RECAL_LOG) - -def printFieldQual( category, field, cc, variants ): - more = "" - if TRUTH_CALLS <> None: - callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS) - more = str(callComparison) - novels = selectVariants(variants, VCFRecord.isNovel) - def p(stream): - if stream <> None: - print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more - p(sys.stdout) - p(RECAL_LOG) - -FIELD_RANGES = dict() -def captureFieldRangeForPrinting(field, sortedValues): - """Finds the minimum float value in sortedValues for convenience printing in recal.log""" - #print sortedValues - floatValues = filter(isNumber, sortedValues) - if floatValues <> []: - FIELD_RANGES[field] = floatValues[0] - #print 'Setting field range to', field, FIELD_RANGES[field] - - -def isNumber(x): - return isinstance(x, (int, long, float)) - -def binString(field, cc): - epsilon = 1e-2 - left, right = cc.left, cc.right - leftStr = str(left) - rightStr = "%5s" % str(right) - if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES: - left = right = FIELD_RANGES[field] - epsilon - if OPTIONS.plottableNones and not isNumber(left) and isNumber(right): - left = right - epsilon - if isNumber(left): leftStr = "%.4f" % left - if isNumber(right): rightStr = "%.4f" % right - return '%12s %12s' % (leftStr, rightStr) - -# -# -# -def recalibrateCalls(variants, fields, callCovariates): - def phred(v): return phredScale(v) - - for variant in variants: - recalCall = RecalibratedCall(variant, fields) - originalQual = variant.getField('QUAL') - - for callCovariate in callCovariates: - if callCovariate.containsVariant(variant): - FPR = callCovariate.getFPRate() - recalCall.recalFeature(callCovariate.getFeature(), FPR) - recalCall.call.setField(callCovariate.getCovariateField(), phred(FPR)) - - #recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate())) - recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate())) - recalCall.call.setField('OQ', originalQual) - #print 'recalibrating', variant.getLoc() - #print ' =>', variant - yield recalCall.call - -# -# -# -def optimizeCalls(variants, fields, titvTarget): - callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True) - recalCalls = recalibrateCalls(variants, fields, callCovariates) - return recalCalls, callCovariates - -def printCallQuals(field, recalCalls, titvTarget, info = ""): - print '--------------------------------------------------------------------------------' - print info - calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False, category = "optimized-calls" ) - print 'Cumulative' - calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False, category = "optimized-calls" ) - -def all( p, l ): - for elt in l: - if not p(elt): return False - return True - - -def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]): - minValue, maxValue, featureBins = fieldRange(variants, field) - - #print 'BREAKQuals', breakQuals[0] - bins = [(x,y) for x in featureBins for y in breakQuals] - #print 'BINS', bins - - def variantsInBin(featureBin, qualRange): - right = featureBin[1] - if cumulative: - right = Range.ANY - cc = CallCovariate(field, Range(featureBin[0], right), qualRange) - - return cc, selectVariants(variants, lambda v: cc.containsVariant(v)) - - #sys.exit(1) - return starmap( variantsInBin, bins ) - -def qBreaksRanges(qBreaks, useBreaks): - if qBreaks == None or not useBreaks: - return [Range()] # include everything in a single range - else: - breaks = map(float, qBreaks.split(',')) - return map(lambda x, y: Range(x,y, rightOpen = True), chain([Range.ANY], breaks), chain(breaks, [Range.ANY])) - -def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ): - covariates = [] - - if printHeader: printFieldQualHeader() - for field in fields: - if DEBUG: print 'Optimizing field', field - - titv, FPRate = titvFPRateEstimate(variants, titvTarget) - #print 'Overall FRRate:', FPRate, nErrors, phredScale(FPRate) - - for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')): - #print 'CC', cc, field, useBreaks - if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint: - titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget) - #dbsnp = dbSNPRate(selectedVariants) - cc.FPRate = FPRate - covariates.append(cc) - printFieldQual( category, prefix + field, cc, selectedVariants ) - else: - print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants) - - return covariates - -class CallCmp: - def __init__(self, nTP, nFP, nFN): - self.nTP = nTP - self.nFP = nFP - self.nFN = nFN - -# def FPRate(self): -# return (1.0*self.nFP) / max(self.nTP + self.nFP, 1) - - def FNRate(self): - return (1.0*self.nFN) / max(self.nTP + self.nFN, 1) - - def sensitivity(self): - # = TP / (TP + FN) - return (1.0*self.nTP) / max( self.nTP + self.nFN,1 ) - - def PPV(self): - # = TP / (TP + FP) - return (1.0*self.nTP) / max( self.nTP + self.nFP, 1 ) - - HEADER = "TP FP FN FNRate Sensitivity PPV" - - def __str__(self): - return '%6d %6d %6d %.3f %.3f %.3f' % (self.nTP, self.nFP, self.nFN, self.FNRate(), self.sensitivity(), self.PPV()) - -def variantInTruth(variant, truth): - if variant.getLoc() in truth: - return truth[variant.getLoc()] - else: - return False - -def isVariantInSample(t, sample): - #print "isVariantInSample", t.getLoc(), t.getField(sample), x - return t.getField(sample) <> "0/0" - -def variantsInTruth(truth): - # fixme - return len(filter(lambda x: isVariantInSample(x, OPTIONS.useSample), truth)) - -def sensitivitySpecificity(variants, truth): - nTP, nFP = 0, 0 - FPs = [] - for variant in variants: - t = variantInTruth(variant, truth) - - isTP, isFP = False, False - if OPTIONS.useSample or OPTIONS.onlyAtTruth: - if t: # we have a site - isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant()) - isFP = not isTP - else: - isTP = t - isFP = not t - - #if variant.getLoc() == "1:867694": - # print variant, 'T: [', t, '] isTP, isFP', isTP, isFP - - if isTP: - t.setField("FN", 0) - variant.setField("TP", 1) - nTP += 1 - elif isFP: - nFP += 1 - variant.setField("TP", 0) - #print t, variant, "is a FP!" - FPs.append(variant) - nRef = 0 # len(filter(lambda x: not x.isVariant(), truth.itervalues())) - nFN = variantsInTruth(truth.itervalues()) - nTP - nRef - #print 'nRef', nTP, nFP, nFN, nRef - return CallCmp(nTP, nFP, nFN), FPs - -def markTruth(calls): - if not OPTIONS.useSample: - for variant in calls.itervalues(): - variant.setField("TP", 0) # set the TP field to 0 - -def compareCalls(calls, truthCalls): - #markTruth(truthCalls) - - def compare1(name, cumulative): - for field in ["QUAL", "OQ"]: - for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative): - #print selectedVariants[0] - printFieldQual("truth-comparison-" + name, field, cc, selectedVariants ) - - print 'PER BIN nCalls=', len(calls) - compare1('per-bin', False) - - print 'CUMULATIVE nCalls=', len(calls) - compare1('cum', True) - -def randomSplit(l, pLeft): - def keep(elt, p): - if p < pLeft: - return elt, None - else: - return None, elt - data = [keep(elt, p) for elt, p in zip(l, map(lambda x: random.random(), l))] - def get(i): return filter(lambda x: x <> None, [x[i] for x in data]) - return get(0), get(1) - -def setup(): - global OPTIONS, header - usage = "usage: %prog files.list [options]" - parser = OptionParser(usage=usage) - parser.add_option("-f", "--f", dest="fields", - type='string', default="QUAL", - help="Comma-separated list of fields (either in the VCF columns of as INFO keys) to use during optimization [default: %default]") - parser.add_option("-t", "--truth", dest="truth", - type='string', default=None, - help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls") - parser.add_option("-l", "--recalLog", dest="recalLog", - type='string', default="recal.log", - help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls") - parser.add_option("-u", "--unfiltered", dest="unfiltered", - action='store_true', default=False, - help="If provided, unfiltered calls will be used in comparisons [default: %default]") - parser.add_option("", "--plottable", dest="plottableNones", - action='store_true', default=False, - help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]") - parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth", - action='store_true', default=False, - help="If provided, we only consider TP/FP/FN at truth sites[default: %default]") - parser.add_option("-p", "--partitions", dest="partitions", - type='int', default=25, - help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]") - parser.add_option("", "--maxRecordsForCovariates", dest="maxRecordsForCovariates", - type='int', default=2000000, - help="Derive covariate information from up to this many VCF records. For files with more than this number of records, the system downsamples the reads [default: %default]") - parser.add_option("-m", "--minVariantsPerBin", dest="minVariantsPerBin", - type='int', default=10, - help="Don't include any covariates with fewer than this number of variants in the bin, if such a thing happens. NEEDS TO BE FIXED") - parser.add_option("-M", "--maxRecords", dest="maxRecords", - type='int', default=None, - help="Maximum number of input VCF records to process, if provided. Default is all records") - parser.add_option("-q", "--qMin", dest="minQScore", - type='int', default=-1, - help="The minimum Q score of the initial SNP list to consider for selection [default: %default]") - parser.add_option("-Q", "--qMax", dest="maxQScore", - type='int', default=1000000, - help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]") - parser.add_option("", "--QBreaks", dest="QBreaks", - type='string', default=None, - help="Breaks in QUAL for generating covarites [default: %default]") - parser.add_option("", "--maxQScoreForCovariate", dest="maxQScoreForCovariate", - type='int', default=60, - help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]") - parser.add_option("-o", "--outputVCF", dest="outputVCF", - type='string', default="recal.vcf", - help="If provided, a VCF file will be written out to this file [default: %default]") - parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF", - type='string', default=None, - help="If provided, VCF file will be written out to this file [default: %default]") - parser.add_option("", "--titv", dest="titvTarget", - type='float', default=None, - help="If provided, we will optimize calls to the targeted ti/tv rather than that calculated from known calls [default: %default]") - parser.add_option("-b", "--bootstrap", dest="bootStrap", - type='float', default=None, - help="If provided, the % of the calls used to generate the recalibration tables. [default: %default]") - parser.add_option("-k", "--skip", dest="skip", - type=int, default=None, - help="If provided, we'll only take every nth record [default: %default]") - parser.add_option("-s", "--useSample", dest="useSample", - type='string', default=False, - help="If provided, we will examine sample genotypes for this sample, and consider TP/FP/FN in the truth conditional on sample genotypes [default: %default]") - parser.add_option("-r", "--dontRecalibrate", dest="dontRecalibrate", - action='store_true', default=False, - help="If provided, we will not actually do anything to the calls, they will just be assessed [default: %default]") - - (OPTIONS, args) = parser.parse_args() - if len(args) > 2: - parser.error("incorrect number of arguments") - return args - -def assessCalls(file): - print 'Counting records in VCF', file - numberOfRecords = 1 - downsampleFraction = 1 - if OPTIONS.maxRecords <> None: - numberOfRecords = quickCountRecords(open(file)) - if OPTIONS.maxRecords < numberOfRecords: - numberOfRecords = OPTIONS.maxRecords - downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1) - #print 'Reading variants', OPTIONS.skip, downsampleFraction - header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip) - allCalls = list(allCalls) - print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records' - print 'Number of selected VCF records', len(allCalls) - - titvtarget = OPTIONS.titvTarget - if titvtarget == None: - titvtarget = titv(selectVariants(allCalls, VCFRecord.isKnown)) - print 'Ti/Tv all ', titv(allCalls) - print 'Ti/Tv known', titv(selectVariants(allCalls, VCFRecord.isKnown)) - print 'Ti/Tv novel', titv(selectVariants(allCalls, VCFRecord.isNovel)) - - return header, allCalls, titvtarget - -def determineCovariates(allCalls, titvtarget, fields): - if OPTIONS.bootStrap: - callsToOptimize, recalEvalCalls = randomSplit(allCalls, OPTIONS.bootStrap) - else: - callsToOptimize = allCalls - - recalOptCalls, covariates = optimizeCalls(callsToOptimize, fields, titvtarget) - printCallQuals("QUAL", list(recalOptCalls), titvtarget, 'OPTIMIZED CALLS') - - if OPTIONS.bootStrap: - recalibatedEvalCalls = recalibrateCalls(recalEvalCalls, fields, covariates) - printCallQuals("QUAL", list(recalibatedEvalCalls), titvtarget, 'BOOTSTRAP EVAL CALLS') - - return covariates - -def writeRecalibratedCalls(file, header, calls): - if file: - f = open(file, 'w') - #print 'HEADER', header - i = 0 - for line in formatVCF(header, calls): - if i % 10000 == 0: print 'writing VCF record', i - i += 1 - print >> f, line - f.close() - -def readTruth(truthVCF): - print 'Reading truth file', truthVCF - rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1]) - truth = dict( [[v.getLoc(), v] for v in rawTruth]) - print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth) - return truth - -def evaluateTruth(header, callVCF, truth, truthVCF): - print 'Reading variants back in from', callVCF - header, calls = readVariants(callVCF) - calls = list(calls) - - print '--------------------------------------------------------------------------------' - print 'Comparing calls to truth', truthVCF - print '' - - compareCalls(calls, truth) - - writeRecalibratedCalls(callVCF, header, calls) - - def isFN(v): - return isVariantInSample(v, OPTIONS.useSample) and not v.hasField("FN") - - if truth <> None and OPTIONS.FNoutputVCF: - f = open(OPTIONS.FNoutputVCF, 'w') - #print 'HEADER', header - for line in formatVCF(header, filter( isFN, truth.itervalues())): - print >> f, line - f.close() - -TRUTH_CALLS = None -RECAL_LOG = None -def main(): - global TRUTH_CALLS, RECAL_LOG - - args = setup() - fields = OPTIONS.fields.split(',') - - truthVCF = None - #print("LENGTH OF ARGS "+str(len(args))) - - if OPTIONS.truth <> None: - truthVCF = OPTIONS.truth - TRUTH_CALLS = readTruth(truthVCF) - - if OPTIONS.recalLog <> None: - RECAL_LOG = open(OPTIONS.recalLog, "w") - print >> RECAL_LOG, "# optimized vcf", args[0] - print >> RECAL_LOG, "# truth vcf", truthVCF - for key, value in OPTIONS.__dict__.iteritems(): - print >> RECAL_LOG, '#', key, value - - header, allCalls, titvTarget = assessCalls(args[0]) - if not OPTIONS.dontRecalibrate: - covariates = determineCovariates(allCalls, titvTarget, fields) - print OPTIONS - header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip) - RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates) - writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls) - else: - printFieldQualHeader() - printCallQuals("QUAL", allCalls, titvTarget) - OPTIONS.outputVCF = args[0] - - if truthVCF <> None: - evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF) - - -PROFILE = False -if __name__ == "__main__": - if PROFILE: - import cProfile - cProfile.run('main()', 'fooprof') - import pstats - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - p.sort_stats('time').print_stats(10) - p.sort_stats('time', 'cum').print_stats(.5, 'init') - else: - main() diff --git a/python/subsetDbSNPWithrsIDs.py b/python/subsetDbSNPWithrsIDs.py deleted file mode 100755 index 3dfe69df8..000000000 --- a/python/subsetDbSNPWithrsIDs.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -from optparse import OptionParser - -def getRsIDSet(dbSNPIds, idIndex): - s = set() - for line in open(dbSNPIds): - #print line - rsID = line.split()[idIndex] - s.add(rsID) - - return frozenset(s) - - -def main(): - global OPTIONS - usage = "usage: %prog [options] dbSNP.in.rsids dbSNP.to.match dbSNP.out" - parser = OptionParser(usage=usage) - parser.add_option("-v", "--verbose", dest="verbose", - action='store_true', default=False, - help="") - - (OPTIONS, args) = parser.parse_args() - if len(args) != 3: - parser.error("incorrect number of arguments") - - dbSNPIds = args[0] - dbSNPMatch = args[1] - dbSNPOut = args[2] - idIndex = 4 - - rsSet = getRsIDSet(dbSNPIds, idIndex) - print 'rsID set has %d elements' % len(rsSet) - - # - count = 0 - matched = 0 - out = open(dbSNPOut, 'w') - for line in open(dbSNPMatch): - count += 1 - rsID = line.split()[idIndex] - if rsID in rsSet: - #sys.stdout.write(line) - matched += 1 - out.write(line) - print 'Processed %d lines, matching %d elements, excluding %d' % ( count, matched, count - matched ) - out.close() - -if __name__ == "__main__": - main() diff --git a/python/tgtc2sam.py b/python/tgtc2sam.py deleted file mode 100755 index 009d992f8..000000000 --- a/python/tgtc2sam.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env python - -"""Starts from TGTC list of fastb, qualb, and qltout files and creates merged SAM/BAM files""" - -import os, commands, sys -from farm_commands import cmd - -# Global list of failed conversions to be reported at end of execution -failed_list = [] - -class FailedConversion: - def __init__(self, lane, error_str): - self.lane = lane - self.error_str = error_str - print self - def __str__(self): - return self.error_str+" while attempting conversion of "+str(self.lane) - -class lane: - bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"; - pipe_dirs_head = "/slxa/pipelineOutput/farm" - pipe_dirs = os.listdir(pipe_dirs_head) - pipe_dirs_dict = dict( [(dir.split("_")[1], dir) for dir in pipe_dirs if "_" in dir] ) - - def __init__(self, line): - fields = line.split(" ") - if len(fields) == 5: - self.fdl, self.fastb, self.qualb, self.qltout, group = fields - else: - self.fdl, self.fastb, self.qualb, self.qltout = fields - self.qltout = self.qltout.rstrip() - - self.fqltout = os.path.splitext(self.qltout)[0]+".fais_filtered.qltout" - self.flowcell, self.date, self.lane = self.fdl.split(".") - self.flowlane = self.flowcell+"."+self.lane - self.path_head = os.path.splitext(self.qltout)[0] - self.sam = self.path_head+".sam" - self.bam = self.path_head+".bam" - self.head = ".".join(os.path.basename(self.path_head).split(".")[:2]) # should amount to FLOWCELL.LANE - - #self.refdict_metrics() - - def pipe_dir(self): - dir = lane.pipe_dirs_dict[self.flowcell] - return lane.pipe_dirs_head+"/"+dir - - def set_default_refdict(self): - self.refdict = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict"; - #print "WARNING: Using standard Hg18 reference dictionary (refdict) for",self - - def set_default_metrics(self): - self.metrics = "/broad/1KG/legacy_data/tgtc2sam/unknown.metrics" - #print "WARNING: Using default unkonwn.metrics file for",self - - def refdict_metrics(self): - "Ensures that a reference dictionary and metrics are available" - try: - pipe_lane_dir = self.pipe_dir() - except KeyError: - self.set_default_refdict() - self.set_default_metrics() - return True - - self.metrics = pipe_lane_dir+"/"+str(self.flowlane)+".metrics" - if os.path.exists(self.metrics): - line = commands.getoutput("grep '^reference=' "+pipe_lane_dir+"/"+self.flowlane+".metrics") - if line != "": - reference_fasta = line.split("=")[1] - self.refdict = os.path.splitext(reference_fasta)[0]+'.dict' - if "PhiX" in self.refdict: - failed_list.append(FailedConversion(self, "PhiX dictionary missing")) - return False - else: - self.set_default_refdict() - return True # We were able to generate refdict and metrics files - else: - self.set_default_refdict() - self.set_default_metrics() - return True # We were able to generate refdict file and substituted unknown.metrics as the metrics file - - #failed_list.append(FailedConversion(self, "Could not find metrics file")) - #return False - - def __str__(self): - return self.fdl - -def usage(): - print "Usage: tgtc_sam.py ..." - print - print " Works from tgtc files which contain fastb, qualb, qltout filenames corresponding to one lane of data" - - print " Commands and command arguments:" - print " convert tgtc_list - given a tgtc file converts files to SAM and BAM" - print " tgtc - file with lines of fastb, qualb, qltout filenames" - print " [clean] - optional 3rd argument that will create SAM/BAM files" - print " regardless of whether they already exist" - print - print " merge tgtc output_file - merges converted BAM files" - print " tgtc - file with lines of fastb, qualb, qltout filenames" - print " output_file - path and name of merged BAM file" - print - print " lists list_of_tgtc_lists - run over a list of tgtc lists given in a file" - print " list_of_tgtc_files - file with tgtc filenames in it" - print - print " Global options:" - print " print_only: do not run commands, just print what they would be " - sys.exit(1) - -def make_lane_list(tgtc_file): - all_lanes = [lane(l) for l in open(tgtc_file) if not l.startswith("#")] - lanes = [] - for ln in all_lanes: - if ln.refdict_metrics(): # if we were able to generate refdict and metrics - # append this lane to the lanes list that we will work with - lanes.append(ln) - unpaired = [l for l in lanes if ".end1." not in l.qualb and ".end2." not in l.qualb] - paired1 = [l for l in lanes if ".end1." in l.qualb] - paired2 = [l for l in lanes if ".end2." in l.qualb] - - print "Unpaired lanes:",len(unpaired) - print "Paired 1 lanes:",len(paired1) - print "Paired 2 lanes:",len(paired2) - assert len(paired1) == len(paired2) - - return [(u, None) for u in unpaired] + zip(paired1, paired2) - -def convert_lanes(tgtc_file, recreate_all=False, just_print_commands=False): - lanes_done = 0 - lane_list = make_lane_list(tgtc_file) - print "Beginning processing of these lanes (unpaired and paired):" - print "\n".join(["%s %s" % (p1,p2) for p1,p2 in lane_list]) - - for p1, p2 in lane_list: - if recreate_all or not os.path.exists(p1.sam) or not os.path.getsize(p1.sam): - - # Filer reads with > 4 errors - cmd_str = "\"FilterAlignsILTStyle QLTIN="+p1.qltout+" QLTOUT="+p1.fqltout - - if p2 == None: - # This is an UNPAIRED LANE - print "Processing unpaired lane",p1 - cmd_str += (" && Qltout2SAM"+ - " HEAD="+p1.head+ - " SAM="+p1.sam+ - " QLTOUT="+p1.fqltout+ - " FASTB="+p1.fastb+ - " QUALB="+p1.qualb+ - " REFDICT="+p1.refdict+ - " METRICS="+p1.metrics+ - " ALIGNER=PairFinder"+ - " TMPDIR=/broad/hptmp/andrewk") - else: - # This is a PAIRED LANE - print "Processing paired lanes",p1,p2 - if p1.flowcell != p2.flowcell or p1.lane != p2.lane: - print "### WARNING ### paired lanes ",p1,p2,"do not match! Skipping." - continue - # Filter 2nd Qltout file too - cmd_str += " && FilterAlignsILTStyle QLTIN="+p2.qltout+" QLTOUT="+p2.fqltout - cmd_str += (" && QltoutPair2SAM"+ - " HEAD="+p1.head+ - " SAM="+p1.sam+ - " QLTOUT1="+p1.fqltout+" QLTOUT2="+p2.fqltout+ - " FASTB1="+p1.fastb+" FASTB2="+p2.fastb+ - " QUALB1="+p1.qualb+" QUALB2="+p2.qualb+ - " REFDICT="+p1.refdict+ - " METRICS="+p1.metrics+ - " ALIGNER=PairFinder"+ - " TMPDIR=/broad/hptmp/andrewk") - - # Make the BAM file - cmd_str += " && /seq/dirseq/samtools/current/samtools import "+lane.bam_ref_list+" "+p1.sam+" "+p1.bam+"\"" - # Now remove one or both SAM files if all previous commands returned 0 for success - cmd_str += " && rm -f "+p1.sam - if p2 != None: cmd_str += " && rm -f "+p2.sam - - status = cmd(cmd_str, "long", output_head=p1.path_head, just_print_commands=just_print_commands) - if status == 0: - lanes_done += 1 - else: - failed_list.append("Unkown failure") - #if lanes_done + len(failed_list) > 0: - #break - - failed = len(failed_list) - lanes_total = lanes_done+failed - percent_done = lanes_done / lanes_total if lanes_done != 0 else 0 - print "Lane conversion success: %d/%d (%.1f%%)" % (lanes_done, lanes_total, percent_done) - -def merge_bams(tgtc_file, outfile, just_print_commands=False): - input_lane_list = make_lane_list(tgtc_file) - #lane_list = [lp for lp in lane_list if lp[0].flowcell =="30LLT"] - missing_files = False - lane_list = [] - for p1,p2 in input_lane_list: - if "adapted" not in p1.qualb or (p2 != None and "adapted" not in p2.qualb): - print "NOT ADAPTED", p1.qualb, p2.qualb - if not os.path.exists(p1.bam): - #print "Missing file:",p1.bam - missing_files = True - else: - lane_list.append( (p1,p2) ) - - #return - if not missing_files: - if not os.path.exists(outfile): - print len(lane_list),"lanes / lane pairs to merge" - if not os.path.exists(os.path.dirname(outfile)): - os.makedirs(os.path.dirname(outfile)) - - cmd("java -cp /home/radon01/depristo/dev/workspace/Libraries/bin"+ - " edu.mit.broad.picard.sam.MergeSamFiles"+ - " "+" ".join(["I="+p1.bam for p1,p2 in lane_list])+ - " O="+outfile+ - " TMP_DIR=/broad/hptmp/andrewk/"+ - " && /seq/dirseq/samtools/current/samtools index "+outfile+ - " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100", - "long", outfile, just_print_commands=just_print_commands) - - # Original samtools merge command below that was used for rev. 1 merging - - #cmd("/seq/dirseq/samtools/current/samtools merge "+outfile+ - # " "+" ".join([p1.bam for p1,p2 in lane_list])+ - # " && /seq/dirseq/samtools/current/samtools index "+outfile+ - # " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100", - # "long", outfile) - - else: - # Convert the lanes that had missing outfiles - #convert_lanes(tgtc_file) - pass - -def process_tgtc_lists(tgtc_list_file): - "Works through a list of tgtc files and merges them to produce the output" - lists_done = 0 - - for line in open(tgtc_list_file): - if line[0] == '#': - continue - fields = line.split() - if len(fields) != 2: - continue - head, tgtc_list = fields - bam = head+".bam" - if True: #not os.path.exists(bam): - print "Processing tgtc_list_file",tgtc_list,"with goal of producing",bam - merge_bams(tgtc_list, bam) - print - else: - print bam,"already exists\n" - - lists_done += 1 - #if lists_done > 0: - # break - -if __name__ == "__main__": - if len(sys.argv) < 2: - usage() - operation = sys.argv[1] - - # Check for print commands only option - if "print_only" in sys.argv: - just_print_commands = True - sys.argv.remove("print_only") - else: - just_print_commands = False - - if operation == "convert": - if len(sys.argv) < 3: - usage() - tgtc_file = sys.argv[2] - # Run convert_lanes and recreate_all == True if clean option was given as 3rd arg - convert_lanes(tgtc_file, len(sys.argv) > 3 and sys.argv[3] == "clean", just_print_commands=just_print_commands) - elif operation == "merge": - if len(sys.argv) < 4: - usage() - tgtc_file = sys.argv[2] - outfile = sys.argv[3] - merge_bams(tgtc_file, outfile, just_print_commands=just_print_commands) - elif operation == "lists": - tgtc_list_file = "/broad/1KG/legacy_data/tgtc2sam/tgtc_lists_2_legacy_bams" - if len(sys.argv) >= 3: - tgtc_list_file = sys.argv[2] - process_tgtc_lists(tgtc_list_file) - else: - print "Didn't understand operation value: "+operation - usage()