Deleting old python code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4961 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-01-07 21:33:45 +00:00
parent 3362f0c280
commit e64a300642
44 changed files with 0 additions and 6901 deletions

View File

@ -1,3 +0,0 @@
# Invocations of MergeBAMBatch.py used for the 1KG freeze-5 merges.
# -d: output directory, -q: LSF queue ('gsa'); presumably -n maps sample
# IDs to populations and -s is a sample list -- confirm against MergeBAMBatch.py.
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d /humgen/gsa-hphome1/projects/1kg_pilot1/mergedBamsByPopulation/ -q gsa /broad/1KG/DCC_merged/lists/low_coverage_freeze5.list -n /broad/1KG/DCC_merged/lists/naids_and_pop.txt
python ~/dev/GenomeAnalysisTK/trunk/python/MergeBAMBatch.py -d freeze5.1 -q gsa -s /broad/1KG/DCC_merged/lists/trios_freeze5.list

View File

@ -1,47 +0,0 @@
#!/usr/bin/env python
import os,sys
from farm_commands import cmd
def run_paired_BWA(bam):
    """Align a paired-end BAM: BamToBfq -> maq bfq2fastq -> bwa aln/sampe,
    then samtools-import the SAM to '<root>.aligned.bam'."""
    lane = 0
    base = os.path.splitext(bam)[0]
    # BAM -> one BFQ file per read end.
    cmd("/seq/software/picard/1.21/bin/BamToBfq.jar I=%s LANE=%d FLOWCELL_BARCODE=%s PAIRED_RUN=True RUN_BARCODE=%s ANALYSIS_DIR=." % (bam, lane, base, base))
    # BamToBfq names its outputs '<base>.<lane>.0.<end>'.
    stems = ["%s.%d.0.%s" % (base, lane, end) for end in ("1", "2")]
    # BFQ -> FASTQ for each end.
    for stem in stems:
        cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq %s.bfq %s.fq" % (stem, stem))
    # Independent bwa aln pass per end.
    for stem in stems:
        cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta %s.fq >%s.sai" % (stem, stem))
    # Pair the two ends into a single SAM.
    alignedsam = base + ".aligned.sam"
    cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta %s.sai %s.sai %s.fq %s.fq > %s" % (stems[0], stems[1], stems[0], stems[1], alignedsam))
    # SAM -> BAM using the assembly18 reference list.
    bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
    cmd("/seq/dirseq/samtools/current/samtools import %s %s %s.aligned.bam" % (bam_ref_list, alignedsam, base))
if __name__ == "__main__":
    # Require exactly one positional argument: the input BAM path.
    if len(sys.argv) < 2:
        print "Usage: AlignBam.py BAM_FILE"
        sys.exit()
    bam = sys.argv[1]
    run_paired_BWA(bam)

View File

@ -1,41 +0,0 @@
import os,sys
from farm_commands import cmd
def run_BWA(bam, paired=True):
"Takes a paired end BAM file and outputs an aligned '.aligned.bam' file "
lane = 0
root = os.path.splitext(bam)[0]
print root
# BAM to BFQ
cmd("/seq/software/picard/current/bin/BamToBfq.jar I="+bam+" LANE="+str(lane)+" FLOWCELL_BARCODE="+root+" PAIRED_RUN=True RUN_BARCODE="+root+" ANALYSIS_DIR=.")
# BFQ to FQ
root1 = root+"."+str(lane)+".0.1"
root2 = root+"."+str(lane)+".0.2"
bfq1 = root1+".bfq"
bfq2 = root2+".bfq"
fq1 = root1+".fq"
fq2 = root2+".fq"
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq1+" "+fq1)
cmd("/seq/dirseq/maq-0.7.1/maq bfq2fastq "+bfq2+" "+fq2)
# Run BWA
sai1 = root1+".sai"
sai2 = root2+".sai"
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq1+" >"+sai1)
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa aln /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+fq2+" >"+sai2)
# Pairing
alignedsam = root+".aligned.sam"
cmd("/seq/dirseq/bwa/bwa-0.4.9/bwa sampe /humgen/gsa-scr1/ebanks/bwaref/Homo_sapiens_assembly18.fasta "+sai1+" "+sai2+" "+fq1+" "+fq2+" > "+alignedsam)
# SAM to BAM
bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"
alignedbam = root+".aligned.bam"
cmd("/seq/dirseq/samtools/current/samtools import "+bam_ref_list+" "+alignedsam+" "+alignedbam)
if __name__ == "__main__":
    # NOTE(review): the BAM paths are collected but nothing in view invokes
    # run_BWA on them -- the diff rendering may have dropped the driver loop;
    # confirm against the original file before reuse.
    bamfiles = sys.argv[1:]

View File

@ -1,64 +0,0 @@
#!/usr/bin/python
import os,sys
from farm_commands import cmd
def get_line_number_from_file(filename, line_num):
    """Return the 0-based `line_num`-th line of `filename` (newline included),
    or None when the file has fewer lines.

    Fixed: the original leaked the file handle; a `with` block closes it.
    """
    with open(filename) as fh:
        for index, line in enumerate(fh):
            if index == line_num:
                return line
    return None
if __name__ == "__main__":
    from optparse import OptionParser
    class OptionParser (OptionParser):
        def check_required(self, opts):
            """Exit with an error if any option in `opts` is still None."""
            for opt in opts:
                option = self.get_option(opt)
                # Assumes the option's 'default' is set to None!
                if getattr(self.values, option.dest) is None:
                    self.error("%s option not supplied" % option)
    parser = OptionParser()
    parser.add_option("-q", "--queue", help="Queue to submit jobs to (default: long)", dest="queue", default="long")
    # BUG FIX: -l had default="" which defeated check_required's is-None test,
    # so a missing lane slipped through; None makes the check effective.
    parser.add_option("-l", "--lane", help="Lane number to process", dest="lane", default=None)
    parser.add_option("-t", "--tmpdir", help="Temp directory to use", dest="tmpdir")
    parser.add_option("-b", "--barcodes", help="File with barcodes as \"Id[TABa]barcode_sequence\" with one header line", dest="barcodes")
    # BUG FIX: -a and -r both stored into dest="ti", clobbering each other,
    # while the CalculateHsMetrics command below reads options.bait_list and
    # options.target_list.  Give each option its real destination.
    parser.add_option("-a", "--baits", help="Bait interval list (default: 1KG bait list)", dest="bait_list", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.baits.interval_list")
    parser.add_option("-r", "--targets", help="Target interval list (default: 1KG target list)", dest="target_list", default="/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list")
    (options, args) = parser.parse_args()
    parser.check_required(("-l","-t","-b"))
    solexa_dir = "."
    # Read the barcode sequences (column 2), skipping the single header line.
    barcodes = []
    barcode_file = open(options.barcodes)
    barcode_file.readline()
    for line in barcode_file:
        fields = line.split()
        barcodes.append(fields[1])
    farm = options.queue
    lane = options.lane
    alias = "lane"+str(lane)
    tmp_dir = options.tmpdir
    # Demultiplex, then convert the whole lane to one BAM.
    cmd("/seq/software/picard/current/bin/ExtractIlluminaBarcodes.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 METRICS_FILE=metrics.out BARCODE=" + " BARCODE=".join(barcodes))
    cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" O="+alias+".bam SAMPLE_ALIAS="+alias+" RUN_BARCODE="+alias+" TMP_DIR="+tmp_dir)
    # Per-barcode pipeline: BustardToSam -> align -> merge -> mark dups -> HS metrics.
    # Each step is chained on the farm via waitID so it runs after its input exists.
    for barcode in barcodes:
        # NOTE(review): SAMPLE_ALIAS is hard-coded to "lane5" here -- looks
        # like a leftover; confirm intended alias.
        id1 = cmd( "/seq/software/picard/current/bin/BustardToSam.jar B="+solexa_dir+" L="+str(lane)+" BARCODE_POSITION=1 BARCODE="+barcode+" O="+barcode+".bam RUN_BARCODE="+barcode+" SAMPLE_ALIAS=lane5 TMP_DIR="+tmp_dir, farm_queue=farm) #PAIRED_RUN=True
        id2 = cmd( "/home/radon01/andrewk/dev/Sting/trunk/python/AlignBam.py "+barcode+".bam", farm_queue=farm, waitID = id1)
        id3 = cmd("/seq/software/picard/1.21/bin/MergeSamFiles.jar I="+barcode+".aligned.bam O="+barcode+".aligned.merged.bam VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id2)
        id4 = cmd("java -Xmx4096m -jar /seq/software/picard/1.21/bin/MarkDuplicates.jar VALIDATION_STRINGENCY=SILENT I= "+barcode+".aligned.merged.bam O="+barcode+".aligned.duplicates_marked.bam M="+barcode+".aligned.bam.duplicates_marked.bam.duplicate_metrics TMP_DIR="+tmp_dir, farm_queue=farm, waitID=id3)
        cmd("java -jar /seq/software/picard/current/bin/CalculateHsMetrics.jar BI="+options.bait_list+" TI="+options.target_list+" I="+barcode+".aligned.duplicates_marked.bam M="+barcode+".hs_metrics VALIDATION_STRINGENCY=SILENT", farm_queue=farm, waitID = id4)
    # Collate the per-barcode hs_metrics into one table: line 6 of the first
    # file is the header, line 7 of each file is its values row.
    fout = open("hybrid_selection.metrics", "w")
    print >> fout, "BARCODE\t"+get_line_number_from_file(barcodes[0]+".hs_metrics", 6).rstrip()
    for barcode in barcodes:
        print >> fout, barcode+"\t"+get_line_number_from_file(barcode+".hs_metrics", 7).rstrip()
    fout.close()

View File

@ -1,357 +0,0 @@
#!/usr/bin/env python
import sys, itertools, FlatFileTable
from enum import Enum
def subset_list_by_indices(indices, list):
    """Return the elements of `list` at each position in `indices`, in order.

    (Note: the parameter is named `list`, shadowing the builtin; the name is
    kept for call compatibility.)
    """
    return [list[index] for index in indices]
def chunk_generator(record_gen, key_fields):
    """Input:
    record_gen: generator that produces dictionaries (records in database speak)
    key_fields: keys in each dictionary used to determine chunk membership
    Output:
    locus_chunk: list of consecutive lines that have the same key_fields

    Fixed: the original unconditionally yielded the trailing chunk, so an
    empty input produced one spurious empty list; the final yield is now
    guarded the same way as the in-loop yield.
    """
    locus_chunk = []
    last_key = ""
    first_record = True
    for record in record_gen:
        key = [record[f] for f in key_fields]
        if key == last_key or first_record:
            locus_chunk.append(record)
            first_record = False
        else:
            if locus_chunk != []:
                yield locus_chunk
            locus_chunk = [record]
        last_key = key
    if locus_chunk != []:
        yield locus_chunk
class call_stats:
    """Accumulator of genotyping/discovery statistics for one bin keyed by
    (true call type, downsampled coverage), plus static scorers for chunks
    of call records."""
    def __init__(self, call_type, coverage):
        # Bin identity.  NOTE: the `call_type` parameter shadows the
        # module-level call_type class inside this method.
        self.call_type = call_type
        self.coverage = coverage
        # Counters accumulated record-by-record via add_stat().
        self.calls = 0
        self.conf_ref_calls = 0
        self.conf_het_calls = 0
        self.conf_hom_calls = 0
        self.conf_var_calls = 0
        self.conf_genotype_calls = 0
        self.conf_refvar_calls = 0
        self.correct_genotype = 0
        self.correct_refvar = 0
    def add_stat(self, calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar):
        """Fold one record's (0/1) contributions into this bin's counters."""
        self.calls += calls
        self.conf_ref_calls += conf_ref_calls
        self.conf_het_calls += conf_het_calls
        self.conf_hom_calls += conf_hom_calls
        self.conf_var_calls += conf_var_calls
        self.conf_genotype_calls += conf_genotype_calls
        self.conf_refvar_calls += conf_refvar_calls
        self.correct_genotype += correct_genotype
        self.correct_refvar += correct_refvar
    @staticmethod
    def calc_discovery_stats(chunk):
        """Score variant-vs-reference (discovery) calls in `chunk`.
        Returns (correct_genotype, conf_calls, calls)."""
        conf_calls = 0
        correct_genotype = 0
        calls = 0
        for record in chunk:
            calls += 1
            # Confident when BtrLod >= 5, or the best call is Reference and
            # BtnbLod >= 5.  NOTE(review): `or` binds looser than `and`, so
            # this parses as A or (B and C) -- presumed intended.
            if float(record["BtrLod"]) >= 5 or call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"]) == call_type.call_types_2_state.Reference and float(record["BtnbLod"]) >= 5:
                conf_calls += 1
                if call_type.discovery_call_correct(record):
                    correct_genotype += 1
        return correct_genotype, conf_calls, calls
    @staticmethod
    def calc_genotyping_stats(chunk):
        """Score exact-genotype calls in `chunk`; confident when BtnbLod >= 5.
        Returns (correct_genotype, conf_calls, calls)."""
        conf_calls = 0
        correct_genotype = 0
        calls = 0
        for record in chunk:
            calls += 1
            if float(record["BtnbLod"]) >= 5:
                conf_calls += 1
                if call_type.genotyping_call_correct(record):
                    correct_genotype += 1
        return correct_genotype, conf_calls, calls
    @staticmethod
    def stats_header():
        # Column order must match stats_str(): prefix fields then __str__().
        return "TrueGenotype,Coverage,AccuracyConfidentGenotypingCalls,ConfidentGenotypingCallRate,AccuracyConfidentDiscoveryCalls,ConfidentDiscoveryCallRate,Calls,ConfRefCalls,ConfHetCalls,ConfHomCalls,ConfGenotypeCalls,CorrectGenotypes,ConfVarCalls,ConfDiscoveryCalls,CorrectDiscovery"
    def __str__(self):
        # Raw counters, in the order declared by stats_header()'s tail.
        return ",".join(map(str, (self.calls, self.conf_ref_calls, self.conf_het_calls, self.conf_hom_calls, self.conf_genotype_calls, self.correct_genotype, self.conf_var_calls, self.conf_refvar_calls, self.correct_refvar)))
    def stats_str(self):
        """One CSV row: bin identity, derived accuracy/call-rate ratios
        (max(...,1) guards divide-by-zero), then the raw counters."""
        return "%s,%d,%.5f,%.5f,%.5f,%.5f,%s" % (self.call_type, self.coverage, float(self.correct_genotype) / max(self.conf_genotype_calls,1), float(self.conf_genotype_calls) / max(self.calls,1), float(self.correct_refvar) / max(self.conf_refvar_calls,1), float(self.conf_refvar_calls) / max(self.calls,1), self.__str__())
class call_type:
    """Class that returns an Enum with the type of call provided by a record"""
    # NOTE(review): `Enum` here behaves like the legacy PyPI "enum" package
    # (positional member names, integer indexing, `.index` attribute) -- not
    # the stdlib `enum` module.  Confirm which package this file imports.
    call_types_3_state = Enum("HomozygousSNP","HeterozygousSNP","HomozygousReference")
    call_types_3_state_short = Enum("Hom","Het","Ref")
    call_types_2_state = Enum("Variant","Reference")
    @staticmethod
    def from_record_3_state(ref, genotype):
        """Given reference base as string, determine whether called genotype is homref, het, homvar"""
        # genotype.count(ref) counts reference alleles in the genotype string:
        # 0 -> HomozygousSNP, 1 -> HeterozygousSNP, 2 -> HomozygousReference.
        return call_type.call_types_3_state[genotype.count(ref)]
    @staticmethod
    def from_record_2_state(ref, genotype):
        """Given reference base as string, determine whether called genotype is ref or var"""
        # Fewer than two reference alleles implies at least one variant allele.
        return call_type.call_types_2_state[0] if genotype.count(ref) < 2 else call_type.call_types_2_state[1]
    @staticmethod
    def genotyping_call_correct(record):
        # Exact genotype match against the HapMap chip truth.
        return record["HapmapChipGenotype"] == record["BestGenotype"]
    @staticmethod
    def discovery_call_correct(record):
        # Variant-vs-reference agreement only; the genotype itself may differ.
        return call_type.from_record_2_state(record["ReferenceBase"][0], record["HapmapChipGenotype"]) == call_type.from_record_2_state(record["ReferenceBase"][0], record["BestGenotype"])
def print_record_debug(rec):
    # Dump one record's genotyping-vs-discovery correctness plus all of its
    # fields, for eyeballing disagreements between the two scoring modes.
    print "TEST Genotyping:", call_type.genotyping_call_correct(rec)
    print "TEST Discovery:", call_type.discovery_call_correct(rec)
    print "DIFF:", call_type.genotyping_call_correct(rec) != call_type.discovery_call_correct(rec)
    # Sorted field dump, one "key => value" pair per line.
    print "\n".join([" %20s => '%s'" % (k,v) for k,v in sorted(rec.items())])
    print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["HapmapChipGenotype"])
    print call_type.from_record_2_state(rec["ReferenceBase"][0],rec["BestGenotype"])
    print
def aggregate_stats(filename, max_loci, table_filename, debug):
    """Read call records from `filename`, score each record's genotyping and
    discovery correctness, aggregate per (true call type, coverage) bin, and
    write one call_stats CSV row per bin to `table_filename`."""
    aggregate = dict()  # (HapmapChipCallType, DownsampledCoverage) -> call_stats
    fout = open(table_filename,"w")
    fout.write(call_stats.stats_header()+"\n")
    # Group consecutive records sharing the same locus.
    locus_gen = chunk_generator(FlatFileTable.record_generator(filename, None), ("Sequence","Position"))
    for index, locus_chunk in enumerate(locus_gen):
        if index >= max_loci:
            break
        if (index % 1000) == 0:
            # Progress heartbeat on stderr.
            sys.stderr.write( str(index)+" loci processed, at: "+locus_chunk[0]["Sequence"]+":"+locus_chunk[0]["Position"]+"\n")
        covs = dict()  # NOTE(review): never used below.
        # Sub-group the locus by downsampled coverage.
        coverage_chunk_gen = chunk_generator(locus_chunk, ("DownsampledCoverage", "Sequence", "Position"))
        for cov_chunk in coverage_chunk_gen:
            first_record = cov_chunk[0]
            for record in cov_chunk:
                # Truth call type from the HapMap chip genotype.
                hapmap_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["HapmapChipGenotype"])
                key = hapmap_genotyping_call_type, int(record["DownsampledCoverage"])
                record["DownsampledCoverage"] = int(record["DownsampledCoverage"])
                record["HapmapChipCallType"] = hapmap_genotyping_call_type
                value = record  # NOTE(review): never used below.
                # Score this single record under both modes.
                correct_genotype, conf_genotype_calls, genotype_calls = call_stats.calc_genotyping_stats([record])
                correct_refvar, conf_refvar_calls, refvar_calls = call_stats.calc_discovery_stats([record])
                assert(genotype_calls == refvar_calls)
                # Attribute a confident genotype call to hom/het/ref per the
                # called (best) genotype's 3-state type.
                conf_ref_calls = 0
                conf_het_calls = 0
                conf_hom_calls = 0
                best_genotyping_call_type = call_type.from_record_3_state(record["ReferenceBase"][0],record["BestGenotype"])
                if conf_genotype_calls:
                    if best_genotyping_call_type.index == 0: conf_hom_calls = 1
                    if best_genotyping_call_type.index == 1: conf_het_calls = 1
                    if best_genotyping_call_type.index == 2: conf_ref_calls = 1
                # Attribute a confident discovery call to Variant if so called.
                conf_var_calls = 0
                if conf_refvar_calls:
                    this_variant_call_type = call_type.from_record_2_state(record["ReferenceBase"][0],record["BestGenotype"])
                    conf_var_calls = 1 if this_variant_call_type.index == 0 else 0
                # Create the bin on first sight, then fold the counts in.
                aggregate.setdefault(key, call_stats(*key))
                aggregate[key].add_stat(genotype_calls, conf_ref_calls, conf_het_calls, conf_hom_calls, conf_var_calls, conf_genotype_calls, conf_refvar_calls, correct_genotype, correct_refvar)
                if debug:
                    print "KEYS:",key
                    print_record_debug(record)
    # Emit one CSV row per bin, in sorted bin order.
    for key, records in sorted(aggregate.items()):
        fout.write(records.stats_str()+"\n")
    fout.close()
class weighted_avg:
    """Running weighted average: fold in (value, weight) pairs with add(),
    read the mean with return_avg()."""

    def __init__(self):
        # Weighted sum of values and total weight seen so far.
        self.sum = 0.0
        self.count = 0

    def add(self, value, counts):
        """Accumulate `value` observed with weight `counts`."""
        self.sum = self.sum + value * counts
        self.count = self.count + counts

    def return_avg(self):
        """Weighted mean so far; 0.0 when nothing has been added."""
        denominator = max(self.count, 1)
        return float(self.sum) / denominator
def stats_from_hist(options, depth_hist_filename, stats_filename, variant_eval_dir, depth_multiplier=1.0):
    """Predict genotype-calling performance for a coverage histogram by
    combining per-depth empirical stats (from `stats_filename`) with a
    genotype prior, printing expected confident-call counts.

    NOTE(review): `variant_eval_dir` is unused; the VariantEval comparison
    below reads options.variant_eval_file instead.
    """
    # Genotype prior: [P(hom-var), P(het), P(hom-ref)].  Earlier candidates:
    #prob_genotype = [1e-5, 1e-3, 1-1e-3]
    #prob_genotype = [0.37, 0.62, .0]
    #prob_genotype = [0.203, 0.304, .491]
    #prob_genotype = [0.216, 0.302, .481]
    prob_genotype = [0.205, 0.306, 0.491] # Based on CEU NA12878 actual hapmap chip calls
    #prob_genotype = [0.213, 0.313, 0.474] # Based on YRB NA19240 actual hapmap chip calls
    theta = 1.0/1850 # expected heterozygosity
    # NOTE(review): this overrides the hapmap-based prior above with a
    # theta-derived one, making the previous assignment dead.
    prob_genotype = [0.5 * theta, 1.0 * theta, 1 - 1.5*theta]
    # hist[d] = number of sites at depth d; indices are asserted to equal depth.
    hist = []
    hist_gen = FlatFileTable.record_generator(depth_hist_filename, sep=" ", skip_until_regex_line = "^depth count freq")
    for index, record in enumerate(hist_gen):
        assert int(record["depth"]) == index
        hist.append(int(record["count"]))
    # If upsampling is not done in the CoverageEval GATK module, the number of
    # observations of reads with high depth of coverage can be very low giving
    # poorly sampled stats (hence max_usable_depth below).
    # stats_dict[coverage][true_genotype] -> row from the empirical stats CSV.
    stats_dict = dict()
    stats_gen = FlatFileTable.record_generator(stats_filename, sep=",")
    for record in stats_gen:
        key1 = int(record["Coverage"])
        key2 = record["TrueGenotype"]
        stats_dict.setdefault(key1, dict()) # create a nested dict if it doesn't exist
        stats_dict[key1][key2] = record # create an entry for these keys
    acc = dict()        # true genotype -> weighted_avg of accuracy
    call_rate = dict()  # true genotype -> weighted_avg of call rate
    conf_calls = dict() # called-genotype enum -> expected confident call count
    start = 1
    end = 1000
    max_usable_depth = 40 # Depth of coverage beyond which stats are not sampled enough and we take the stat at this depth instead
    for depth, depth_count in enumerate(hist[start:end+1],start):
        try:
            # Apply the "downsampling" multiplier, clamping to at least 1.
            depth = max(int(float(depth*depth_multiplier)),1)
            if depth > max_usable_depth: # substitute a depth we know is well sampled
                depth = max_usable_depth
            depth_entries = stats_dict[depth]
        except KeyError:
            print "Stopped on depth",depth
            break
        if True:  # leftover guard; kept to preserve original structure
            for true_genotype, stat in depth_entries.items():
                for genotype in call_type.call_types_3_state:
                    conf_calls.setdefault(genotype, 0.0)
                    # P(confident call of this genotype at this depth/truth).
                    prob_conf_x_call = float(stat["Conf"+str(call_type.call_types_3_state_short[genotype.index])+"Calls"])/float(stat["Calls"])
                    # Expected confident calls = sites * P(call) * prior.
                    conf_calls[genotype] += depth_count * prob_conf_x_call * prob_genotype[genotype.index]
                acc.setdefault(true_genotype,weighted_avg())
                call_rate.setdefault(true_genotype,weighted_avg())
                acc[true_genotype].add(float(stat["AccuracyConfidentGenotypingCalls"]), depth_count)
                call_rate[true_genotype].add(float(stat["ConfidentGenotypingCallRate"]), depth_count)
    import numpy
    for genotype in call_type.call_types_3_state:
        # NOTE(review): acc/call_rate are keyed by the TrueGenotype strings
        # from the stats file; this assumes str(genotype) matches them exactly.
        print "%19s accuracy : %.3f" % (str(genotype), acc[str(genotype)].return_avg())
        print "%19s call rate: %.3f" % (str(genotype), call_rate[str(genotype)].return_avg())
    print "\nExpected performance given perfect accuracy and call rate:"
    print "%19s %7s %7s %7s" % ("", "Actual", "Perfect", "Diff.")
    total_hist_sites = numpy.sum(hist)
    total_predicted = 0
    for genotype in call_type.call_types_3_state:
        predicted = conf_calls[genotype]
        total_predicted += predicted
        # "Perfect" = prior probability times the total number of sites.
        perfect = prob_genotype[genotype.index]*total_hist_sites
        diff = perfect - predicted
        percent_of_possible = predicted / perfect * 100
        print "%19s calls: %8.0f %8.0f %8.0f %8.1f" % (genotype, predicted, perfect, diff, percent_of_possible)
    print " Total calls: %7d" % total_predicted
    print "\nCoverage histogram mean: %.2f" % numpy.average(range(len(hist)), weights=hist)
    # If a VariantEval output file was given, pull its het/hom site counts
    # for comparison against the predictions.
    if options.variant_eval_file != None:
        vareval_file = open(options.variant_eval_file)
        num_hets = None; num_homs = None
        for line in vareval_file:
            if "UNKNOWN_CALLED_VAR_HET_NO_SITES" in line: num_hets = line.rstrip().split()[2]
            if "UNKNOWN_CALLED_VAR_HOM_NO_SITES" in line: num_homs = line.rstrip().split()[2]
    if options.oneline_stats != None:
        # NOTE(review): num_hets/num_homs are only bound when
        # options.variant_eval_file was supplied -- NameError otherwise.
        oneline_stats = open(options.oneline_stats, "w")
        pred_hom = conf_calls[call_type.call_types_3_state[0]]
        pred_het = conf_calls[call_type.call_types_3_state[1]]
        print >>oneline_stats, "%s %.0f %s %.0f %s" % (options.oneline_stats, pred_hom, num_homs, pred_het, num_hets)
def usage(parser):
    """Print the parser's usage line to stdout, then terminate the process."""
    parser.print_usage()
    sys.exit()
if __name__ == "__main__":
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-c", "--genotype_call_file", help="GELI file to use in generating coverage stat table", dest="genotype_filename", )
parser.add_option("-s", "--stats_file", help="file to containing empirical genotyping stats", dest="stats_filename", default=None)
parser.add_option("-g", "--histogram_file", help="file containing counts of each depth of coverage", dest="hist_filename")
parser.add_option("-m", "--max_loci", help="maximum number of loci to parse (for debugging)", default=sys.maxint, dest="max_loci", type="int")
parser.add_option("-e", "--evaluate", help="evaluate genotypes; requires a stats file and a histogram file", default=False, dest="evaluate_genotypes", action="store_true")
parser.add_option("-d", "--debug", help="provide debugging output", default=False, dest="debug", action="store_true")
parser.add_option("-p", "--depth_multiplier", help="multiply all depths in histogram by this value; for \"downsampling\" depth", default=1.0, dest="depth_multiplier", type=float)
parser.add_option("-v", "--variant_eval_file", help="file with output of VariantEval genotype concordance to compare this prediction to", default=None, dest="variant_eval_file")
parser.add_option("-o", "--oneline_stats_file", help="output single, tabular line of stats to this file", default=None, dest="oneline_stats")
(options, args) = parser.parse_args()
if not options.evaluate_genotypes:
print "Creating performance tables from genotypes file"
#if options.stats_filename == None:
# sys.exit("Must provide -s stats fliname option")
if options.genotype_filename == None:
sys.exit("Must provide -c genotype call filename option")
stats_filename = options.stats_filename if options.stats_filename != None else options.genotype_filename+".stats"
aggregate_stats(options.genotype_filename, options.max_loci, options.stats_filename, options.debug)
else:
print "Evaluating genotypes"
print "Depth multiplier:",options.depth_multiplier
if options.hist_filename == None:
sys.exit("Must provide -g histogram filename option")
if options.stats_filename == None:
sys.exit("Must provide -s stats fliname option")
stats_from_hist(options, options.hist_filename, options.stats_filename, options.variant_eval_file, options.depth_multiplier)

View File

@ -1,141 +0,0 @@
#!/usr/bin/env python
import sys, os
from farm_commands import cmd
#if __name__ == "__main__":
from optparse import OptionParser
class OptionParser(OptionParser):
    """optparse.OptionParser extended with a post-parse required-option check."""

    def check_required(self, opts):
        """Exit via self.error() if any option string in `opts` is unset.
        Relies on each required option declaring default=None."""
        for flag in opts:
            opt = self.get_option(flag)
            current = getattr(self.values, opt.dest)
            if current is None:
                self.error("%s option not supplied" % opt)
def exists_and_non_zero(file):
    """True when `file` is present on disk and has a size greater than zero."""
    if not os.path.exists(file):
        return False
    return os.path.getsize(file) > 0
def process_bam(bam_file):
gatk_exe = "java -Xmx8192m -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar "
output_basepath = os.path.splitext(bam_file)[0]
output_basefile = os.path.basename(output_basepath)
print "Input BAM file:",bam_file
using_hapmapchip = None
if False:
# Config for chr1 of pilot 2 people
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
#interval_string = "1"
# Assure existance of intervals file
intervals_file = output_basepath+".hapmap.intervals.chr1"
if not exists_and_non_zero(intervals_file):
Cmd = "awk -F' ' '{ if ($1 == \""+interval_string+"\") {print $1 \":\" $4} }' "+hapmapchip_file+" > "+intervals_file
cmd(Cmd, die_on_fail = True)
# Setup hapmapchip GFFs; assure that hapmap chip file is here
using_hapmapchip = True
hapmapchip_file = output_basepath+".hapmapchip.gff"
if not exists_and_non_zero(hapmapchip_file):
sys.exit("Expected to find "+hapmapchip_file+" but it's not around")
else:
# Config for pilot 3 Broad data
#ref = " -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
#intervals_file = "/seq/references/HybSelOligos/thousand_genomes_alpha_redesign/thousand_genomes_alpha_redesign.targets.interval_list"
# Config for pilot 3 DCC data (12/16/09 r. 2375)
ref = " -R /broad/1KG/reference/human_b36_both.fasta"
intervals_file = "/humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list"
# Assure that BAM index exists
bam_index_file = output_basepath+".bam.bai"
if not exists_and_non_zero(bam_index_file):
Cmd = "samtools index "+bam_file
cmd(Cmd, die_on_fail = True)
# Run UnifiedGenotyper calls
vcf_outfile = output_basepath+".calls.vcf"
if not exists_and_non_zero(vcf_outfile):
print "Running UnifiedGenotyper"
Cmd = gatk_exe + ref + " -T UnifiedGenotyper"+\
" -I "+bam_file+\
" -varout "+vcf_outfile+\
" -L "+intervals_file+\
" -fmq0 -l INFO "
#" --genotype -lod 5"
#+interval_string+\
cmd(Cmd, die_on_fail=True)
# Run VariantEval on UnifiedGenotyper calls
varianteval_outfile = output_basepath+".varianteval"
if not exists_and_non_zero(varianteval_outfile):
print "Running VariantEval with output to "+varianteval_outfile
Cmd = gatk_exe + ref + " -T VariantEval"+\
" -B dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod"+\
" -B eval,VCF,"+ssg_outfile+\
" -o "+varianteval_outfile+\
" -L "+intervals_file+\
" -G -V"+\
" --evalContainsGenotypes"+\
" -minPhredConfidenceScore 150"+\
" -fmq0 -l INFO"
if using_hapmapchip:
Cmd = Cmd + " -hc "+hapmapchip_file
cmd(Cmd, die_on_fail=True)
# Run CoverageHist on BAM file
hist_outfile = output_basepath+".hapmap.coverage_hist"
if not exists_and_non_zero(hist_outfile):
print "Running CoverageHist on "+bam_file
Cmd = gatk_exe + ref + " -T DepthOfCoverage"+\
" -I "+bam_file+\
" -o "+hist_outfile+\
" -L "+intervals_file+\
" -histogram --suppressLocusPrinting"+\
" -fmq0 -l INFO"
cmd(Cmd, die_on_fail=True)#,"gsa", output_basepath)
# Now do CoverageEval predictions if they haven't been done
oneline_stats_file = output_basepath+".oneline_stats"
if not exists_and_non_zero(oneline_stats_file):
Cmd = "CoverageEval.py -e"+\
" -s /humgen/gsa-scr1/projects/1kg_pilot2/coverage_eval/fixed_stats/current/NA12878.chr1.full.stats"+\
" -g "+hist_outfile+\
" -v "+varianteval_outfile+\
" -o "+oneline_stats_file
cmd(Cmd, die_on_fail=True)
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-b", "--bam-file", help="Input BAM filename", dest="bam_file", default=None)
    parser.add_option("-a", "--all_bams", help="Process all BAMs in current directory", default=False, dest="all_bams", action="store_true")
    (options, args) = parser.parse_args()
    if not options.all_bams:
        # Single-BAM mode: -b becomes mandatory.
        parser.check_required(("-b",))
        process_bam(options.bam_file)
    else:
        # Batch mode: process every *.bam in the working directory.
        bams = [f for f in os.listdir(".") if f.endswith(".bam")]
        print bams
        for bam in bams:
            process_bam(bam)

View File

@ -1,47 +0,0 @@
#!/util/bin/python
import math, string, sys
for input_file in sys.argv[1:]:
FILENAME = input_file
# Read the depth of coverage distribution from file
depth = []; count = []
for line in open(FILENAME):
try:
int(line[0])
split = string.split(line,' ')
if int(split[1]) != 0:
depth.append(int(split[0]))
count.append(int(split[1]))
except ValueError: pass
# Calculate the mode
mode = depth[count.index(max(count))]
# Find the index of maximum extent of 'good' data
idx = depth.index(mode); dist = 10**9
while abs(count[idx] - count[0]) < dist:
dist = abs(count[idx] - count[0])
idx += 1
model_count = count[:idx + 1]
# Calculate mean & variance of 'good' data
model_tot = sum(model_count); adder = 0
for i in range(len(model_count)): adder += depth[i]*model_count[i]
mean = adder/float(model_tot)
adder = 0
for i in range(len(model_count)): adder += model_count[i]*(mean - depth[i])**2
var = adder/float(model_tot)
stdev = var**0.5
# Output Thresholds
print FILENAME
print 'Mean Depth of Coverage: ' + str(mean)
print 'Plus One Std Dev: ' + str(mean + stdev)
print 'Plus Two Std Dev: ' + str(mean + 2*stdev)
print 'Plus Three Std Dev: ' + str(mean + 3*stdev)
print 'Plus Four Std Dev: ' + str(mean + 4*stdev)
print 'Plus Five Std Dev: ' + str(mean + 5*stdev)

View File

@ -1,309 +0,0 @@
#!/usr/bin/env python
# Evaluates mapping results produced by RunMapper.py by comparing
# the result to the original simulated reads in a SAM file
import getopt, sys, os, re
from operator import attrgetter, itemgetter
from subprocess import Popen, STDOUT, PIPE
from SAM import SAMIO
from pushback_file import pushback_file
from qltout import qltout_file
from aln_file import aln_file
#!/usr/bin/env python
import string, sys
class SAMRecordWrapper:
    """Adapts a SAM record to the mapping-record interface (id / contig /
    pos / offset / map_qual) shared with the other alignment readers."""

    def __init__(self, rec):
        # The underlying SAM record; all accessors delegate to it.
        self.rec = rec

    def id(self):
        return self.rec.getQName()

    def contig(self):
        return self.rec.getRname()

    def pos(self):
        return self.rec.getPos()

    def offset(self):
        # 0-based equivalent of the 1-based pos().
        return self.pos() - 1

    def map_qual(self):
        return self.rec.getMapq()

    def __str__(self):
        return str(self.rec)
def SAMIOWrapper( file ):
    # Factory: open `file` as a SAMIO reader whose records are wrapped in
    # SAMRecordWrapper so they expose the common mapping-record interface.
    print 'SAMIOWrapper', file
    return SAMIO( file, debugging=True, func=SAMRecordWrapper)
class mapper_info:
    """Describes one supported mapper: its (lower-cased) name, the extension
    of its output files, and the reader class used to parse that output."""

    def __init__(self, name, ext, file_class):
        # Normalise the name so lookups can be case-insensitive.
        self.name = name.lower()
        self.ext = ext
        self.file_class = file_class
# Registry of supported mappers, keyed by lower-case name: each entry records
# the output-file extension and the reader class that parses that output.
mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file),
                mapper_info("merlin", "qltout", qltout_file),
                mapper_info("swmerlin", "qltout", qltout_file),
                mapper_info("bwa", "sam", SAMIOWrapper),
                mapper_info("bwa32", "sam", SAMIOWrapper),
                mapper_info("maq", "aln.txt", aln_file) ]
# Rebind the list as a name -> mapper_info lookup table.
mapper_defs = dict( [(m.name, m) for m in mapper_defs] )
def Comp_sam_map(comp_head, mapper, file_output, sam_indel):
# sam_indel is a +/- number corresponding to the size of insertions / deletions
# in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always
# translate to the SAM's cigar, but the full value should be used
file_output_head = comp_head+"."+mapper.name
if file_output:
# if non-false, output to filename in file_output_head
# if "" or False, leave output going to stdout
#saveout = sys.stdout
feval = open(file_output_head+".eval", "w")
#sys.stdout = feval
# create a file with a one-line tabular output of stats
# (for merging into Excel / R input)
fevaltab = open(file_output_head+".tab", "w")
# if maq file, convert aln.map to aln.txt
aln_txt_filename = file_output_head+"."+mapper.ext
if mapper.name == "maq" and not os.path.exists(aln_txt_filename):
maq_exe = "/seq/dirseq/maq-0.7.1/maq"
# SHOULD BE THIS
#cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map"
print >> sys.stderr, "Executing "+cmd_str
fout = open(aln_txt_filename, "w")
p = Popen( cmd_str , shell=True, stdout=fout)
map_filename = comp_head+"."+mapper.name+"."+mapper.ext
print 'map_filename', map_filename
map_file = mapper.file_class(map_filename)
SAM_filename = comp_head+".sam"
sam_file = SAMIO(SAM_filename, debugging=True)
#sams = [s for s in sam_file]
print "Reading map file..."
maps = [q for q in map_file]
print "Reading SAM file..."
sams = {}
if mapper.name in ["maq", 'bwa', 'bwa32']:
for s in sam_file:
sams[s.getQName()] = s
else:
for i, s in enumerate(sam_file):
sams[i] = s
#maps = {}
#for m in map_file:
# maps[m.id()] = m
#sams = [s for s in sam_file]
class smq_t:
def __init__(self, sam, map, qual):
self.sam = sam # SAM record (exists for all reads)
self.map = map # Mapping record, may be None
self.qual = qual # Quality of mapping
print "Making SMQ records..."
SMQ = []
for maprec in maps:
samrec = sams.get(maprec.id()) # get mapping otherwise None
SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) )
print "Sorting..."
SMQ.sort(key=attrgetter('qual'), reverse=True)
print "Evaluating placements..."
placed = 0
incorrectly_placed = 0
correctly_placed = 0
fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" )
if len(SMQ):
last_qual = SMQ[0].qual # grab 1st qual
for smq in SMQ:
if smq.sam == None:
continue
if smq.qual != last_qual:
total_reads = len(sams)#placed + not_placed
not_placed = len(sams) - placed
fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
#print "Wrote all reads with qual >= "+str(last_qual)
last_qual = smq.qual
placed += 1
# Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases
# inserted / deleted - e.g. +1, -2
#cigar = smq.sam.getCigar()
#indelled_bases = 0
#match = re.search(r'(\d+)I', cigar)
#if match:
# indelled_bases += int(match.groups()[0])
#match = re.search(r'(\d+)D', cigar)
#if match:
# indelled_bases -= int(match.groups()[0])
# We consider a read properly aligned if
# - both start positions are the same OR
# - the end position of the mapping = end position of the samq
#if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos():
try:
map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq
# are generally at the beginning of an alignment spanning the anchor base
except AttributeError:
map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care
# if its sam.py interface doesn't have access to indel data since there are
# no indels in its alignmnets
if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos():
#print >> feval, "Indelled SAM bases: "+str(indelled_bases)
try:
print >> feval, "Indelled map bases: "+str(map_indel_bases)
except AttributeError:
pass
print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos())
print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos())
print >> feval, "SAM record:\n"+str(smq.sam)
print >> feval, "Mapping record:\n"+str(smq.map)+"\n"
incorrectly_placed += 1
else:
correctly_placed +=1
#indexed_maps = [None] * len(sams)
#for q in maps:
# indexed_maps[q.id()] = q
#def f(s,q):
# if q == None:
# return s
# else:
# return False
#missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) )
#for miss in missing:
# print miss
#not_placed = len(missing)
#not_placed = len(sams) - len(maps)
total_reads = len(sams) #placed + not_placed
not_placed = len(sams) - placed
print >> feval, "Total reads : "+str(total_reads)
print >> feval, "+-Placed : "+str(placed)
print >> feval, "| +-Correct : "+str(correctly_placed)
print >> feval, "| +-Incorrect: "+str(incorrectly_placed)
print >> feval, "+-Not placed : "+str(not_placed)
fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" )
print "Wrote all reads with all quals"
#if file_output_head:
# sys.stdout = saveout
def remove_escape_codes(s):
    """Strip ANSI terminal escape sequences (e.g. color codes from ls) from *s*."""
    import re
    ansi_escape = re.compile(r'\x1b\[(\d\d)?\w')
    return ansi_escape.sub("", s)
def usage():
    """Print command-line help for EvalMapping.py and exit with status 2."""
    print "EvalMapping.py\n"
    print "Required arguments:\n"
    print " -h HEAD All input and output files start with HEAD"
    print " --OR--"
    print " <list of *.sam files on STDIN to use as filename templates>"
    print " i.e. ls *.sam | EvalMapping.py -m MAPPER"
    print
    print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)"
    print
    print "Optional arguments:\n"
    print
    print " -o Output to screen instead of HEAD.MAPPER.eval"
    print
    print "Input files:"
    print " HEAD.sam"
    print
    print "Result files expected for Merlin and ILT:"
    print " HEAD.MAPPER.qltout"
    print
    print "Result files expected for MAQ:"
    print
    # Exit status 2: conventional "command-line usage error".
    sys.exit(2)
if __name__ == "__main__":
    # Parse arguments: -h/--head (file head), -m/--mapper (one mapper, a
    # comma-separated list, or "all"), -o/--file-output (write to screen).
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"])
    except getopt.GetoptError:
        usage()
    comp_heads = []
    mapper_str = ""
    file_output = True
    for opt, arg in opts:
        if opt in ("-h", "--head"):
            comp_heads = [arg]
        if opt in ("-m", "--mapper"):
            mapper_str = arg
            possible_mappers = mapper_defs.keys()
            # Check it later now...
            #if mapper_str not in possible_mappers:
            # print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers)
            # sys.exit(2)
        if opt in ("-o", "--file-output"):
            file_output = False
    # Check for required args
    if (mapper_str == ""):
        usage()
    elif (mapper_str == "all"):
        mappers = mapper_defs.values()
        print mappers
    else:
        try:
            mapper_str_items = mapper_str.split(",")
            # NOTE(review): dict.get returns None for unknown mappers, so this
            # except branch likely never fires and an unknown mapper yields a
            # None entry in mappers instead of the intended error message.
            mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items]
        except KeyError:
            print "Don't know this mapper given in -m option: "+mapper
            print "Try: ilt, merlin, maq, bwa"
            sys.exit(1)
    # if we didn't already get a single file from -h
    if len(comp_heads) == 0:
        # and there's a list on STDIN,
        if not sys.stdin.isatty(): # redirected from file or pipe
            # One head per STDIN line: strip terminal escape codes (e.g. from a
            # colorized 'ls'), drop the extension, and discard empty lines.
            comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ]
            comp_heads = [ f for f in comp_heads if len(f) > 0 ]
        else:
            # no single file AND no STDIN file list
            usage()
    print "\nFile heads to evaluate:"
    print "\n".join(comp_heads)+"\n"
    comp_heads.reverse()
    for comp_head in comp_heads: #[:1]
        for mapper in mappers:
            # Guess the original size of simulated indel from filename
            # (e.g. '..._den.3_...' plus '.DELETION_'/'.INSERTION_' markers).
            m = re.search(r'_den\.(\d+)_', comp_head)
            if m:
                if ".DELETION_" in comp_head:
                    indel_size = -1 * int(m.groups()[0])
                elif ".INSERTION_" in comp_head:
                    indel_size = int(m.groups()[0])
                else:
                    indel_size = 0
            else:
                sys.exit("Couldn't guess simulated read indel size from filename")
            print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head
            Comp_sam_map(comp_head, mapper, file_output, indel_size)

View File

@ -1,65 +0,0 @@
#!/usr/bin/env python
# Utility script to convert FASTA + QUALS file into FASTQ required for MAQ
# MAQ needs to convert fastq into bfq file using fastq2bfq in order to use
# this data
import sys, fasta, os
from itertools import *
def usage():
    """Print usage for the FASTA + quals -> FASTQ converter and exit."""
    print "Usage: fastq.py fasta_in quals_in fastq_out"
    print " fasta_in: *.fasta file"
    print " quals_in: *.quals.txt OR *.quala"
    sys.exit()
def output_fastq_record(file_handle, fasta, quals):
    """Write one FASTQ record to *file_handle*.

    fasta -- record with .id and .seq (the bases)
    quals -- record whose .seq is a whitespace-separated list of integer
             qualities, one per base; encoded as Phred+33 characters.

    BUG FIX: previously wrote to the module-global 'fqout' instead of the
    file_handle parameter, so the parameter was silently ignored.
    NOTE: parameter 'fasta' shadows the module-level 'fasta' import inside
    this function (kept for interface compatibility).
    """
    quals_vector = [int(q) for q in quals.seq.split()]
    file_handle.write("@" + fasta.id + "\n")
    file_handle.write(fasta.seq + "\n")
    file_handle.write("+\n")
    file_handle.write("".join(chr(q + 33) for q in quals_vector) + "\n")
def qualStr2fasta( line ):
    """Parse one '<id> <q1> <q2> ...' quality line into a fasta.fasta_record
    whose sequence field holds the space-joined quality numbers."""
    fields = line.split()
    record_id = fields[0]
    qual_string = ' '.join(fields[1:])
    return fasta.fasta_record(record_id, qual_string)
if __name__ == "__main__":
    if len(sys.argv[1:]) != 3:
        usage()
    else:
        fasta_in_file, quals_in_file, fastq_out_file = sys.argv[1:]
        print "Quals input file: "+quals_in_file
        print "Fasta input file: "+fasta_in_file
        # Build a generator of quality records from either a .txt (one record
        # per line) or a .quala (fasta-formatted) input.
        if os.path.splitext(quals_in_file)[1] == ".txt":
            qual_factory = imap( qualStr2fasta, file(quals_in_file) )
        else:
            if os.path.splitext(quals_in_file)[1] == ".quala":
                # Create fasta factory from the quala file and treat it
                # as if it was a fasta file and pull the numbers
                qual_factory = fasta.fasta_file(quals_in_file, cleanup=False)
            else:
                # NOTE(review): this branch only prints; qual_factory is left
                # unbound, so the loop below raises NameError. Probably
                # intended to exit here.
                print "Qual input file should be a *.quals.txt or *.quala file"
        print "Fastq ouptut file: "+fastq_out_file
        fqout = file(fastq_out_file, "w")
        # izip_longest pads the shorter stream with None so a count mismatch
        # between fasta and quals can be detected below.
        for qual_record, fasta_record in izip_longest(qual_factory, fasta.fasta_file(fasta_in_file)):
            if qual_record == None or fasta_record == None:
                # We ran out of one record type, so warn the user!
                print "WARNING: Different number of quals and fastas in input files!"
                sys.exit(1)
            #print 'qual_record', qual_record
            #print 'fasta_record', fasta_record
            output_fastq_record(fqout, fasta_record, qual_record)

View File

@ -1,40 +0,0 @@
#!/usr/bin/env python
import sys, itertools
def record_generator(filename, sep="\t", skip_n_lines=0, skip_until_regex_line=""):
    """Given a file with field headers on the first line and records on subsequent lines,
    generates a dictionary for each line keyed by the header fields.

    skip_n_lines          -- number of leading lines to discard before the header
    skip_until_regex_line -- if non-empty, scan forward until a line matching
                             this regex is found; that MATCHING line itself is
                             used as the header line.
    """
    fin = open(filename)
    if skip_n_lines > 0:
        for i in range(skip_n_lines): # Skip a number of lines
            fin.readline()
    found_regex = False
    if skip_until_regex_line != "":
        import re
        regex_line = re.compile(skip_until_regex_line)
        for line in fin:
            match = regex_line.search(line)
            if match:
                # Keep the whole matching line; it becomes the header below.
                found_regex = line
                break
        if not found_regex:
            # NOTE(review): if the regex never matches, fin is now exhausted,
            # so the fallback readline() below returns "" and header == [""].
            print "Warning: Regex "+skip_until_regex_line+" not found in FlatFileTable:record_generator"
    if found_regex:
        header = found_regex.rstrip().split(sep) # Parse header
    else:
        header = fin.readline().rstrip().split(sep) # Pull off header
    for line in fin: #
        fields = line.rstrip().split(sep)
        # izip truncates: rows with fewer fields than the header produce dicts
        # missing the trailing keys.
        record = dict(itertools.izip(header, fields))
        yield record
def record_matches_values(record, match_field_values):
    """Return True iff, for every (field, allowed_values) pair, the record's
    value for that field is one of the allowed values."""
    return all(record[field] in allowed
               for field, allowed in match_field_values)

View File

@ -1,49 +0,0 @@
#!/usr/bin/env python
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
import sys, os, subprocess
# Numeric chromosome index -> UCSC-style name: 0 is chrM, 1-22 are autosomes,
# 23/24 are chrX/chrY (25 entries total).
Index2AffyChr = ["chrM"] + ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY"]
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
        sys.exit()
    else:
        infile = sys.argv[1]
        outdir = sys.argv[2]
        # Output GFF takes the input's basename with a .gff extension.
        output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
        outfile = open(output_file, "w")
        # Stream GeliToText output; the first line is discarded below.
        cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
        pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
        pipe.readline()
        for line in pipe:
            # Skip header/comment lines emitted by GeliToText.
            if line[0] not in ('@', '#'):
                fields = line.split("\t")
                #print fields
                try:
                    contig = fields[0] #Index2AffyChr[int(fields[0])]
                except KeyError, ValueError:
                    # NOTE(review): py2 syntax — this catches only KeyError and
                    # binds it to the name ValueError; and since the try body is
                    # a plain list index kept from the commented-out lookup, it
                    # can only raise IndexError, so this handler is dead code.
                    print "skipping "+fields[0]
                    continue # Skip entries not in chr 0 through 24
                # GFF columns: contig, source, feature(genotype), start, end(+1), score, strand, frame.
                print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"

View File

@ -1,170 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
import re
import glob
import picard_utils
import itertools
# Default GATK jar (note: SingleSampleGenotyperCmd shadows this with a pinned build).
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
# hg18 reference FASTA used by all GATK invocations below.
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
def geli2dbsnpFile(geli):
    """Return the '.dbsnp_matches' path paired with *geli*, or None when the
    input is already a '.calls' text file (no dbSNP collection for those)."""
    directory, flowcell_dot_lane, extension = picard_utils.splitPath(geli)
    if extension == ".calls":
        return None
    return os.path.join(directory, flowcell_dot_lane) + '.dbsnp_matches'
def SingleSampleGenotyperCmd(bam, geli, use2bp):
    """Build the GATK SingleSampleGenotyper command line for one BAM.

    bam    -- input BAM; the NA id is taken from the filename prefix
    geli   -- output variants file (-varout)
    use2bp -- NOTE(review): currently unused in the command construction

    Returns the shell command string (does not run it).
    """
    naid = os.path.split(bam)[1].split(".")[0]
    metrics = geli + '.metrics'
    # Shadows the module-level gatkPath with a specific pinned build.
    gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
    hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
    hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip])
    if not os.path.exists(hapmapChip):
        # No chip data for this individual: drop the --hapmap_chip argument.
        print '*** warning, no hapmap chip resuls for', naid
        hapmapChipStr = ''
    targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
    cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr
    return cmd
def bams2geli(bams):
    """Submit genotyping jobs (GATK SSG or Picard, per OPTIONS.useSSG) for each
    BAM, returning (geli_filenames, farm_job_ids).

    A job id of 0 means nothing was submitted for that BAM because its output
    file already existed.
    """
    def call1(bam):
        # Submit one genotyping job for a single BAM, skipping it when the
        # expected output already exists.
        geli = os.path.splitext(bam)[0] + '.geli'
        jobid = 0
        # BUG FIX: cmd was previously left unbound when the output file already
        # existed, so the unconditional farm submission below raised
        # UnboundLocalError instead of skipping the BAM.
        cmd = None
        if OPTIONS.useSSG:
            if not os.path.exists(geli + '.calls'):
                cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
        else:
            if not os.path.exists(geli):
                cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
        if cmd is not None:
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
        return geli, jobid
    calls = map(call1, bams)
    return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
def gelis2gelisText( gelis ):
    """Map each geli path to its text '.calls' filename: paths already ending
    in '.calls' pass through; otherwise the basename gets '.calls' appended."""
    def as_text_name(path):
        if os.path.splitext(path)[1] == ".calls":
            return path
        return os.path.split(path)[1] + '.calls'
    return map(as_text_name, gelis)
def main():
    """Driver: genotype a list of lanes (BAMs or geli files), collect dbSNP
    match statistics, convert calls to text, sort them in reference order and
    emit aggregated per-site SNP calls to OPTIONS.output."""
    global OPTIONS, ROOT
    usage = "usage: %prog lanes.list nIndividuals [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farmQueue", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to submit jobs to. Leave blank for local processing")
    parser.add_option("-l", "--lod", dest="lod",
                      type="float", default=5,
                      help="minimum lod for calling a variant")
    parser.add_option("-k", "--column", dest="column",
                      type="int", default=1,
                      help="Column in the file with the bam or geli file path")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("", "--ssg", dest="useSSG",
                      action='store_true', default=False,
                      help="If provided, we'll use the GATK SSG for genotyping")
    parser.add_option("", "--ssg2b", dest="useSSG2b",
                      action='store_true', default=False,
                      help="If provided, we'll use 2bp enabled GATK SSG ")
    parser.add_option("-o", "--output", dest="output",
                      type="string", default='/dev/stdout',
                      help="x")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments: " + str(args))
    lines = [line.strip().split() for line in open(args[0])]
    nIndividuals = int(args[1])
    outputFile = open(OPTIONS.output, 'w')
    print >> outputFile, '#', ' '.join(sys.argv)
    # Pull the file-path column (1-based) out of each input line.
    data = map( lambda x: x[OPTIONS.column-1], lines )
    if os.path.splitext(data[0])[1] == '.bam':
        # BAM inputs must first be genotyped into geli files on the farm.
        gelis, jobids = bams2geli(data)
        if filter(lambda x: x <> 0, jobids) <> []:
            # there's still work to do
            sys.exit('Stopping. Please rerun this program when the farm jobs are complete: ' + str(jobids))
        # TODO: Should add a wait here for all farm jobs to finish...
        print 'gelis', gelis
        print 'jobids', jobids
    else:
        gelis = map( lambda x: x[OPTIONS.column-1], lines )
        jobids = [None] * len(gelis)
    print 'Geli files'
    print gelis
    for geli, jobid in zip(gelis, jobids):
        dbsnpFile = geli2dbsnpFile(geli)
        # NOTE(review): condition looks inverted — when dbsnpFile is None,
        # os.path.exists(None) raises; likely intended 'dbsnpFile != None'.
        if dbsnpFile == None and not os.path.exists(dbsnpFile):
            dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
            if jobid == 0: jobid = None
            farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    # TODO: Should add a wait here for all farm jobs to finish...
    # read in the dbSNP tracks
    nTotalSnps = 0
    nNovelSnps = 0
    for geli in gelis:
        root, flowcellDotlane, ext = picard_utils.splitPath(geli)
        #dbsnp_matches = os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
        dbsnp_matches = geli2dbsnpFile(geli)
        print dbsnp_matches
        if os.path.exists(dbsnp_matches):
            TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP = picard_utils.read_dbsnp(dbsnp_matches)
            nTotalSnps += int(TOTAL_SNPS)
            nNovelSnps += int(NOVEL_SNPS)
            print >> outputFile, '# DATA: ', flowcellDotlane, TOTAL_SNPS, NOVEL_SNPS, PCT_DBSNP, NUM_IN_DB_SNP, dbsnp_matches
    print >> outputFile, '# DATA: TOTAL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY', nTotalSnps
    print >> outputFile, '# DATA: NOVEL SNP CALLS SUMMED ACROSS LANES, NOT ACCOUNT FOR IDENTITY ', nNovelSnps
    print >> outputFile, '# DATA: AVERAGE DBSNP RATE ACROSS LANES %.2f' % (100.0 * float(nTotalSnps - nNovelSnps) / (max(nTotalSnps, 1)))
    # convert the geli's to text
    jobid = None
    variantsOut = gelis2gelisText( gelis )
    for geli, variantOut in zip(gelis, variantsOut):
        name = os.path.split(geli)[1]
        if not os.path.exists(variantOut):
            # GeliToText -> strip headers -> keep calls above the LOD cutoff,
            # prefixing each line with the source file name.
            cmd = ("GeliToText.jar I=%s | awk '$1 !~ \"@\" && $1 !~ \"#Sequence\" && $0 !~ \"GeliToText\"' | awk '$7 > %f {print \"%s\" $0}' > %s" % ( geli, OPTIONS.lod, name, variantOut) )
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
    # Concatenate + numerically sort all per-lane calls, then order by
    # reference contig order using sortByRef.pl.
    cmd = ("cat %s | sort -k 1 -k 2 -n > tmp.calls" % ( ' '.join(variantsOut) ) )
    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    sortedCallFile = 'all.sorted.calls'
    cmd = ("~/dev/GenomeAnalysisTK/trunk/perl/sortByRef.pl -k 1 tmp.calls ~/work/humanref/Homo_sapiens_assembly18.fasta.fai > %s" % ( sortedCallFile ) )
    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
    if not OPTIONS.dry:
        # Aggregate per-lane calls into per-site SNP calls and emit the table.
        sortedCalls = [line.split() for line in open(sortedCallFile)]
        aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
        print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
        for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
            if snp == None: continue # ignore bad calls
            #print snp
            #sharedCalls = list(sharedCallsGroup)
            #genotype = list(sharedCalls[0][5])
            print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
    outputFile.close()
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,48 +0,0 @@
#!/usr/bin/env python
import subprocess, time, sys
# All 16 dinucleotide contexts; one R regression job is launched per context.
dinucs = ("AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT")
#covar_work_dir = "/humgen/gsa-scr1/andrewk/recalibration_work"
#covar_table_dir = "/humgen/gsa-scr1/andrewk/recalibration_tables"
# All input/output filenames are derived from the single positional argument.
fileroot = sys.argv[1]
covar_counts_root = fileroot+".covariate_counts"
parameter_root = fileroot+".log_reg"
recal_table = fileroot+".recalibration_table"
# Run all R process to do regression and wait for completion
kids = []
for dinuc in dinucs:
    kids.append(subprocess.Popen(["/home/radon01/andrewk/covariates/logistic_regression.R", covar_counts_root, parameter_root, dinuc]))
    #kids.append(subprocess.Popen(["sleep", "8"]))
# Poll until every child has exited (poll() returns None while running).
while any(kid.poll() is None for kid in kids):
    time.sleep(0.25)
# Write the recalibration table header: 25 "p,q" coefficient columns.
fout = file(recal_table, "w")
fout.write("dinuc_v.1\t")
for p in range(5):
    for q in range(5):
        fout.write("%d,%d\t" % (p,q))
fout.write("\n")
# One row per dinuc: all parameters from its R output file, tab-joined.
for dinuc in dinucs:
    dinin = open(parameter_root+"."+dinuc+".parameters")
    #dinin.readline()
    params = []
    for line in dinin:
        line.rstrip("\n")  # NOTE(review): result discarded; harmless since split() handles the newline
        params.extend(map(float, line.split()))
    fout.write(dinuc+"\t")
    fout.write("\t".join(map(str, params)))
    fout.write("\n")

View File

@ -1,99 +0,0 @@
#!/usr/bin/env python
import os,sys
def exit(msg, errorlevel):
    """Print *msg* on stdout and terminate with exit status *errorlevel*.

    NOTE: intentionally shadows the builtin exit(); kept for compatibility
    with existing callers in this module. The py2-only 'print msg' statement
    is replaced by an equivalent explicit write (same bytes on stdout).
    """
    sys.stdout.write(msg + "\n")
    sys.exit(errorlevel)
def open_source_file(source_filename):
    """Open the named covariate-counts CSV for reading.

    Exits (status 1) when the file is unreadable or does not end in '.csv'.
    """
    if not os.access(source_filename, os.R_OK):
        # BUG FIX: report the actual argument, not sys.argv[1], so the message
        # is correct when this is called with a different filename.
        exit("Unable to read covariate counts file '" + source_filename + "'", 1)
    if not source_filename.endswith('.csv'):
        # BUG FIX: exit() requires (msg, errorlevel); the errorlevel was
        # missing here, raising TypeError instead of exiting cleanly.
        exit("Source file is in incorrect format. Must be csv.", 1)
    return open(source_filename, 'r')
def read_header(source_file):
    """Read the header from the given source file. Do basic validation.

    The first two comma-separated columns must be 'rg' and 'dn'; otherwise the
    program exits with status 1.
    """
    columns = source_file.readline().split(',')
    if columns[0] != 'rg' or columns[1] != 'dn':
        exit("Input file is in invalid format. First two columns should be <read group>,<dinucleotide>",1)
    return columns
def create_intermediate_file(source_file,header,read_group,dinuc):
    """Open '<base>.<read_group>.<dinuc>.csv' (base = source name minus
    '.csv') for writing, seed it with the covariate columns of *header*
    (everything after rg/dn), and return the open file."""
    stem = source_file.name[:source_file.name.rfind('.csv')]
    out_path = "%s.%s.%s.csv" % (stem, read_group, dinuc)
    out = open(out_path, "w")
    out.write(','.join(header[2:]))
    return out
def open_target_file(target_filename):
    """Create the output parameters file, write its tab-separated header row
    ('rg', 'dinuc', then the 25 'p,q' coefficient columns), and return it."""
    target_file = open(target_filename, 'w')
    cells = ["rg\tdinuc"]
    for p in range(5):
        for q in range(5):
            cells.append("%d,%d" % (p, q))
    target_file.write("\t".join(cells) + "\t\n")
    return target_file
def process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script):
    """Process the contents of an intermediate file. An intermediate file is the specific data arranged per read group, per dinuc.

    Runs the R logistic-regression script for one (read_group, dinuc) pair and
    appends the fitted parameters as one tab-separated row of *target_file*.
    Exits (status 1) when R fails or its output file is unreadable.
    """
    base = source_file.name[:source_file.name.rfind('.csv')] + '.' + read_group
    regression_command = ' '.join((R_exe,logistic_regression_script,base,base,dinuc))
    result = os.system(regression_command)
    if result != 0:
        exit('Unable to run linear regression; command was %s, error code was %d' % (regression_command, result),1)
    parameters_filename = '.'.join((base,dinuc,'parameters'))
    if not os.access(parameters_filename,os.R_OK):
        # BUG FIX: exit() takes (msg, errorlevel); the errorlevel was missing
        # here, which raised a TypeError instead of exiting cleanly.
        exit("Unable to read output of R from file " + parameters_filename, 1)
    parameters_file = open(parameters_filename,'r')
    # Flatten all lines of R output into a single space-separated token list.
    parameters = ' '.join([line.rstrip('\n') for line in parameters_file]).split(' ')
    target_file.write('\t'.join([read_group,dinuc]+parameters)+'\n')
    # optionally remove the input and output files. For now, keep them around for debugging purposes.
    # os.remove('.'.join((base,dinuc,'csv')))
    # os.remove('.'.join((base,dinuc,'parameters')))
def compute_logistic_regression(source_filename,target_filename,R_exe,logistic_regression_script):
    """Do the heavy lifting. Group covariant data into individual output files, compute the logistic regression, and format the results.

    Assumes the source CSV is already sorted/grouped by (read_group, dinuc):
    a new intermediate file is started whenever either key changes, and the
    previous group is regressed at that point (and once more after the loop
    for the final group).
    """
    source_file = open_source_file(source_filename)
    target_file = open_target_file(target_filename)
    header = read_header(source_file)
    intermediate_file = None
    read_group = None
    dinuc = None
    for data_line in source_file:
        data_line = data_line.strip()
        if len(data_line) == 0:
            continue
        data = data_line.split(',')
        if read_group != data[0] or dinuc != data[1]:
            # Key changed: finish and regress the previous group, then start
            # a fresh intermediate file for the new (read_group, dinuc).
            if intermediate_file:
                intermediate_file.close()
                process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
            read_group,dinuc = data[0:2]
            intermediate_file = create_intermediate_file(source_file,header,read_group,dinuc)
        # Only the covariate columns (after rg/dn) go into the intermediate file.
        intermediate_file.write(','.join(data[2:])+'\n')
    # Flush and regress the final group.
    if intermediate_file:
        intermediate_file.close()
        process_file(source_file,read_group,dinuc,target_file,R_exe,logistic_regression_script)
    source_file.close()
    target_file.close()
# Command-line entry point: logistic_regression <counts csv> <parameters csv>
if __name__ == "__main__":
    # set up sensible defaults for the locations of R and the logistic regression script
    R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
    logistic_regression_script="resources/logistic_regression.R"
    if len(sys.argv) < 3:
        exit("Usage: logistic_regression <covariate count input csv> <parameters csv>",1)
    compute_logistic_regression(sys.argv[1],sys.argv[2],R_exe,logistic_regression_script)

View File

@ -1,78 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import ValidateGATK
import picard_utils
from MergeBAMsUtils import *
def splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
    """Group source BAMs into merge specs by population: each source path acts
    as its own superkey and is matched against the NAID -> population map."""
    pairs = [[source_path, source_path] for source_path in allSources]
    return groupSources(pairs, NAID2Pop, merged_filename_base)
if __name__ == "__main__":
    usage = "usage: %prog files.list [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="output_dir",
                      type="string", default="./",
                      help="Output directory")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    parser.add_option("-s", "--useSamtools", dest="useSamtools",
                      action='store_true', default=False,
                      help="If present, uses samtools to perform the merge")
    parser.add_option("-m", "--mergeBin", dest="mergeBin",
                      type="string", default=None,
                      help="Path to merge binary")
    parser.add_option("-n", "--naIDPops", dest="NAIDS2POP",
                      type="string", default=None,
                      help="Path to file contains NAID POP names. If provided, input files will be merged by population")
    parser.add_option("-p", "--pop", dest="onlyPop",
                      type="string", default=None,
                      help="If provided, only this population will be processed")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    directory = OPTIONS.output_dir
    if not os.path.exists(directory):
        os.mkdir(directory)
    NAID2Pop = None
    if OPTIONS.NAIDS2POP <> None:
        NAID2Pop = readNAIdMap(OPTIONS.NAIDS2POP)
    # Each non-comment input line: <merged base name> <glob> [<glob> ...]
    for line in open(args[0]):
        s = line.split()
        # NOTE(review): s[0] <> '#' only skips lines whose first token is
        # exactly "#", not every line starting with '#'.
        if ( s <> [] and s[0] <> '#' ):
            merged_filename_base = s[0]
            # Expand all globs on the line into one flat list of source BAMs.
            allSources = reduce( operator.__add__, map( glob.glob, s[1:] ), [] )
            print 'Merging info:'
            for spec in splitSourcesByPopulation(allSources, merged_filename_base, NAID2Pop):
                if OPTIONS.onlyPop <> None and spec.group() <> OPTIONS.onlyPop:
                    continue
                spec.setPath(directory)
                spec.pprint()
                jobid = None
                # Submit the merge, then the index job chained behind it.
                if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
                    output = spec.getMergedBase()
                    cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
                    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
                if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
                    jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -1,210 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import ValidateGATK
import picard_utils
import operator
# Default Picard merge tool; overridable via MergeFilesSpec.mergeCmd(mergeBin=...).
MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
# Extensions for merged BAM outputs and their samtools indices.
bam_ext = 'bam'
bam_index_ext = 'bai'
def readNAIdMap(NAIdFile):
    """Read a whitespace-separated '<NAID> <population> ...' file and return a
    dict mapping NAID -> population. Asserts each NAID appears only once."""
    m = dict()
    for data in [line.split() for line in open(NAIdFile)]:
        naid, pop = data[0:2]
        print naid, ' => ', pop
        assert naid not in m
        m[naid] = pop
    print 'Read NAID->population map'
    print 'Contains', len(m), 'id -> population mappings'
    print 'Distinct populations:', picard_utils.unique(m.values())
    return m
_abbrevs = [
(1<<50L, 'P'),
(1<<40L, 'T'),
(1<<30L, 'G'),
(1<<20L, 'M'),
(1<<10L, 'k'),
(1, '')
]
def greek(size):
    """Return a string representing the greek/metric suffix of a size,
    e.g. 2048 -> '2.0k'. Sizes of 1 or less fall through to no suffix."""
    for cutoff, unit in _abbrevs:
        if size > cutoff:
            break
    return '%.1f%s' % (float(size) / cutoff, unit)
class MergeFilesSpec:
def __init__(self, sources, group, merged_filename_base, path = '' ):
self.sourceFiles = sources
self.groupName = group
self.merged_filename_base = merged_filename_base
self.path = ''
def __str__(self):
return 'MergeFilesSpec: ' + self.group()
def group(self):
return self.groupName
def sources(self):
return self.sourceFiles
def filename(self):
if self.merged_filename_base <> None:
return self.merged_filename_base + '.'.join([self.group()])
else:
return self.group()
def pprint(self):
print '--------------------------------------------------------------------------------'
print ' Population: ', self.group()
print ' Merged filename: ', self.getMergedBAM()
print ' N sources: ', len(self.sources())
for source in sorted(self.sources()):
print ' Source: ', source
print ' Sizes: ', self.sourceSizes(humanReadable=True)
print ' Est. merged size: ', greek(reduce(operator.__add__, self.sourceSizes(), 0))
def setPath(self, path):
self.path = path
def getMergedBase(self):
return os.path.join(self.path, self.filename())
def getMergedBAM(self):
return self.getMergedBase() + '.' + bam_ext
def getMergedBAMIndex(self):
return self.getMergedBase() + '.' + bam_ext + '.' + bam_index_ext
def sourceSizes(self, humanReadable=False):
sizes = map( os.path.getsize, self.sources() )
if humanReadable:
sizes = map(greek, sizes)
return sizes
def mergeCmd(self, mergeBin = None, MSD = True, useSamtools = False):
if mergeBin == None:
mergeBin = MERGE_BIN
return picard_utils.mergeBAMCmd(self.getMergedBAM(), self.sources(), mergeBin, MSD = MSD, useSamtools = useSamtools)
def getIndexCmd(self):
return "samtools index " + self.getMergedBAM()
# Very general-purpose operation that returns merge specs given two lists of pairs:
# The first list, sources, contains superkey / sourceFile pairs.
# The second list, groups, contains key / group pairs.
#
# This function walks over all source pairs, and for each superkey, it
# looks for any key within groups contained within superkey. If it finds it,
# it associates sourceFile with the merge group in groups.
#
# The system requires that superkey match one (and only one) key in groups. It also
# requires that each group string be unique. The system can handle groups provided
# as a list of pairs of a dictionary.
#
# The function returns a list of MergeFileSpecs, one for each group with
# at least one associated sourceFile.
#
def groupSources(sources, groups, merged_filename_base):
    """Partition (superkey, sourceFile) pairs into MergeFilesSpecs keyed by
    group; see the contract described in the comment block above."""
    if groups == None:
        # No grouping requested: a single spec holding every source file.
        return [MergeFilesSpec(map( lambda x: x[1], sources), '', merged_filename_base)]
    else:
        specs = dict()
        if type(groups) == list: groups = dict(groups)
        for superkey, sourceFile in sources:
            spec = None
            # Substring match: any key contained in the superkey claims the file.
            for key, group in groups.iteritems():
                #print 'Examining', superkey, key, group
                if superkey.find(key) <> -1:
                    if group in specs:
                        spec = specs[group]
                    else:
                        # First file for this group: create its spec lazily.
                        spec = MergeFilesSpec([], group, merged_filename_base)
                        specs[group] = spec
                    print 'Mapping', group, key, superkey, sourceFile
                    spec.sourceFiles.append(sourceFile)
            if spec == None:
                # Superkey matched no key at all: hard error.
                sys.exit('File contains an unknown superkey: ' + superkey)
        # Deterministic output order: sort specs by group name.
        v = specs.values()
        v.sort(key = MergeFilesSpec.group)
        return v
import unittest
class TestMergeBAMsUtils(unittest.TestCase):
    """Unit tests for groupSources(): grouping geli files by population and by
    individual id, and the derived merged BAM / index filenames."""
    def setUp(self):
        import cStringIO
        # '<NAID> <population> <family>' rows (the family column is unused).
        groupsString = """NA10846 ceu CEPH1
NA10847 ceu CEPH1
NA12144 ceu CEPH1
NA12145 ceu CEPH1
NA12146 yri CEPH1
NA12239 yri CEPH1
NA07029 ceu CEPH1
NA07019 ceu CEPH1
NA06994 ceu CEPH1
NA07000 ceu CEPH1
NA07022 ceu CEPH1
NA07056 ceu CEPH1
NA07048 ceu CEPH1
NA06991 ceu CEPH1
NA07034 ceu CEPH1
"""
        # '<NAID> <flowcell> <lane> <type> <project> <geli file>' rows.
        lanesString = """NA10846 30GA9AAXX 1 Paired CEPH 30GA9AAXX.1.observed_genotypes.geli
NA10846 30GA9AAXX 6 Paired CEPH 30GA9AAXX.6.observed_genotypes.geli
NA10847 30GA9AAXX 7 Paired CEPH 30GA9AAXX.7.observed_genotypes.geli
NA12146 30JLTAAXX 2 Paired CEPH 30JLTAAXX.2.observed_genotypes.geli
NA12239 30PNVAAXX 1 Paired CEPH 30PNVAAXX.1.observed_genotypes.geli
NA12144 30PYMAAXX 1 Paired CEPH 30PYMAAXX.1.observed_genotypes.geli
NA12146 30PYMAAXX 7 Paired CEPH 30PYMAAXX.7.observed_genotypes.geli
"""
        # Expected number of lanes per individual for testIDGroups.
        self.laneIDCounts = dict([["NA10846", 2], ["NA10847", 1], ["NA12146", 2], ["NA12239", 1], ["NA12144", 1]])
        pops = [line.strip() for line in cStringIO.StringIO(groupsString)]
        lanes = [line.strip() for line in cStringIO.StringIO(lanesString)]
        print pops
        print lanes
        # [NAID, population] pairs and [NAID, geli] pairs for groupSources.
        self.ids2pop = [line.split()[0:2] for line in pops]
        self.ids2gelis = [[line.split()[0], line.split()[5]] for line in lanes]
        # Identity map NAID -> NAID: groups each individual with itself.
        self.ids2ids = dict([[line.split()[0]] * 2 for line in lanes])
    def testPopGroups(self):
        # Grouping by population should yield exactly ceu and yri specs.
        specs = groupSources(self.ids2gelis, self.ids2pop, 'foo')
        print 'Specs', specs
        self.assertEqual(len(specs), 2)
        self.assertEqual(specs[0].group(), 'ceu')
        self.assertEqual(specs[1].group(), 'yri')
        ceu = specs[0]
        yri = specs[1]
        #print ceu.sources()
        self.assertEqual(len(ceu.sources()), 4)
        self.assertEqual(len(yri.sources()), 3)
        self.assertEqual(ceu.getMergedBAM(), 'foo.ceu.bam')
        self.assertEqual(ceu.getMergedBAMIndex(), 'foo.ceu.bam.bai')
        self.assertEqual(yri.getMergedBAM(), 'foo.yri.bam')
        self.assertEqual(yri.getMergedBAMIndex(), 'foo.yri.bam.bai')
    def testIDGroups(self):
        # Grouping by individual id should yield one spec per distinct NAID.
        specs = groupSources(self.ids2gelis, self.ids2ids, 'foo')
        self.assertEqual(len(specs), 5)
        for spec in specs:
            print 'Spec', spec
            self.assertEqual(len(spec.sources()), self.laneIDCounts[spec.group()])
            self.assertEqual(spec.getMergedBAM(), 'foo.' + spec.group() + '.bam')
# Run the unit tests when executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -1,82 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from MergeBAMsUtils import *
def splitSourcesByKeys( bams, keys ):
    """Group the given bam files by their keys.

    Pairs each bam with its key and delegates to groupSources, using the
    identity mapping (key -> key) as the grouping table and no path root.
    """
    keyedBams = zip(keys, bams)
    identityGroups = [[k, k] for k in keys]
    return groupSources(keyedBams, identityGroups, None)
if __name__ == "__main__":
usage = """usage: %prog bams.list [options]
Merges BAM files by keys from a file of a list of bams.
bams.list is a whitespace separated file. One column (--keyCol arg) is the key, and another
column (--bamCol) is a path to a bam file. This program will group the bam files
by key and spawn merge and index jobs to merge all of the files sharing the same key together"""
parser = OptionParser(usage=usage)
parser.add_option("-q", "--farm", dest="farmQueue",
type="string", default=None,
help="Farm queue to send processing jobs to")
parser.add_option("-d", "--dir", dest="output_dir",
type="string", default="./",
help="Output directory")
parser.add_option("", "--dry", dest="dry",
action='store_true', default=False,
help="If provided, nothing actually gets run, just a dry run")
parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
action='store_true', default=False,
help="Ignores already written files, if present")
parser.add_option("-m", "--mergeBin", dest="mergeBin",
type="string", default=None,
help="Path to merge binary")
parser.add_option("", "--MSD", dest="MSD",
action='store_true', default=False,
help="Merge sequence dictionaries?")
parser.add_option("", "--keyCol", dest="keyCol",
type=int, default=1,
help="Column in the list file holding the key")
parser.add_option("", "--bamCol", dest="bamCol",
type=int, default=2,
help="Column in the list file holding the bam file path")
parser.add_option("-l", "--link", dest="link",
action='store_true', default=False,
help="If true, program will soft link single bam files that don't need merging")
(OPTIONS, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
directory = OPTIONS.output_dir
if not os.path.exists(directory):
os.mkdir(directory)
bamsList = [line.strip().split() for line in open(args[0])]
keys = map( lambda x: x[OPTIONS.keyCol-1], bamsList )
bams = map( lambda x: x[OPTIONS.bamCol-1], bamsList )
print 'Merging info:'
for info in bamsList: print info
for spec in splitSourcesByKeys(bams, keys):
spec.setPath(directory)
spec.pprint()
jobid = None
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
output = spec.getMergedBase() + '.stdout'
if len(spec.sources()) == 1 and OPTIONS.link:
cmd = 'ln -s ' + spec.sources()[0] + ' ' + spec.getMergedBAM()
else:
cmd = spec.mergeCmd(OPTIONS.mergeBin, MSD = OPTIONS.MSD)
print cmd
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
#pass
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -1,154 +0,0 @@
#!/usr/bin/env python
# merges .tab files produced by EvalMapping.py into a tabular format
#from __future__ import print_function # python 2.6 / 3.0 print as a function
import os,sys,re
from rpy2 import *
def ordered_eval_files(aligner, mutation_type, file_ext):
    """Return (density, filename) pairs for matching files in the cwd.

    Matches files named '*.<aligner>.<file_ext>' whose name mentions
    *mutation_type* (or the 'None_den.' zero-mutation baseline).  The
    density is parsed from the 'den.<NN>_' part of the name; baseline
    files count as density 0.  Pairs are returned sorted by density.
    """
    suffix = "." + aligner + "." + file_ext

    def density(name):
        # Parse the density first (mirrors original failure on bad names),
        # then override for the zero-mutation baseline files.
        parsed = int(re.search(r'den\.(\d\d?)_', name).groups()[0])
        return 0 if "None_den" in name else parsed

    matches = [f for f in os.listdir(".")
               if f.endswith(suffix) and (mutation_type in f or "None_den." in f)]
    return sorted((density(f), f) for f in matches)
def anchor_hist(filename):
    """Build an R histogram of read anchor positions parsed from an eval file.

    Scans *filename* for whitespace-leading tokens containing '.read.'; the
    read position is the final '.'-separated field of that token and the
    offset is the piece between ':' and the following '.'.  Returns an rpy2
    histogram object (77 breaks, not plotted) or an empty IntVector when no
    matching reads are found.  Prints the number of reads seen.
    """
    fin = open(filename)
    offsets = []
    # NOTE(review): 'offsets' is collected but never used by any caller here.
    read_poses = []
    for line in fin:
        id = None
        fields = line.split()
        if len(fields) > 0 and ".read." in fields[0]:
            id = line.split()[0]
            read_poses.append(id.split(".")[-1])
            offsets.append(id.split(":")[1].split(".")[0])
        #if "read name :" in line:
        #    id = line.split()[3]
    import rpy2.robjects as robj
    if len(read_poses):
        print len(read_poses)
        return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False)
    else:
        print 0
        return robj.IntVector([])
def estimateCPUTime(filename):
    """Return the largest 'CPU time : X sec' value found in *filename*.

    Scans an LSF-style stdout log for lines of the form
    'CPU time : 12.34 sec' and returns the maximum as a float.
    Returns 0.0 when the file is missing or contains no CPU-time lines
    (previously max() was called on an empty list and raised ValueError).
    """
    times = []
    if os.path.exists(filename):
        regex = re.compile(r'CPU time\s*:\s+([\d\.]+) sec')

        def extractTime1(line):
            # Pull the numeric CPU time out of one log line, or None.
            sobj = regex.search(line)
            if sobj is not None:
                return float(sobj.group(1))
            return None

        # 'with' closes the handle (the original leaked it).
        with open(filename) as fh:
            times = [t for t in map(extractTime1, fh) if t is not None]
    return max(times) if times else 0.0
def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0):
    """Print and return one summary row for an eval/tab file.

    Selects the last table line whose leading quality cutoff stays above
    *min_qual_cutoff*, then prints aligner, mutation type, density, the
    selected line and the CPU time estimated from the matching '.stdout'
    log.  Returns the row as a list, or None when the file has at most a
    header line.
    """
    filebase = os.path.splitext(filename)[0]
    cpuTime = estimateCPUTime(filebase + '.stdout')
    lines = open(filename).readlines()
    data = None
    if len(lines) > 1:
        da_line = lines[0]
        for line in lines:
            line = line.rstrip()
            fields = line.split("\t")
            try:
                qual_cutoff = int(fields[0])
            except:
                # Non-numeric first column (e.g. a header): treat as +inf
                # so the line is kept as a candidate.
                qual_cutoff = 1000
            if qual_cutoff <= min_qual_cutoff:
                break
            else:
                da_line = line
        #print ("%2d\t" % num_var, file=sys.stderr),
        data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime]
        print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime)
    return data
def plot_anchor_hists():
    """Render a grid of anchor-position histograms to lotto_histo.png via rpy2.

    NOTE(review): reads the module-global 'mut_type' (set in __main__) and
    hard-codes four aligner/cutoff pairs -- confirm before reuse.
    """
    import rpy2.robjects as robj
    #robj.r.par( mfrow = robj.RVector([2,2]) )
    #robj.r('X11(width=24, height=15)')
    robj.r('png("lotto_histo.png", width=1500, height=900)')
    robj.r('par(mfrow=c(4,6 ), cin=c(3,3))')
    for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)):
        print aligner, ">"+str(cutoff)
        num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
        #print (*num_file, sep="\n")
        for num, file in num_file:
            # One histogram panel per (aligner, insert-size) eval file.
            h = anchor_hist( file )
            title = "{aligner} {num} bp insert".format(aligner=aligner, num=num)
            robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90]))
    robj.r("dev.off()")
    #raw_input("Press any key to exit...")
def analyzeData( data, mut_type ):
    """Analyze merged eval rows for one mutation type (mostly scaffolding).

    *data* is a list of rows produced by print_eval_tables.  Only the
    helper definitions exist so far; neither selectData nor placeRatePlot
    is invoked, and the function returns None.
    """
    import rpy2.robjects as robj
    # print name / data name
    aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')]

    def selectData( key ):
        # Keep only rows for this aligner key and the current mutation type.
        def keepDataP( row ):
            return row[0] == key and row[1] == mut_type
        # Fixed: the original filtered with the misspelled name 'keyDataP',
        # which raised NameError whenever selectData was called.
        return filter( keepDataP, data )

    def placeRatePlot():
        # NOTE(review): 'mutDensity' and 'placeRate' are undefined here --
        # this plotting stub would raise NameError if ever invoked.
        robj.r('png("x.png", width=1500, height=900)')
        x = robj.FloatVector(mutDensity)
        y = robj.FloatVector(placeRate)
        robj.r.plot(x, y, xlab="Mutation density / size", col="blue")
        robj.r("dev.off()")
        return 0
if __name__ == "__main__":
if len(sys.argv) < 2:
print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)"
sys.exit(1)
else:
mut_type = sys.argv[1]
if False:
plot_anchor_hists()
else:
data = []
for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)):
name = aligner + ">" + str(cutoff)
#print (aligner, ">"+str(cutoff))
num_file = ordered_eval_files(aligner, mut_type, file_ext="tab")
#num_file = ordered_eval_files(aligner, mut_type, file_ext="eval")
for num, file in num_file:
datum = print_eval_tables( name, mut_type, num, file, cutoff )
if datum <> None:
data.append(datum)
#print
analyzeData( data, mut_type )

View File

@ -1,221 +0,0 @@
#!/usr/bin/env python
# Executable files. Please update these to match the installed locations of your tools.
samtools_exe='/seq/dirseq/samtools/current/samtools'
java_exe='/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/bin/java'
R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
# Any special site-specific arguments to pass the JVM.
jvm_args='-ea'
# Which platforms should the calibration tool be run over?
platforms=['illumina']
# Where to put the output created as part of recalibration.
# If editing, please end this variable with a trailing slash.
output_root = './'
# Location of the resource files distributed with the recalibration tool.
# If editing, please end this variable with a trailing slash.
resources='recalibratorResources/'
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'

import getopt,glob,os,sys
import LogisticRegressionByReadGroup

# if the provided resources path is relative, convert it to a path
# relative to the calling script rather than the cwd.
if not os.path.isabs(resources):
    application_path = os.path.dirname(sys.argv[0])
    resources = os.path.abspath(application_path) + '/' + resources
# Where does the reference live?
reference_base = resources + 'Homo_sapiens_assembly18'
reference = reference_base + '.fasta'
reference_dict = reference_base + '.dict'
reference_fai = reference_base + '.fasta.fai'
# Where does DBSNP live?
dbsnp = resources + 'dbsnp_129_hg18.rod'
# Where are the application files required to run the recalibration?
#gatk = resources + 'gatk/GenomeAnalysisTK.jar'
# NOTE(review): the bundled GATK path is commented out in favor of a
# hard-coded developer build -- confirm before distribution.
gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar'
logistic_regression_script = resources + 'logistic_regression.R'
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
# Assemble the platform list into command-line arguments.
platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
def output_file_needs_update(output_file, input_files = []):
    """Decide whether *output_file* must be (re)generated.

    True when the module-global reprocessFiles flag forces a rebuild, or
    when the output file does not exist yet; False otherwise.
    NOTE(review): *input_files* is accepted but never consulted -- no
    timestamp comparison is performed.
    """
    if reprocessFiles:
        # Forced reprocessing mode rebuilds everything.
        return True
    return not os.path.exists(output_file)
def executeCommand(cmd):
    """Run *cmd* through os.system, or just echo it when dryRun is set.

    Returns the os.system exit status, or 0 in dry-run mode.
    """
    if not dryRun:
        return os.system(cmd)
    else:
        print ' => Would execute', cmd
        return 0
def exit(msg,errorcode):
    # Print *msg* and terminate the process with *errorcode*.
    # NOTE(review): shadows the exit() builtin throughout this script.
    print msg
    sys.exit(errorcode)
def check_input_file_available(filename,description):
    """Abort with an error message unless *filename* is readable.

    *description* is a human-readable label used in the failure message.
    """
    readable = os.access(filename, os.R_OK)
    if not readable:
        exit('Unable to access %s %s' % (description, filename), 1)
def graph_file(graph_script,graph_data):
    'Graph the given data using the given script. Leave the data in the output directory.'
    # Verify both inputs up front so Rscript failures are not silent.
    check_input_file_available(graph_script,'%s R graphing script' % graph_script)
    check_input_file_available(graph_data,'%s graphing data' % graph_data)
    # Invoke Rscript on the data file; abort on a non-zero exit status.
    result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
    if result != 0:
        exit('Unable to graph data: %s' % graph_data,1)
def recalibrate():
    """Run the three recalibration stages on the global 'bam':
    (1) count covariates with GATK, (2) fit the logistic regression in R,
    (3) apply the regression and index the calibrated BAM.
    Each stage is skipped when its output already exists (see
    output_file_needs_update)."""
    # generate the covariates
    print 'generating covariates'
    linear_regression_results = output_dir + 'linear_regression_results.out'
    if output_file_needs_update(linear_regression_results):
        generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        returncode = executeCommand(generate_covariates)
        if returncode != 0:
            exit('Unable to generate covariates',1)
        # compute the logistic regression
        if not dryRun:
            print 'computing the logistic regression'
            LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
    else:
        print 'Logistic recalibration data files already generated', linear_regression_results
    if output_file_needs_update(calibrated_bam):
        # apply the logistic regression, writing the output data to calibrated_bam
        print 'applying the correction to the reads'
        apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
        returncode = executeCommand(apply_logistic_regression)
        if returncode != 0:
            exit('Unable to apply logistic regression',1)
    else:
        print 'Recalibrated BAM file already generated', calibrated_bam
    # index the calibrated bam
    if output_file_needs_update(calibrated_bam + '.bai'):
        print 'indexing the calibrated bam'
        index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
        returncode = executeCommand(index_calibrated_bamfile)
        if returncode != 0:
            exit('Unable to index calibrated bamfile',1)
    else:
        print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'
    print 'Recalibration complete! Calibrated bam is available here: ' + calibrated_bam
def evaluate():
    'Evaluate recalibration results.'
    print 'Evaluating recalibration results'
    recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
    if output_file_needs_update(recalibrated_regression_results):
        # regenerate the covariates
        regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
        print 'regenerating covariates'
        returncode = executeCommand(regenerate_covariates)
        if returncode != 0:
            exit('Unable to regenerate covariates',1)
        # Graph the empirical-vs-reported quality CSVs from before and
        # after recalibration for visual comparison.
        print 'graphing initial results'
        for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
        print 'graphing final results'
        for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
            graph_file(empirical_vs_reported_grapher,filename)
    else:
        print 'Evaluated files already generated', recalibrated_regression_results
def usage():
    # Print invocation help and terminate with an error status.
    exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)
# Try to parse the given command-line arguments.
try:
    opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot='])
except getopt.GetoptError, err:
    usage()
# An input and an output file is required. Fail if left unspecified.
if len(args) < 2:
    usage()
# Determine whether to evaluate / recalibrate.
recalibrate_requested = False
evaluate_requested = False
reprocessFiles = False
dryRun = False
for opt,arg in opts:
    if opt == '--recalibrate':
        recalibrate_requested = True
    if opt == '--evaluate':
        evaluate_requested = True
    if opt == '--reprocess':
        reprocessFiles = True
    if opt == '--dry':
        dryRun = True
    if opt == '--outputRoot':
        output_root = arg
# Default to 'recalibrate' unless the user specified only evaluate.
do_recalibration = not (evaluate_requested and not recalibrate_requested)
# Only evaluate if the user specifically requested evaluation.
do_evaluation = evaluate_requested
print 'Looking for supplemental files in ' + resources
# check that the input bam file exists, and that the bam is indexed.
bam = args[0]
bam_index = bam + '.bai'
check_input_file_available(bam,'reads file')
check_input_file_available(bam_index,'reads index file')
# parse the user's calibration output file requirements
calibrated_bam = args[1]
calibrated_bam_index = calibrated_bam + '.bai'
# check that the fasta and supporting files are available
check_input_file_available(reference,'reference file')
check_input_file_available(reference_dict,'reference dictionary')
check_input_file_available(reference_fai,'reference index file')
# check that the dbsnp is available
check_input_file_available(dbsnp,'dbsnp file')
# sanity check that the software is available
check_input_file_available(samtools_exe,'samtools')
check_input_file_available(java_exe,'java')
check_input_file_available(R_exe,'R')
check_input_file_available(gatk,'Genome Analysis Toolkit')
check_input_file_available(logistic_regression_script,'logistic regression script')
# make an output directory for temporary files, named after the input bam.
output_dir=output_root+'output.' + bam[bam.rfind('/')+1:bam.rfind('.bam')] + '/'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
if not os.path.isdir(output_dir):
    exit('Unable to create output directory ' + output_dir,1)
# assemble the required program arguments shared by every GATK invocation.
gatk_base_cmdline = ' '.join((java_exe,jvm_args,'-jar',gatk,'-R',reference,'--DBSNP',dbsnp,'-l INFO'))
if do_recalibration:
    recalibrate()
if do_evaluation:
    evaluate()

View File

@ -1,237 +0,0 @@
import string
import operator
from pushback_file import pushback_file
# Specification table for SAM header records: one entry per
# (record type, tag) with whether it is required, its default, and a
# human-readable description.
# Type  TAG  Req?  Default Description
SAMHeaderFields = \
[['HD', 'VN', True, '0.1.0', 'File format version'], \
 ['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \
 ['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \
 ['SQ', 'LN', True, None, 'Sequence length'], \
 ['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \
 ['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \
 ['SQ', 'UR', False, None, 'URI of the sequence'], \
 ['SQ', 'SP', False, None, 'Species'], \
 ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \
 ['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \
 ['RG', 'LB', False, None, 'Library'], \
 ['RG', 'DS', False, None, 'Description'], \
 ['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \
 ['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \
 ['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \
 ['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \
 ['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \
 ['PG', 'ID', True, 'ProgramA','Program name'], \
 ['PG', 'VN', False, None, 'Program version'], \
 ['PG', 'CL', False, None, 'Command line']]
def SAMHeaderEncode( type, tag ):
    """Build the 'TYPE.TAG' key used to index SAM header metadata."""
    return '.'.join((type, tag))
# Index every (type, tag) spec by its 'TYPE.TAG' key for O(1) lookup of
# [required?, default, description].
SAMHeaderDict = dict()
for record in SAMHeaderFields:
    type, tag, req, default, desc = record
    SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc]
# -----------------------------------------------------------------------------------------------
#
# A SAM header is a potentially complex object, so we just punt on it. The only really required
# outputs that *must* be user specified are the SQ: SN and SQ: LN fields.
# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields
# data structure (defined above). All required fields are initialized to their default values
#
# -----------------------------------------------------------------------------------------------
class SAMHeader(dict):
    """A SAM file header, stored as a flat 'TYPE.TAG' -> value table.

    All fields from SAMHeaderFields are initialized to their defaults;
    the caller supplies the mandatory SQ sequence name and length, and
    may override any other field via 'TYPE.TAG' keyword keys.
    """
    def __init__(self, seqName, seqLen, **keys):
        # setup initial header: every known field starts at its default
        self.fields = dict()
        for record in SAMHeaderFields:
            type, tag, req, default, desc = record
            self.setField( type, tag, default )
        self.setField('SQ', 'SN', seqName )
        self.setField('SQ', 'LN', seqLen )
        # add keyword arguments, keyed as 'TYPE.TAG'
        # (items() instead of the Python-2-only iteritems())
        for key, value in keys.items():
            type, tag = key.split('.')
            self.setField( type, tag, value )
    def isReq( self, type, tag ):
        """True when the SAM spec marks this field as required."""
        return SAMHeaderDict[SAMHeaderEncode(type,tag)][0]
    def setField( self, type, tag, value ):
        self.fields[SAMHeaderEncode(type, tag)] = value
    def getField( self, type, tag ):
        return self.fields[SAMHeaderEncode(type, tag)]
    def __str__( self ):
        # Emit one '@TYPE' line per record type, with only the required
        # tag:value pairs (tab separated).
        types = ['HD', 'SQ', 'RG']
        def formatType( type ):
            s = '@' + type + '\t'
            for record in SAMHeaderFields:
                qtype, tag, req, default, desc = record
                if type == qtype and self.isReq(type, tag):
                    value = self.getField(type, tag)
                    if value == None:
                        # Fixed: previously raised the undefined name
                        # 'Error' (a NameError) instead of a real exception.
                        raise ValueError('Unset required SAM header ' + type + ' ' + tag)
                    s += tag + ':' + str(value) + '\t'
            return s
        # '\n'.join works on both Python 2 and 3 (was string.join).
        return '\n'.join(map( formatType, types ))
# SAM alignment FLAG bits (column 2 of an alignment record).
SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no matter whether it is mapped in a pair
SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1
SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped
SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1
SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand)
SAM_MATESTRAND = 0x0020 # strand of the mate 1
SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2
SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2
SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records)

# Map from each flag bit to its description, used by SAMFlagsDescs.
SAM_FLAGS = {
    SAM_SEQPAIRED : 'the read is paired in sequencing, no matter whether it is mapped in a pair',
    SAM_MAPPAIRED : 'the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1',
    SAM_UNMAPPED : 'the query sequence itself is unmapped',
    SAM_MATEUNMAPPED : 'the mate is unmapped 1',
    SAM_QUERYSTRAND : 'strand of the query (0 for forward; 1 for reverse strand)' ,
    SAM_MATESTRAND : 'strand of the mate 1' ,
    SAM_ISFIRSTREAD : 'the read is the first read in a pair 1,2' ,
    SAM_ISSECONDREAD : 'the read is the second read in a pair 1,2',
    SAM_NOTPRIMARY : 'the alignment is not primary (a read having split hits may have multiple primary alignment records)'
}
def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
    """Construct a SAMRecord directly from individual field values."""
    record = SAMRecord()
    record.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals,
                              pairContig, pairPos, insertSize )
    return record
def SAMRecordFromString( str ):
    """Construct a SAMRecord by parsing one SAM text line."""
    record = SAMRecord()
    record.setValuesFromString( str )
    return record
def SAMFlagValue( flags, testFlag ):
    """Return the bits of *testFlag* that are set in *flags* (0 when none)."""
    return flags & testFlag
def SAMFlagIsSet( flags, testFlag ):
    """True when any bit of *testFlag* is set in *flags*.

    Fixed: replaced the removed '<>' operator with '!=' (and inlined the
    bit mask) so the predicate is valid on both Python 2 and 3.
    """
    return (flags & testFlag) != 0
def SAMFlagsDescs( flags ):
    """Return [flagBit, isSet, description] triples for every SAM flag,
    sorted by flag bit value, describing which bits of *flags* are set."""
    def keepMe(p):
        flagKey, flagDesc = p
        return [flagKey, SAMFlagIsSet(flags, flagKey), flagDesc]
    return sorted(map( keepMe, SAM_FLAGS.iteritems() ))
# -----------------------------------------------------------------------------------------------
#
# This is really the meat of the SAM I/O system.
#
# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list
# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion
# of int quals to ASCII encoded quals.
#
# -----------------------------------------------------------------------------------------------
class SAMRecord:
    """One SAM alignment record, stored as a list of fields in SAM column
    order: qname, flags, rname, pos, mapq, cigar, mate contig, mate pos,
    insert size, sequence, ASCII-33 qualities."""
    def __init__( self, vals = []):
        # NOTE(review): the 'vals' argument is accepted but ignored; use
        # setValuesFromArgs / setValuesFromString to populate the record.
        self.vals = []
    def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ):
        # Store fields in SAM column order; integer quals are encoded to
        # ASCII-33 and the sequence is lower-cased.
        self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \
            cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ]
    def setValuesFromString( self, line ):
        #print 'line is', line
        # Parse one whitespace-separated SAM line, converting each column
        # with its expected type; extra trailing columns are dropped.
        formats = [str, int, str, int, int, str, str, int, int, str, str]
        self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] )
    def formatFlags( self, flags ):
        # OR an iterable of SAM_* bit constants into one integer flag word.
        b = reduce( operator.__or__, flags, 0 )
        #print 'FormatFlags', flags, b
        return b
    def getQName(self): return self.vals[0]
    def getFlags(self): return self.vals[1]
    def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...)
    def getPos(self): return self.vals[3]
    def getMapq(self): return self.vals[4]
    def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I"
    def getSeq(self): return self.vals[9]
    def getQuals(self): return self.fromASCII33(self.vals[10])
    def toASCII33( self, quals ):
        # Encode integer qualities to the SAM ASCII-33 string form.
        return string.join( map( lambda q: chr(q+33), quals ), '' )
    def fromASCII33( self, qualStr ):
        # Decode an ASCII-33 quality string back to a list of integers.
        return map( lambda q: ord(q)-33, qualStr)
    def __str__( self ):
        # Tab-separated SAM text form of the record.
        return string.join( map( str, self.vals ), '\t')
# -----------------------------------------------------------------------------------------------
#
# Wrapper class for reading and writing records to a SAM file
#
# -----------------------------------------------------------------------------------------------
class SAMIO:
    """Minimal reader/writer for SAM text files.

    Pass header=None to open *fileName* for reading (the '@' header lines
    are consumed immediately); pass a SAMHeader to open it for writing
    (the header is written immediately).  *func* is applied to every
    record produced during iteration.
    """
    def __init__( self, fileName, header = None, debugging = False, func = None ):
        self.header = header
        self.debugging = debugging
        if func == None:
            func = lambda x: x
        self.func = func
        if self.header == None:
            # Read mode: pushback_file lets readHeader un-read the first
            # non-header line.
            self.fileHandle = pushback_file(fileName, "r")
            self.readHeader()
        else:
            self.fileHandle = file(fileName, "w")
            self.writeHeader()
    def close(self):
        self.fileHandle.close()
    def readHeader(self):
        """Accumulate leading '@' lines into self.header; push the first
        non-header line back onto the stream."""
        self.header = ""
        while True:
            line = self.fileHandle.readline()
            if line.startswith("@"):
                self.header += line
            else:
                self.fileHandle.pushback(line)
                if self.debugging:
                    pass #print str(self.header)
                return True
        # NOTE(review): unreachable -- the loop above always returns True.
        return self.header
    def RecordGenerator( self ):
        # Yield func(record) for each remaining data line in the file.
        for line in self.fileHandle:
            #print 'Reading line', line
            line = line.rstrip("\n")
            yield self.func(SAMRecordFromString(line))
        raise StopIteration
        return
    def __iter__(self):
        return self.RecordGenerator()
    # NOTE(review): this iterator-accepting overload is shadowed by the
    # single-record writeRecord defined right below it (Python keeps only
    # the last definition), so it is dead code.
    def writeRecord( self, recordIterator ):
        for record in recordIterator:
            self.writeRecord(record)
    def writeRecord( self, record ):
        # Write one record as a SAM text line.
        if self.debugging:
            print record
        print >> self.fileHandle, record
        return True
    def writeHeader( self ):
        if self.debugging:
            print str(self.header)
        print >> self.fileHandle, str(self.header)
        return True

View File

@ -1,304 +0,0 @@
#!/util/bin/python
#
#
# This file really needs to be cleaned up and functions need to be improved
#
# (1) utility operations should be moved to a utility file
# (2) Need to add support for windows paired reads, information about where
# reads come from (which file), and n-base pair loci
# (3) Move to C++ based SAM/BAM i/o system
# (4) Figure out a way to get the reference to stream, and use a automatically pickled version
# (5) Add capability to walk over all reference bases, regardless of coverage. Currently
# we only walk over covered bases.
#
#
from optparse import OptionParser
import os
import sys
import random
import string
import os.path
import heapq
from itertools import *
from SAM import *
import SimpleSAM
#import SimulateReads # utility functions should be imported
# Global variable containing command line arguments
OPTIONS = None
#
# Trivial class that added pushback support to a stream
#
class peakStream(object):
    """A simple wrapper around a stream that adds an unget operation.

    Elements pushed back with unget() are returned by next() in FIFO
    order before the underlying stream is consumed again.
    """
    def __init__( self, stream ):
        self.ungot = []   # pushback buffer, consumed from the front
        self.s = stream
    def unget( self, elt ):
        """Put elt at the front of the underlying stream, making the next get call return it"""
        self.ungot.append(elt)
    # we hold the next() operation
    def __iter__( self ): return self
    def next(self):
        """For python iterator support.

        Fixed: 'self.ungot <> []' used the removed '<>' operator, and
        'self.s.next()' only worked on Python 2 iterators; next(self.s)
        works on both.
        """
        if self.ungot:
            return self.ungot.pop(0)
        return next(self.s)
    # Python 3 iterator protocol alias (harmless on Python 2).
    __next__ = next
#
# Returns a single, combined stream of SAM records in chromosome order from a list of
# input streams, each in reference order
#
def rawReadStream( ref, inputs ):
    """Returns a single iterable that returns records in reference order
    from each of the input streams"""
    def includePriorities( stream ):
        # Decorate each record with its position so heapq.merge can order
        # the (already individually sorted) input streams.
        return imap( lambda x: (x.getPos(), x), stream )
    def prunePriorities( stream ):
        # Strip the priority and convert each SAM record to a MappedRead.
        return imap( lambda p: SimpleSAM.MappedReadFromSamRecord(ref, p[1]), stream )
    with_priorities = map( includePriorities, inputs )
    return prunePriorities( heapq.merge( *with_priorities ) )
#
# Just wraps the raw read stream objects in a list as they come out
#
def readStream( ref, inputs ):
    """Returns a single iterable that returns SAM records in reference order
    from each of the input streams"""
    rs = rawReadStream( ref, inputs )
    # Each element is wrapped in a one-item list so record walkers receive
    # the same *args calling convention as loci walkers.
    return imap( lambda x: [x], rs )
#
# More complex stream object. Takes a list of input streams and creates a stream
# returning successive loci covered by the reads in the combined input stream.
#
class lociStream():
    """A streaming data structure that returns reads spanning each covered loci in
    the input reads, offsets into them
    where the bases are equivalent, and the position of the locus.
    """
    def __init__( self, ref, inputs ):
        self.ref = ref
        self.rs = peakStream(rawReadStream( ref, inputs ))
        self.window = []   # reads overlapping the current position
        self.chr = None    # current contig name
        self.pos = None    # next position to emit within self.chr
    def __iter__(self):
        return self
    def pushRead( self, read ):
        self.window.append(read)
    def cleanWindow( self, chr, pos ):
        """Walk over the window of reads, deleting those who no longer cover the current pos"""
        if OPTIONS.debug:
            print 'cleanWindow start:', chr, pos, self.window
        def keepReadP( read ):
            return read.chr == chr and pos >= read.start and pos <= read.end
        self.window = filter( keepReadP, self.window )
        if OPTIONS.debug:
            print 'cleanWindow stop:', chr, pos, self.window
    def expandWindow( self, chr, pos ):
        """Keep reading from the read stream until we've added all reads from the stream covering pos"""
        if OPTIONS.debug:
            print 'expandWindow start:', chr, pos, self.window
        for read in self.rs:
            #print 'read', read, pos
            if read.chr == chr and read.start <= pos and read.end >= pos:
                self.pushRead(read)
            else:
                # First read past pos: put it back for the next locus and
                # stop -- the merged stream is position sorted.
                self.rs.unget( read )
                #self.rs = chain( [read], self.rs )
                break
        if OPTIONS.debug:
            print 'expandWindow stop:', chr, pos, self.window
    #
    # This is the workhorse routine. It constructs a window of reads covering the
    # next locus in the genome, and returns the reference, the contig, its position
    # in the genome, along with a vector of offsets into the list of reads that denote
    # the equivalent base among all the reads and reference.
    #
    def next(self):
        if self.pos <> None:
            self.cleanWindow(self.chr, self.pos) # consume reads not covering pos
            self.expandWindow(self.chr, self.pos) # add reads to window covering pos
        if self.window == []:
            # the window is empty, we need to jump to the first pos of the first read in the stream:
            nextRead = self.rs.next()
            self.pushRead( nextRead )
            self.chr = nextRead.chr
            self.pos = nextRead.start
            return self.next()
        else:
            # at this point, window contains all reads covering the pos, we need to return them
            # and the offsets into each read for this loci
            def calcOffset( read ):
                offset = self.pos - read.start
                return offset
            offsets = map(calcOffset, self.window)
            currPos = self.pos
            self.pos += 1 # we are going to try to get the next position
            return self.ref, self.chr, currPos, offsets, self.window
#
# Reference reader
#
def readRef(referenceFasta):
    """Parse a FASTA file into a {header: lowercased sequence} dict.

    Headers are the '>' lines with the marker and surrounding whitespace
    stripped; sequence lines for one header are concatenated and
    lower-cased.  Fixes: an empty or headerless file previously stored a
    bogus entry under the key None, and a blank line crashed on line[0].
    """
    ref = {}
    header = None
    seq = ''
    for line in open(referenceFasta):
        if line.startswith('>'):
            # Flush the previous record, if any.
            if header is not None:
                ref[header] = seq.lower()
            seq = ''
            header = line[1:].strip()
        else:
            seq += line.strip()
    # Flush the final record; skip when no header was ever seen.
    if header is not None:
        ref[header] = seq.lower()
    #print ref
    return ref
#
# Main() procedure
#
def main():
    """Drive a walker over one or more SAM files.

    Parses options, dynamically imports the walker class named by the
    first positional argument, builds either a per-record or per-locus
    stream over the inputs, and folds every element through the walker's
    filter/map/reduce functions.
    """
    global OPTIONS, ROOT
    # ------------------------------------------------------------
    #
    # Setup command line options
    #
    # ------------------------------------------------------------
    usage = "usage: %prog [options] Walker [sam or bam file or file list]+"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                      type="string", default=None,
                      help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-m", "--maxRecords", dest="maxRecords",
                      type=int, default=None,
                      help="Max number of SAM records to process")
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                      action='store_true', default=False,
                      help="Verbose debugging output?")
    (OPTIONS, args) = parser.parse_args()
    print 'args', args
    if len(args) < 2:
        parser.error("Incorrect number of arguments")
    # ------------------------------------------------------------
    #
    # Dynamically load the walker class (the first cmd line arg)
    # and initialize a walker class object
    #
    #
    # load walkers from standard location
    #
    # ------------------------------------------------------------
    # Convention: the module and the class it contains share the same name.
    walkerName = args[0]
    walkerModule = __import__(walkerName, globals(), locals(), [], -1)
    walkerClass = walkerModule.__dict__[walkerName]
    walker = walkerClass()
    print walkerName
    print walkerModule
    print walkerClass
    print walker
    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print ' ref: ', OPTIONS.reference
    print ' walker: ', walker
    if walker.hasOption('name'):
        print ' -> ', walker.getOption('name')
    if walker.hasOption('desc'):
        print ' -> ', walker.getOption('desc')
    print '------------------------------------------------------------'
    # ------------------------------------------------------------
    #
    # Initialize the engine
    #
    # ------------------------------------------------------------
    # read the reference
    refs = readRef(OPTIONS.reference)
    # create the low-level SAMIO streams
    files = args[1:]
    inputs = map( lambda file: SAMIO( file, debugging=OPTIONS.debug ), files )
    # build the higher level stream object, either by record or by loci
    if walker.isRecordWalker():
        stream = readStream( refs, inputs )
    if walker.isLociWalker():
        stream = lociStream( refs, inputs )
    # ------------------------------------------------------------
    #
    # Move the walker object over the stream
    #
    # For each element in the record or loci stream, invoke the walker
    # object on it. Use filter, map, and reduce to construct the sum
    #
    # ------------------------------------------------------------
    sum = walker.reduceDefault
    counter = 0
    for elt in stream:
        counter += 1
        #print 'processing elt', counter, elt, OPTIONS.maxRecords
        if OPTIONS.maxRecords <> None:
            # Progress report every 10% of maxRecords, then stop at the cap.
            if (counter * 10) % OPTIONS.maxRecords == 0:
                print ' => ', (counter * 100.0) / OPTIONS.maxRecords, '% done'
            if counter > OPTIONS.maxRecords:
                break
        if walker.filterfunc( *elt ):
            x = walker.mapfunc( *elt )
            sum = walker.reducefunc( x, sum )
    print 'Result is', sum
if __name__ == "__main__":
    # Debug toggle: flip to False to run main() without the profiler.
    if True:
        # Profile main() and report the 20 most time-consuming functions,
        # writing raw profile data to the file 'prof'.
        import cProfile, pstats
        cProfile.run('main()', 'prof')
        stats = pstats.Stats('prof')
        stats.sort_stats('time')
        stats.print_stats(20)
    else:
        main()

View File

@ -1,31 +0,0 @@
#!/util/bin/python
#from SamWalk import *
from SimpleSAM import *
import Walker
class SamWalkTest(object, Walker.Walker):
    """Minimal example walker that counts SAM records.

    filterfunc keeps every read, mapfunc maps each read to 1, and reducefunc
    sums the mapped values, so walking a stream with this class yields the
    total number of records.

    NOTE(review): the base-class order (object, Walker.Walker) raises an MRO
    TypeError if Walker.Walker is itself a new-style class -- confirm against
    the Walker module.
    """
    def __init__(self):
        # 'byRecord' walker: the engine feeds one SAM record at a time.
        Walker.Walker.__init__( self, 'byRecord', {}, name = 'SamWalkTest' )
    def filterfunc( self, read ):
        """Return true if you want to keep the read for mapping"""
        return True
        #return datum.getPos() % 2 == 0
    def mapfunc( self, read ):
        """Do something to the read, and return a result for reduction"""
        #print datum
        return 1
        #return len(datum.getSeq())
        #return datum.getSeq() + '/'
        #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())
    #reduceDefault = ''
    def reducefunc( self, x, sum ):
        """Take the result of mapping, and previous reduce results in sum, and combine them"""
        #print x
        return x + sum

View File

@ -1,69 +0,0 @@
from SAM import *
def read2align( refSeq, readSeq, chr, pos, cigar ):
    """Build the (reference, read) alignment-string pair for a mapped read.

    Looks up the contig named *chr* in *refSeq*, slices out the stretch of
    reference bases covered by *readSeq* starting at *pos*, and returns that
    slice together with the read bases.  *cigar* is currently ignored.

    NOTE: only correct for reads without indels -- the reference window is
    taken to be exactly len(readSeq) bases long.
    """
    span = len(readSeq)
    referenceWindow = refSeq[chr][pos:pos + span]
    return referenceWindow, readSeq
def MappedReadFromSamRecord(ref, r):
    """Convert a SAM record *r* into a MappedRead object.

    Temporary shim while waiting for Ted's C++ SAM/BAM io system to come
    online.

    BUG FIX: the original passed the bound method ``r.getCigar`` (missing
    call parentheses) to read2align instead of the CIGAR string itself; the
    accessor is now called.
    """
    refAlign, readAlign = read2align(ref, r.getSeq(), r.getRname(), r.getPos(), r.getCigar())
    return MappedRead( r.getQName(), r.getFlags(), r.getRname(), r.getPos(), r.getMapq(),
                       r.getCigar(), r.getSeq(), refAlign, readAlign, r.getQuals() )
class MappedRead(object):
    """Higher level object representing a mapped read.
    Attempts to be more accessible than a raw SAM record, doing a lot of processing on the record w.r.t.
    the reference to make analysis of the reads easier. This is the fundamental data structure
    used to represent reads through the walker engine"""
    def __init__( self, qname, flags, chr, pos, mapq, cigar, readBases, refAlign, readAlign, qualsByBase ):
        # basic information about the read
        self.qname = qname
        self.mapq = mapq
        self.cigar = cigar
        self.bases = readBases # actual bases of reads, unaligned, may be fw or reverse
        self.quals = qualsByBase # as a vector of floats, by base position
        # dealing with position information, more than a standard record Hasbro
        self.chr = chr
        self.start = pos
        self.readlen = len(readBases)
        # end is inclusive: last reference base covered by the alignment
        self.end = pos + len(refAlign) - 1
        # print 'refAlign', refAlign, len(refAlign)
        # dealing with stand
        # Decode the SAM flag bitfield into convenience booleans; the
        # SAM_* constants come from the SAM module.
        self.flags = flags
        # NOTE(review): forward == SAM_QUERYSTRAND set; in standard SAM the
        # strand bit set usually means *reverse* -- confirm against the SAM
        # module's constant definitions.
        self.isForwardStrand = SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        self.isReverseStrand = not SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        # NOTE(review): isPrimary is True when SAM_NOTPRIMARY is set, which
        # looks inverted -- verify intended semantics.
        self.isPrimary = SAMFlagIsSet(flags, SAM_NOTPRIMARY)
        self.isUnmapped = SAMFlagIsSet(flags, SAM_UNMAPPED)
        self.isPaired = SAMFlagIsSet(flags, SAM_SEQPAIRED)
        self.isMapPaired = SAMFlagIsSet(flags, SAM_MAPPAIRED)
        self.isFirstReadInPair = SAMFlagIsSet(flags, SAM_ISFIRSTREAD)
        self.isSecondReadInPair = SAMFlagIsSet(flags, SAM_ISSECONDREAD)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATEUNMAPPED)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATESTRAND)
        # everything will be reverse complemented already
        self.refAlign = refAlign #
        self.readAlign = readAlign # all should be forward strand
    def __str__(self):
        return '<MappedRead %s:%d-%d>' % (self.chr, self.start, self.end)
    __repr__ = __str__

View File

@ -1,754 +0,0 @@
#!/util/bin/python
from optparse import OptionParser
import os
import sys
#import set
import random
import string
from SAM import *
def all(iterable):
    """Compatibility shim for the all() builtin (pre-Python-2.5).

    True when every element of *iterable* is truthy; an empty iterable
    yields True.  Deliberately shadows the builtin of the same name.
    """
    for item in iterable:
        if item:
            continue
        return False
    return True
def any(iterable):
    """Compatibility shim for the any() builtin (pre-Python-2.5).

    True as soon as one truthy element is seen; an empty iterable yields
    False.  Deliberately shadows the builtin of the same name.
    """
    found = False
    for item in iterable:
        if item:
            found = True
            break
    return found
# Global options namespace, populated by OptionParser.parse_args() in main().
OPTIONS = None
# Canonical lowercase DNA alphabet accepted by the mutation simulator.
bases = set(['a', 't', 'g', 'c'])
class Mutation:
    """Representation of a change from one sequence to another.

    A mutation lives on a contig at a position and has one of three types:
    SNP, Insertion, or Deletion.  `original` and `mutated` hold the affected
    bases as strings (one side may be empty for indels).

    Examples:
        Mutation('chr20', 123, Mutation.SNP, 'a', 't')        a->t at 123
        Mutation('chr20', 123, Mutation.INSERTION, '', 'atg') insert 3bp
        Mutation('chr20', 123, Mutation.DELETION, 'atg', '')  delete 3bp

    FIX: replaced the Python-3-removed string.join(['-'] * n, '') idiom with
    the equivalent '-' * n in execute().
    """
    SNP = "SNP"
    INSERTION = "Insertion"
    DELETION = "Deletion"
    TYPES = set([SNP, INSERTION, DELETION])
    # CIGAR operation character for each mutation type.
    CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }
    def __init__( self, contig, pos, type, original, mutated ):
        self.contig = contig          # contig of the mutation
        self._pos = pos               # position of the change in contig coordinates
        self._type = type             # one of TYPES
        assert(self._type in Mutation.TYPES)
        self.original = original      # String of the original genotype
        self.mutated = mutated        # String of the mutated genotype
    def pos(self, offset=0):
        """Position of the mutation, optionally shifted left by *offset*."""
        return self._pos - offset
    def type(self): return self._type
    def __len__(self):
        """Number of bases affected (max of original/mutated lengths)."""
        return max( len(self.mutated), len(self.original) )
    def mutString( self ):
        """Short human-readable description: 'P:a->t', 'I:atg', or 'D:atg'."""
        if self._type == Mutation.SNP:
            return 'P:%s->%s' % (self.original, self.mutated)
        elif self._type == Mutation.INSERTION:
            return 'I:%s' % (self.mutated)
        elif self._type == Mutation.DELETION:
            return 'D:%s' % (self.original)
        else:
            raise Exception('Unexpected mutation type ' + self._type)
    def CIGARChar( self ):
        return Mutation.CIGARChars[self.type()]
    def __str__( self ):
        return '%s:%d %s' % (self.contig, self._pos, self.mutString())
    def __repr__( self ):
        return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())
    def execute( self, offset, refseq, mutseq ):
        """Apply this mutation in place to the aligned (refseq, mutseq) pair.

        Both sequences are mutable (list / MutableSeq) and are kept equal in
        length by padding with '-' where one side gains bases.  *offset* is
        the contig coordinate of the first base of the window.  Returns the
        (refseq, mutseq) pair.
        """
        p = self.pos(offset)
        if self.type() == Mutation.SNP:
            mutseq[p] = self.mutated
        elif self.type() == Mutation.INSERTION:
            # gap out the reference and splice the inserted bases into mutseq
            refseq[p:p] = '-' * len(self)
            mutseq[p:p] = self.mutated
        elif self.type() == Mutation.DELETION:
            # gap out the deleted span in mutseq
            mutseq[p:p+len(self)] = '-' * len(self)
            # NOTE(review): extends both sequences with self.mutated, which is
            # usually empty for a deletion as constructed upstream; preserved
            # from the original logic.
            refseq.extend(self.mutated)
            mutseq.extend(self.mutated)
        #print refseq, mutseq
        return refseq, mutseq
# ---------------------------------------------------------------------------
#
# Code for dealing with CIGAR strings
#
# ---------------------------------------------------------------------------
def flatten(l):
    """Recursively flatten nested lists/tuples into one flat list."""
    flat = []
    for element in l:
        if not isinstance(element, (list, tuple)):
            flat.append(element)
        else:
            flat.extend(flatten(element))
    return flat
def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
    """Compute the CIGAR string for a simulated read.

    readStart  -- read start offset within the mutated alignment window
    alignStart -- contig coordinate where the alignment window begins
    readLen    -- length of the read
    mutations  -- list of Mutation objects present in the window

    A single non-SNP mutation yields a (left)M (mut)I-or-D (right)M CIGAR
    with zero-length parts dropped; any number of SNPs yields '<readLen>M';
    mixing multiple non-SNP mutations is unsupported and raises.

    FIXES: replaced the Python-3-removed '<>' operator and string.join();
    materialized the filter as a list comprehension.
    """
    if len(mutations) == 1 and mutations[0].type() != Mutation.SNP:
        mut = mutations[0]
        internalPos = mut.pos() - alignStart
        leftLen = max(internalPos - readStart, 0)
        # clamp the indel footprint to what the read actually covers
        mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
        rightLen = readLen - leftLen - mutLen
        #print mut, readStart, alignStart, internalPos, leftLen, mutLen, rightLen
        parts = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ]
        parts = [ p for p in parts if p[0] > 0 ]
        return ''.join(map(str, flatten(parts)))
    if not all( map( lambda m: m.type() == Mutation.SNP, mutations ) ):
        # bail
        raise Exception('Currently only supports multiple SNPs CIGARS')
    return str(readLen) + 'M'
# def mutationsCIGARBAD( refStart, readStart, readStop, mutations ):
# sortedMutations = sorted(mutations, key=Mutation.pos)
# CIGAR = []
# pos = readStart
#
# for mutation in sortedMutations:
# internalMutationStart = mutation.pos() - refStart
# if internalMutationStart >= readStart and internalMutationStart < readStop
# delta = mutation.pos() - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if mutation.type() <> Mutation.SNP and delta > 0:
# CIGAR.append([delta, 'M'])
# pos = mutation.pos()
#
# if mutation.type == Mutation.INSERTION:
# CIGAR.append(['I', len(mutation)])
# if mutation.type == Mutation.DELETION:
# CIGAR.append(['D', len(mutation)])
#
# delta = refSeqPos + refLen - pos
# if not OPTIONS.quiet:
# print 'mutationsCIGAR', pos, CIGAR, delta, mutation
# if delta > 0:
# CIGAR.append([delta, 'M'])
#
# s = string.join(map( str, flatten(CIGAR) ), '')
# print 'CIGAR:', CIGAR, s
# return s
# ---------------------------------------------------------------------------
#
# mutating the reference
#
# ---------------------------------------------------------------------------
def executeMutations( mutations, seqIndex, refseq, mutseq ):
    """Apply each Mutation in *mutations* to the aligned (refseq, mutseq)
    pair, where *seqIndex* is the contig coordinate of the first base of the
    window.  Returns the updated (refseq, mutseq).

    FIX: dropped the unused 'internalSite' local computed per iteration.
    """
    for mutation in mutations:
        refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
    return refseq, mutseq
def isBadSequenceForSim(seq):
    """True when *seq* (a Biopython Seq) contains any character outside the
    lowercase DNA alphabet in the module-level `bases` set, i.e. it cannot
    be used for read simulation."""
    for letter in seq.tostring().lower():
        if letter not in bases:
            return True
    return False
def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
    """Cut a window of the reference around *mutSite* and mutate it.

    ref         -- whole reference sequence (Biopython Seq)
    contig      -- contig name recorded in the generated Mutation objects
    mutSite     -- contig coordinate of the primary mutation
    mutType     -- 'NONE', 'SNP', 'INSERTION' or 'DELETION'
    mutParams   -- unused (kept for interface compatibility)
    nBasesToPad -- read length; the window spans roughly 2*nBasesToPad bases

    Returns (good, mutations, refsub, mutseq): a success flag, the list of
    Mutation objects applied, and the mutable reference/mutated window pair.
    Fails (False, [], None, None) when the window runs off the left edge or
    contains non-ACGT characters.
    """
    #print 'mutateReference'
    # start and stop
    refStartIndex = mutSite - nBasesToPad + 1
    internalStartIndex = nBasesToPad - 1
    # SNP/NONE windows are one base wider than indel windows
    if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
        refStopIndex = mutSite + nBasesToPad
    else:
        refStopIndex = mutSite + nBasesToPad - 1
    refsub = ref[refStartIndex : refStopIndex]
    mutseq = refsub.tomutable()
    refsub = refsub.tomutable()
    mutations = []
    def success():
        return True, mutations, refsub, mutseq
    def fail():
        return False, [], None, None
    if refStartIndex < 0:
        return fail()
    if isBadSequenceForSim(refsub):
        #print 'rejecting seq', refsub.tostring()
        #print map( lambda b: b not in bases, refsub.tostring().lower() )
        return fail()
    if not OPTIONS.quiet:
        print ref
        print refsub
    # candidate positions for the extra mutations (mutationDensity > 1)
    otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
    # print 'otherSites', otherSites
    for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
        internalSite = site - refStartIndex
        if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
            pass
        elif mutType == 'SNP':
            # pick a random base different from the reference base
            mutBase = ref[site].lower()
            otherBases = bases - set(mutBase)
            otherBase = random.choice(list(otherBases))
            assert mutBase <> otherBase
            if not OPTIONS.quiet:
                print mutBase, ' => ', otherBase
            mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
        elif mutType == 'INSERTION':
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
            mutations.append( mutation )
        elif mutType == 'DELETION':
            # NOTE(review): inSeq is computed but unused here, and the
            # Mutation is built with original='' (so mutString prints 'D:')
            # while the deleted bases are passed as 'mutated' -- looks
            # swapped; preserved as-is.
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
            mutations.append( mutation )
        else:
            raise Exception('Unexpected mutation type', mutType)
    # process the mutations
    refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )
    return success()
# version 1.0 -- prototype
# def mutateReference(ref, mutSite, mutType, mutParams, nBasesToPad):
# #print 'mutateReference'
# refStartIndex = mutSite - nBasesToPad + 1
# refStopIndex = mutSite + nBasesToPad
# internalStartIndex = nBasesToPad - 1
#
# if refStartIndex < 0:
# return False, None, None
#
# refsub = ref[refStartIndex : refStopIndex]
# print ref
# print refsub
#
# if mutType == 'SNP' or mutType == 'None':
# #print ' In IF'
# mutBase = ref[mutSite]
# bases = set(['A', 'T', 'G', 'C'])
# if mutBase in bases:
# otherBases = bases - set(mutBase)
# otherBase = random.choice(list(otherBases))
# assert mutBase <> otherBase
# mutseq = refsub.tomutable()
#
# if mutType == 'SNP':
# print mutBase, ' => ', otherBase
# mutseq[internalStartIndex] = otherBase
#
# print refsub
# print mutseq
# align = Bio.Align.Generic.Alignment(Gapped(IUPAC.unambiguous_dna, '-'))
# align.add_sequence("ref", refsub.tostring())
# align.add_sequence("mut", mutseq.tostring())
# print str(align)
#
# return True, refsub, mutseq
#
# return False, None, None
# ---------------------------------------------------------------------------
#
# sample reads from alignments
#
# ---------------------------------------------------------------------------
def accumulateBases( mutSeq, start, readLen ):
    """Collect *readLen* non-gap bases from *mutSeq* beginning at *start*.

    Scans forward counting only non-'-' characters until readLen bases have
    been gathered (or the sequence ends).  Returns (read, stop) where *read*
    is the gap-free base string and *stop* is the exclusive end index of the
    scanned window.

    FIX: replaced the Python-3-removed string.join(filter(...)) with a join
    over a comprehension (the gap test is inlined from the trivial notDash
    helper).
    """
    count = 0
    for stop in range(start, len(mutSeq)):
        if mutSeq[stop] != '-':
            count += 1
            #print stop, count
            if count == readLen:
                break
    stop += 1
    read = ''.join([ c for c in mutSeq[start:stop] if c != '-' ])
    return read, stop
# doesn't support paired reads
#
# def sampleReadsFromAlignment(refSeq, mutSeq, alignStart, readLen, nReads, mutations):
# #print 'REF:', refSeq.tostring()
# #print 'MUT:', mutSeq.tostring()
# #print ''
#
# # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
# # right in the middle
# lastGoodStartSite = readLen - 1
# if not OPTIONS.quiet:
# print refSeq.tostring(), 'ref'
# print mutSeq.tostring(), 'mut'
#
# # we can potentially start at any site in mutSeq
# nPotentialStarts = len(mutSeq) - readLen + 1
# potentialStarts = range(nPotentialStarts)
#
# if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
# if OPTIONS.uniqueReadsOnly:
# starts = potentialStarts
# else:
# starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
# else:
# starts = random.sample(potentialStarts, nPotentialStarts)
#
# #print 'potentialStarts', potentialStarts
# #print 'Starts', starts
#
# def sampleRead( start ):
# if mutSeq[start] <> '-':
# read, stop = accumulateBases( mutSeq, start, readLen )
# remaining = len(mutSeq) - stop
# refForRead = refSeq[start:stop]
# #print 'XXXX', start, stop, refForRead, read
# cigar = mutationsCIGAR( alignStart, readLen, mutations )
# if not OPTIONS.quiet:
# leftPads = string.join(start * ['-'], '')
# rightPads = string.join(remaining * ['-'], '')
# print leftPads + mutSeq.tostring()[start:stop] + rightPads, start, stop, cigar
# return filter(notDash, read), alignStart + start, cigar
# else:
# return False, False, False
#
# reads = map( sampleRead, starts )
# return [read for read in reads if read[0] <> False]
class refSite:
    """Bundle describing one simulated alignment window: the reference
    slice, its mutated counterpart, the contig coordinate where the window
    starts, and the Mutation objects applied to it."""
    def __init__( self, refSeq, mutSeq, alignStart, mutations ):
        self.mutations = mutations
        self.alignStart = alignStart
        self.mutSeq = mutSeq
        self.refSeq = refSeq
def sample1Read( refSite, start, readLen, reverseComplementP = False ):
    """Sample one read of *readLen* bases from *refSite* starting at *start*.

    Returns a 5-tuple (good, read, pos, cigar, ppReadStr): success flag, the
    gap-free read bases (reverse complemented when reverseComplementP), the
    contig position, the CIGAR string, and a '-'-padded pretty-print string.
    Fails with (False, None, None, None, None) when *start* sits on a gap.
    """
    if refSite.mutSeq[start] <> '-':
        read, stop = accumulateBases( refSite.mutSeq, start, readLen )
        mutSubSeq = refSite.mutSeq[start:stop]
        if reverseComplementP:
            s = Seq( read, IUPAC.unambiguous_dna )
            #print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
            read = s.reverse_complement().tostring()
            # NOTE(review): relies on old Biopython MutableSeq.complement()
            # mutating mutSubSeq in place (return value is discarded) so the
            # pretty-print below shows the complemented strand -- confirm.
            mutSubSeq.complement()
        remaining = len(refSite.mutSeq) - stop
        refForRead = refSite.refSeq[start:stop]
        #print 'XXXX', start, stop, refForRead, read
        cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
        leftPads = string.join(start * ['-'], '')
        rightPads = string.join(remaining * ['-'], '')
        ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
        # filter() on a str returns a str in Python 2, stripping any gaps
        return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
    else:
        return False, None, None, None, None
def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
    """Sample reads (or read pairs) from a mutated alignment window.

    wholeRef      -- full fasta record (used only for paired pretty-print)
    refSeq/mutSeq -- reference window and its mutated counterpart
    alignStart    -- contig coordinate of the window start
    nReads        -- string: a count, or 'tile' for every possible start
    pairedOffset  -- distance to the mate window (paired mode)
    mutations2/refSeq2/mutSeq2 -- mate-window equivalents (may be None)

    Returns a list of [(read1, pos1, cigar1), (read2, pos2, cigar2)] pairs;
    in unpaired mode the second element is (None, 0, None).  Starts that
    land on a gap are dropped.
    """
    #print 'REF:', refSeq.tostring()
    #print 'MUT:', mutSeq.tostring()
    #print ''
    # pairedSite, mutations2, refSeq2, mutSeq2 can all be None
    refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
    refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )
    #print refSite1.alignStart, refSite2.alignStart, pairedOffset
    # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, which the mutation
    # right in the middle
    lastGoodStartSite = readLen - 1
    if not OPTIONS.quiet:
        if OPTIONS.generatePairedEnds:
            r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
            print r.tostring(), 'ref'
            print r.complement().tostring(), 'ref, complement'
        else:
            print refSeq.tostring(), 'ref'
            print mutSeq.tostring(), 'mut'
    # we can potentially start at any site in mutSeq
    nPotentialStarts = len(mutSeq) - readLen + 1
    potentialStarts = range(nPotentialStarts)
    #print 'potentialStarts', potentialStarts
    if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
        if OPTIONS.uniqueReadsOnly:
            starts = potentialStarts
        else:
            # sample with replacement: duplicates allowed
            starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
    else:
        starts = random.sample(potentialStarts, int(nReads))
    #print 'Starts', starts
    def sampleRead( start ):
        # sample the forward read and, in paired mode, its reversed mate
        good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )
        if OPTIONS.generatePairedEnds:
            good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
            if not OPTIONS.quiet:
                offsetDashes = string.join(['-'] * (pairedOffset), '')
                print
                print ppstr1 + offsetDashes, pos1, cigar1
                print offsetDashes + ppstr2, pos2, cigar2
            return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
        else:
            if not OPTIONS.quiet:
                print ppstr1, pos1, cigar1
            return [(read1, pos1, cigar1), (None, 0, None)]
    reads = map( sampleRead, starts )
    # drop pairs whose first read failed (started on a gap)
    return [read for read in reads if read[0][0] <> None]
def notDash( c ):
    """True when character *c* is not the '-' gap/pad symbol.

    FIX: replaced the Python-3-removed '<>' operator with '!='.
    """
    return c != '-'
def fakeQuals( seq ):
    """Fabricate a flat quality vector: one Q30 value per base of *seq*."""
    return [30] * len(seq)
def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
    """Package one sampled read (and optionally its mate) as SAM records.

    siteID/readID/fastaID are used to build the query name
    '<fastaID>:<siteID>.read.<readID>'.  Returns [record1, record2];
    record2 is None in unpaired mode.
    """
    rname = fastaID
    # constant mapping quality for all simulated reads
    mapq = 40
    quals = fakeQuals( read )
    qname = '%s:%d.read.%d' % (fastaID, siteID, readID)
    if not OPTIONS.generatePairedEnds:
        # not in paired mode
        flags = []
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
        record2 = None
    else:
        flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
        insertSize = pos2 - pos - len(read)
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
        # NOTE(review): the mate record reuses 'quals' computed from read
        # (not read2); harmless while quals are flat Q30 but verify.
        record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )
        #print 'read1, read2', record1.getSeq(), record2.getSeq()
    return [record1, record2]
# ---------------------------------------------------------------------------
#
# paired end reads
#
# ---------------------------------------------------------------------------
def pairedReadSite( ref, leftMutSite, readLen ):
    """Pick the mate-pair site for a read ending at leftMutSite + readLen.

    Draws an insert size from N(OPTIONS.insertSize, OPTIONS.insertDev) and
    returns (True, site) when the mate window fits inside *ref* and contains
    only simulatable bases; otherwise (False, None).
    """
    offset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))
    if offset >= 0:
        site = offset + leftMutSite + readLen
        #print 'pairedSite', offset, leftMutSite, site
        if site + readLen < len(ref):
            window = ref[site - readLen:site + readLen + 1]
            if not isBadSequenceForSim( window ):
                return True, site
    return False, None
# ---------------------------------------------------------------------------
#
# build allowed regions and sampling starts from region
#
# ---------------------------------------------------------------------------
def parseSamplingRange( arg, ref, readLen ):
    """Parse the -x/--range restriction into a list of [start, stop] regions.

    *arg* may be a single 'start:stop' string, the name of a file holding
    one such range per line, or None (meaning the whole reference minus one
    read length).  Returns a list of [start, stop] integer pairs.

    FIXES: '<>' replaced for Python 3; the bare 'except:' now catches only
    Exception (KeyboardInterrupt/SystemExit propagate); map/filter results
    are materialized as lists; the range file is closed via 'with'.
    """
    def one(x):
        # 'start:stop' -> [start, stop]; blank lines -> False
        elts = x.split()
        #print 'elts', elts
        if len(elts) > 0:
            return [int(field) for field in elts[0].split(':')]
        else:
            return False
    if arg is not None:
        try:
            return [ one(arg) ]
        except Exception:
            # not a literal range: treat arg as a file of ranges, one per line
            with open(arg) as rangeFile:
                return [ r for r in map(one, rangeFile.readlines()) if r ]
    else:
        return [[0, len(ref.seq) - readLen]]
def sampleStartSites( sampleRanges, nsites ):
    """Choose up to *nsites* read start positions from the allowed ranges.

    sampleRanges -- list of [start, end) pairs from parseSamplingRange
    nsites       -- requested number of sites

    When the ranges contain no more than *nsites* positions, every position
    is used (tiling); otherwise *nsites* positions are sampled uniformly
    without replacement.  Returns the chosen sites (a set/xrange/list).
    """
    print 'sampleStartSites', sampleRanges, nsites
    # build a set of potential sites
    if len(sampleRanges) > 1:
        potentialSites = set()
        for start, end in sampleRanges:
            #print 'potentialSites', start, end, potentialSites
            potentialSites = potentialSites.union(set(xrange(start, end)))
    else:
        # single range: keep the lazy xrange, no need to materialize
        start, end = sampleRanges[0]
        potentialSites = xrange(start, end)
    #print 'potentialSites', potentialSites
    # choose sites from potentials
    if len(potentialSites) <= nsites:
        # tile every site
        sites = potentialSites
    else:
        # we need to sample from the potential sites
        sites = random.sample( potentialSites, nsites )
    # print 'Sites', sites
    print 'Number of potential start sites:', len(potentialSites)
    print 'Number of start sites that will be generated:', len(sites)
    return sites
def readRef(referenceFasta):
    """Generator over the fasta records in *referenceFasta* (via SeqIO),
    logging each record's id, name and length as it is yielded.

    NOTE(review): because this is a generator, handle.close() only runs if
    the caller exhausts it completely.
    """
    handle = open(referenceFasta)
    for seq_record in SeqIO.parse(handle, "fasta"):
        print seq_record.id
        print seq_record.name
        #print repr(seq_record.seq)
        print len(seq_record.seq)
        yield seq_record
    handle.close()
import os.path
def outputFilename():
    """Derive the output file root and SAM filename from OPTIONS.

    Encodes the simulation parameters (mutation type and density/indel
    size, read length, site count, coverage, range, pairedness) into a
    filename rooted at OPTIONS.outputPrefix, e.g.
    '<ref>__mut.SNP_den.1_36.bp_nsites.1_cov.1_range.None_paired.No'.
    Returns (root, root + '.sam').

    FIX: replaced the Python-3-removed string.join with str.join.
    """
    root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0]
    if OPTIONS.generatePairedEnds:
        pairedP = 'Yes'
    else:
        pairedP = 'No'
    params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \
              ['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]]
    filename = root + '__' + '_'.join([ '.'.join(map(str, p)) for p in params ])
    root = os.path.join(OPTIONS.outputPrefix, filename)
    return root, root + '.sam'
def main():
    """Command-line entry point for the read simulator.

    Parses options, seeds the RNG, then for every fasta record in the
    reference: picks mutation sites, mutates a window around each site,
    samples reads (or read pairs) from the mutated alignment, and writes
    SAM, fasta, quals, ref-list and mutation-log outputs whose names encode
    the simulation parameters (see outputFilename).
    """
    global OPTIONS, ROOT
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--ref", dest="reference",
                      type="string", default=None,
                      help="Reference DNA seqeunce in FASTA format")
    parser.add_option("-o", "--outputPrefix", dest="outputPrefix",
                      type="string", default='',
                      help="Output Prefix SAM file")
    parser.add_option("-c", "--coverage", dest="coverage",
                      type="string", default='1',
                      help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.")
    parser.add_option("-n", "--nsites", dest="nSites",
                      type=int, default=1,
                      help="Number of sites to generate mutations in the reference")
    parser.add_option("-l", "--readlen", dest="readLen",
                      type=int, default=36,
                      help="Length of reads to generate")
    parser.add_option("-s", "--seed", dest="seed",
                      type=int, default=123456789,
                      help="Random number seed. If 0, then system clock is used")
    parser.add_option("-t", "--muttype", dest="mutationType",
                      type="string", default='None',
                      help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion")
    parser.add_option("-e", "--mutdensity", dest="mutationDensity",
                      type=int, default=1,
                      help="Number of mutations to introduce from the reference")
    parser.add_option("-x", "--range", dest="range",
                      type="string", default=None,
                      help="Sampling range restriction, in the form of x:y without a space")
    parser.add_option("-u", "--unique", dest="uniqueReadsOnly",
                      action='store_true', default=True,
                      help="Assumes that the user wants unique reads generated. If the coverage is greater than the number of unique reads at the site, the region is just tiled")
    parser.add_option("-i", "--indelsize", dest="indelSize",
                      type=int, default=0,
                      help="Size in bp of the insertion or deletion to introduce")
    # Paired end options
    parser.add_option("", "--paired", dest="generatePairedEnds",
                      action='store_true', default=False,
                      help="Should we generate paired end reads?")
    parser.add_option("", "--insertSize", dest="insertSize",
                      type=int, default=280,
                      help="Size in bp of the paired library insertion")
    parser.add_option("", "--insertDev", dest="insertDev",
                      type=int, default=5,
                      help="Standard deviation in the size in bp of the paired library insertion")
    parser.add_option("-q", "--quiet", dest="quiet",
                      action='store_true', default=False,
                      help="Be quiet when generating output")
    parser.add_option("-d", "--debug", dest="debug",
                      action='store_true', default=False,
                      help="Verbose debugging output?")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")
    root, outputSAM = outputFilename()
    # seed the RNG (seed 0 means: use the system clock)
    if OPTIONS.seed == 0:
        random.seed(None)
    else:
        random.seed(OPTIONS.seed)
    OPTIONS.mutationType = OPTIONS.mutationType.upper()
    print '------------------------------------------------------------'
    print 'program:', sys.argv[0]
    print ''
    print ' ref: ', OPTIONS.reference
    print ' output: ', outputSAM
    print ''
    print ' mutation type: ', OPTIONS.mutationType
    print ' mutation density: ', OPTIONS.mutationDensity
    print ' indel size: ', OPTIONS.indelSize
    print ' readLen: ', OPTIONS.readLen
    print ' nSites: ', OPTIONS.nSites
    print ' coverage: ', OPTIONS.coverage
    print ' range restriction: ', OPTIONS.range
    print ''
    print ' paired ends?: ', OPTIONS.generatePairedEnds
    print ' insert size: ', OPTIONS.insertSize
    print ' insert stddev: ', OPTIONS.insertDev
    print ''
    print ' Debugging?: ', OPTIONS.debug
    print '------------------------------------------------------------'
    if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1:
        raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType)
    readLen = OPTIONS.readLen
    fastaRecords = [seq for seq in readRef(OPTIONS.reference)]
    # the SAM header is built from the first reference record only
    header = SAMHeader( fastaRecords[0].id, len(fastaRecords[0].seq) )
    SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug )
    mutationsout = open(root + '.mutations.txt', 'w')
    qualsout = open(root + '.quals.txt', 'w')
    refout = open(root + '.ref', 'w')
    fastaout = open(root + '.fasta', 'w')
    if OPTIONS.generatePairedEnds:
        fastaout2 = open(root + '.pairs.fasta', 'w')
        qualsout2 = open(root + '.pairs.quals.txt', 'w')
    counter = 0
    refLen = 0
    for ref in fastaRecords:
        refLen = len(ref.seq)
        # write the crazy ref file info needed by samtools
        print >> refout, ref.id + '\t' + str(refLen)
        sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen )
        sites = sampleStartSites( sampleRanges, OPTIONS.nSites )
        for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ):
            #print 'Sampling site:', mutSite, refLen
            #mutSite = readLen-1
            # progress report roughly every 1% of sites
            nSitesSelected = max(int(round(len(sites)/100.0)), 1)
            #print siteCounter, len(sites), nSitesSelected)
            if siteCounter % nSitesSelected == 0:
                print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites))
            good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen)
            pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None
            if good and OPTIONS.generatePairedEnds:
                # the mate window only ever carries SNPs; indels stay on the left read
                if OPTIONS.mutationType == 'SNP':
                    pairedMutType = 'SNP'
                else:
                    pairedMutType = 'NONE'
                good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen )
                print 'pairedReadSite', good, pairedSite, good
                if good:
                    good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen)
            #print 'Good', good, mutations, refSeq, mutSeq
            if good:
                print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t')
                #print 'Mutations', mutations
                refSeqPos = mutSite - readLen + 1
                readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2)
                for i in range(len(readPairs)):
                    counter += 1
                    read1, read2 = readPairs[i]
                    seq, pos, cigar = read1
                    seq2, pos2, cigar2 = read2
                    # write out the sam and fasta files
                    def write1( record, recordi ):
                        if record <> None:
                            SAMout.writeRecord( record )
                            sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')]
                            #print sequences
                            # even index -> first-of-pair outputs, odd -> mate outputs
                            if recordi % 2 == 0:
                                localFastaOut, localQualsOut = fastaout, qualsout
                            else:
                                localFastaOut, localQualsOut = fastaout2, qualsout2
                            SeqIO.write(sequences, localFastaOut, "fasta")
                            print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ')
                    #print i, read1, read2
                    records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 )
                    map( write1, records, range( len(records) ) )
    SAMout.close()
    qualsout.close()
    fastaout.close()
    refout.close()
    mutationsout.close()
    if OPTIONS.generatePairedEnds:
        qualsout2.close()
        fastaout2.close()
# for record in SAMIO( outputSAM ):
# print record
if __name__ == "__main__":
    # Biopython is imported lazily here so merely importing this module does
    # not require Bio; the names become module globals used by readRef,
    # sample1Read, mutateReference and main's write1.
    import Bio
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import IUPAC, Gapped
    main()

View File

@ -1,194 +0,0 @@
#!/usr/bin/env python
import getopt, sys, os, string
from farm_commands import *
#FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py"
FastaQuals2Fastq_exe = "FastaQuals2Fastq.py"
def isFastaB(filename):
    """Is *filename* already a fastb file (i.e. has the '.fastb' extension)?"""
    extension = os.path.splitext(filename)[1]
    return extension == '.fastb'
def readListOfLanes( listFile ):
    """Read the first whitespace-delimited token of every line in *listFile*.

    Used to load the list of lane/FASTA files to process; any extra columns
    on a line are ignored.

    FIXES: replaced the Python-3-removed string.split/string.strip module
    functions with str methods; the file is now closed via 'with'.
    """
    with open(listFile) as fh:
        return [ line.split()[0] for line in fh ]
def run_swmerlin(input_file, input_head, farm=""):
    """Convenience wrapper: run Merlin in Smith-Waterman mode (see run_merlin)."""
    run_merlin(input_file, input_head, farm, sw=True)
def run_merlin(input_file, input_head, farm="", sw=False):
    """Align *input_file* with Merlin against the hg18 reference.

    Converts FASTA input to fastb first (skipped when the fastb already
    exists, unless the module-level regenExistingFiles is set), then submits
    the Merlin command via cmd(), farming to LSF queue *farm* when given.

    sw = Merlin Smith-Waterman option
    """
    if isFastaB(input_file):
        input_fastb = input_file
    else:
        input_fastb = input_head+".fastb"
        if regenExistingFiles or not os.path.exists(input_fastb):
            cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
    if sw:
        output_head = input_head+".swmerlin"
    else:
        output_head = input_head+".merlin"
    cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head
    if sw:
        cmd_str += " SW=True"
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
USE_BATCH = False
def run_ilt(input_file, input_head, farm=""):
    """Align *input_file* with ImperfectLookupTable against hg18.

    Converts FASTA input to fastb first (cached unless regenExistingFiles).
    When the module-level USE_BATCH is set, the batched lookup wrapper
    script is used instead of a single ILT invocation.
    """
    #print 'isFastaB', input_file, isFastaB(input_file)
    if isFastaB(input_file):
        input_fastb = input_file
    else:
        input_fastb = input_head+".fastb"
        if regenExistingFiles or not os.path.exists(input_fastb):
            cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands)
    output_head = input_head+".ilt"
    if USE_BATCH:
        cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head
        cmd(cmd_str, False, input_head, just_print_commands=justPrintCommands)
    else:
        cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head
        cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
def run_BWA32(input_file, input_head, farm=""):
    """Convenience wrapper: run BWA with a 32bp seed (see run_BWA)."""
    run_BWA(input_file, input_head, farm, 32)
def run_BWA(fasta, input_head, farm="", seed='inf'):
    """Align *fasta* with BWA (via the bwahuman samse wrapper).

    Builds a fastq from the fasta + quals pair first (cached unless
    regenExistingFiles); a finite *seed* is appended to the output head so
    different seed lengths do not collide.
    """
    output_head = input_head+".bwa"
    quals = input_head+".quals.txt"
    fastq = input_head+".fastq"
    if regenExistingFiles or not os.path.exists(fastq) :
        cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
        cmd(cmd_str)
    if seed <> 'inf':
        output_head += str(seed)
    output_sam = output_head + ".sam"
    cmd_str = "bwahuman samse "+str(seed)+" "+fastq+" "+output_sam
    cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands)
def run_MAQ(input_fasta, head, farm=""):
    """Align *input_fasta* with MAQ 0.7.1 against the hg18 bfa reference.

    Builds fastq then bfq intermediates (each cached unless
    regenExistingFiles), then submits 'maq map' via cmd(), farming to LSF
    queue *farm* when given.
    """
    maq_exe = "/seq/dirseq/maq-0.7.1/maq"
    bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa"
    fasta = input_fasta
    quals = head+".quals.txt"
    fastq = head+".fastq"
    if regenExistingFiles or not os.path.exists(fastq) :
        cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq
        cmd(cmd_str)
    bfq = head+".bfq"
    if regenExistingFiles or not os.path.exists(bfq):
        cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq, just_print_commands=justPrintCommands )
    out_head = head+".maq"
    maq_out = out_head+".out.aln.map"
    cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq
    cmd(cmd_str, farm, out_head, just_print_commands=justPrintCommands)
def usage():
    """Print command-line help for this script to stdout."""
    print "Required arguments:"
    print " -i Input FASTA head (*.fasta, *.qualb)"
    print " OR"
    print " -d Directory to grab all FASTA files from"
    print " OR"
    print " -l List of FASTA/FASTB files to process"
    print
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print
    print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)"
    print
    print " -x Don't execute commands, just print them"
    print
    print " -w Output files to current directory (strip path from input file/dir/list"
    print
def get_all_fasta_files(fasta_dir):
    """Return paths of every non-empty *.fasta file directly inside fasta_dir,
    in os.listdir order."""
    entries = os.listdir(fasta_dir)
    prefix = fasta_dir if fasta_dir.endswith("/") else fasta_dir + "/"
    result = []
    for name in entries:
        if not name.endswith(".fasta"):
            continue
        path = prefix + name
        # Skip zero-byte placeholder files.
        if os.path.getsize(path) > 0:
            result.append(path)
    return result
# Global flags consulted by the run_* helpers above; flipped by the
# -x / -r command-line options in the __main__ block.
justPrintCommands = False   # -x: echo commands instead of executing them
regenExistingFiles = False  # -r: rebuild outputs even if they already exist
if __name__ == "__main__":
    # Driver: run one or more mappers over a FASTA file, a directory of
    # FASTA files, or a list file of lanes.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw:r", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "regenExistingFiles", "outputInWorkingDirectory"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    input_head = ""
    fasta_dir = ""
    mapper_str = "all"
    farm_sub = False
    listOfLanes = None
    outputInWorkingDirectory = False
    # NOTE(review): "w:" in the getopt spec declares -w as taking an
    # argument, but it is handled as a boolean flag below -- confirm intent.
    for opt, arg in opts:
        print opt, arg
        if opt in ("-i", "--input"):
            input_head = arg
        if opt in ("-l", "--listOfLanes"):
            listOfLanes = arg
        if opt in ("-d", "--fasta_dir"):
            fasta_dir = arg
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-r", "--regenExistingFiles"):
            regenExistingFiles = True
        if opt in ("-m", "--mapper"):
            mapper_str = arg
        if opt in ("-x", "--dontexe"):
            justPrintCommands = True
        if opt in ("-w", "--outputInWorkingDirectory"):
            outputInWorkingDirectory = True
    # Exactly one input source (-i, -d or -l) is required.
    if (input_head == "") and (fasta_dir == "") and (listOfLanes == None):
        print input_head, fasta_dir, listOfLanes
        usage()
        sys.exit(2)
    # Select function(s) for mapper
    mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ, "bwa":run_BWA, "bwa32":run_BWA32}
    if mapper_str.lower() == "all":
        mapper_list = mapper_func_list.values()
    else:
        mapper_list = [mapper_func_list.get(mapper_str.lower())]
    if mapper_list == [None]:
        sys.exit("Don't know of mapper argument: "+mapper_str)
    if input_head:
        input_heads = [None]
        # NOTE(review): 'fasta' is appended with no dot -- probably meant
        # input_head + '.fasta'; confirm before reuse.
        input_files = [input_head + 'fasta']
    elif listOfLanes <> None:
        # NOTE(review): readListOfLanes is not defined in this file's
        # visible portion -- presumably imported/defined elsewhere.
        input_files = readListOfLanes(listOfLanes)
        input_heads = [None] * len(input_files)
    else:
        input_files = [file for file in get_all_fasta_files(fasta_dir)]
        input_heads = [None] * len(input_files)
    # Run every selected mapper over every input file.
    for input_file, input_head in zip(input_files, input_heads):
        if input_head == None:
            # Derive the output head from the input file name.
            file_head = os.path.splitext(input_file)[0]
            if outputInWorkingDirectory:
                file_head = os.path.split(file_head)[1]
        else:
            file_head = input_head
        for mapper in mapper_list:
            mapper( input_file, file_head, farm=farm_sub )
            print

View File

@ -1,92 +0,0 @@
#!/usr/bin/env python
import getopt, sys, os, string
from farm_commands import *
def picardCMD(name, **keywords):
    """Build a Picard-style command line: NAME KEY=VALUE KEY=VALUE ...

    Keyword arguments become KEY=VALUE tokens (values run through str()),
    appended in the keyword dict's iteration order.
    """
    cmd = name
    # .items() works on both Python 2 and 3; .iteritems() was Python-2-only.
    for key, value in keywords.items():
        cmd += ' ' + key + "=" + str(value)
    return cmd
def spawnValidationJob(input_file, output_head, farm, maxErrors):
    """Submit (or run locally) one ValidateSAM job for a SAM/BAM file.

    Skips the job when <output_head>.stdout already exists, unless the
    regenExistingFiles global forces a rerun.
    """
    output_file = output_head + '.stdout'
    if not regenExistingFiles and os.path.exists(output_file):
        return
    command = picardCMD("ValidateSAM", I=input_file, M=maxErrors)
    # When running locally (no farm queue) capture stdout ourselves.
    if farm == "":
        command = command + " > " + output_file
    cmd(command, farm, output_head, just_print_commands=justPrintCommands)
def usage():
    # Print command-line help for the SAM/BAM validation batch driver.
    print "Required arguments:"
    print " -d Directory to grab all sam/bam files from"
    print
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print
    print " -m MAXERRORS Maximum number of errors to detect before aborting"
    print
def get_all_sam_files(dir):
    """Recursively collect every *.sam / *.bam file under dir
    (extension match is case-insensitive)."""
    matches = []
    wanted = ('.sam', '.bam')
    for dirpath, dirnames, filenames in os.walk(dir):
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            if ext.lower() in wanted:
                matches.append(os.path.join(dirpath, filename))
    return matches
def output_filename(input_file):
    """Flatten a path into a dotted validation-output file name.

    e.g. "/seq/foo/bar.bam" -> "seq.foo.bar.bam.validation".
    """
    # '<>' and filter(lambda ...) were Python-2-isms; a comprehension is
    # equivalent and also valid Python 3.
    parts = [p for p in input_file.split("/") if p.strip() != '']
    # Debug trace of the path components (print() with one argument behaves
    # the same under Python 2 and 3).
    print(parts)
    return ".".join(parts) + ".validation"
# Global flags consulted by spawnValidationJob; flipped by command-line
# options in the __main__ block.
justPrintCommands = False   # echo commands instead of executing them
regenExistingFiles = False  # -r: rerun jobs whose output already exists
if __name__ == "__main__":
    # Batch driver: run ValidateSAM over every SAM/BAM found under -d DIR.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d:f:m:r", ["dir","farm","maxErrors", "regenExistingFiles"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    dir = ""
    mapper_str = "all"
    farm_sub = False
    maxErrors = 1000
    for opt, arg in opts:
        print opt, arg
        if opt in ("-d", "--dir"):
            dir = arg
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-m", "--maxErrors"):
            # NOTE(review): kept as a string (no int() conversion); it is only
            # interpolated into the command line, so this works but is untyped.
            maxErrors = arg
        if opt in ("-r", "--regenExistingFiles"):
            regenExistingFiles = True
    # -d is mandatory.
    if dir == "":
        usage()
        sys.exit(2)
    input_files = get_all_sam_files(dir)
    print 'Processing files: N=', len(input_files)
    for input_file in input_files:
        print ' ->', input_file
    # Launch one validation job per discovered file.
    for input_file in input_files:
        output_file = output_filename( input_file )
        print input_file, "=>", output_file
        spawnValidationJob( input_file, output_file, farm_sub, maxErrors )

View File

@ -1,66 +0,0 @@
import farm_commands
import os.path
import sys
import getopt
# GATK walkers run when no -c option is given.
defaultCommands = ['CountReads', 'Pileup']
def usage():
    # Print command-line help for this batch GATK driver.
    print "Optional arguments:"
    print " -f QUEUE Farm jobs to QUEUE on LSF"
    print " -c cmd1,cmd2 Walkers to execute, otherwise", ' '.join(defaultCommands)
    print " -e Ignore existing files", ' '.join(defaultCommands)
if __name__ == "__main__":
    # Batch driver: run one or more GATK walkers over each lane listed in a
    # file.  Positional args: args[0] = lane list (one SAM/BAM per line,
    # optionally followed by a reference path); args[1] = output directory.
    opts = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:c:a:e", ["farm", "commands", "args", "ignoreExistingFiles"])
    except getopt.GetoptError:
        print sys.argv
        usage()
        sys.exit(2)
    farm_sub = False
    commandsList = defaultCommands
    ignoreExistingFiles = False
    extraArgs = ''
    for opt, arg in opts:
        if opt in ("-f", "--farm"):
            farm_sub = arg
        if opt in ("-c", "--commands"):
            commandsList = arg.split(',')
        if opt in ("-e", "--ignoreExistingFiles"):
            ignoreExistingFiles = True
        if opt in ("-a", "--args"):
            extraArgs = arg
    directory = args[1]
    for line in open(args[0]):
        lineParts = line.split()
        lane = lineParts[0].strip()
        # Default to the hg18 reference unless the lane line names one.
        ref = '/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta'
        if ( len(lineParts) > 1 ):
            ref = lineParts[1].strip()
        if not os.path.exists(lane):
            print 'Input SAM/BAM file: "', lane, '" does not exist, skipping...'
            continue
        head, lane_filename = os.path.split(lane)
        filebase = os.path.splitext(lane_filename)[0]
        # make the pileup
        # 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam
        # 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam
        # convert the fasta
        # Run each requested walker, skipping outputs that already exist
        # (unless -e was given).
        for analysis in commandsList:
            output = os.path.join(directory, filebase + '.' + analysis + '.output')
            if ignoreExistingFiles or not os.path.exists(output):
                cmd = "java -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T " + analysis + " -I " + lane + " -R " + ref + " -o " + output + " -l INFO " + extraArgs
                print cmd
                farm_commands.cmd(cmd, farm_sub, output)

View File

@ -1,201 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
# NOTE(review): appears unused in this script -- likely copy-paste residue
# from the batch GATK driver; verify before removing.
defaultCommands = ['CountReads', 'Pileup']
def indexBAM(reads, queue=None):
    """Build a .bai index for a BAM file via `samtools index`,
    optionally on the given farm queue."""
    command = "samtools index " + reads
    farm_commands.cmd(command, queue, None,
                      just_print_commands=OPTIONS.justPrint)
def readIntervalFile(file):
    """Parse a Picard-style .interval_list into {contig: [[start, stop], ...]}.

    SAM-style header lines (first token beginning with '@') are skipped;
    intervals are kept in file order per contig.

    Fixes over the original: '<>' and the module-level string.split function
    were Python-2-only, and the file handle was never closed.
    """
    byContig = {}
    with open(file) as fh:
        for raw in fh:
            tokens = raw.split()
            # Skip header lines such as "@HD ..." / "@SQ ...".
            if tokens[0][0] == '@':
                continue
            elt = [int(tokens[1]), int(tokens[2])]
            byContig.setdefault(tokens[0], []).append(elt)
    #print byContig
    return byContig
def isIntervalFile(filename):
    """True when filename carries the Picard '.interval_list' extension."""
    ext = os.path.splitext(filename)[1]
    return ext == '.interval_list'
def filterByInterval(intervalMap, unfilteredPileupFile, filteredPileupFile):
    """Stream a pileup file, writing only lines whose position falls inside
    one of the per-contig [start, stop] intervals of intervalMap.

    Assumes both the pileup and each contig's interval list are sorted by
    position; offsetMap tracks, per contig, the first interval not yet
    passed, so the scan is a single merge-like pass.
    """
    print 'Intervals', intervalMap
    def maybeWrite1(line, offsetMap, out):
        # Write `line` to `out` iff its position is inside the current
        # interval for its contig; advance past intervals already passed.
        debug = False
        parts = line.split()
        contig = parts[0]
        pos = int(parts[1])
        if debug: print 'pileup', contig, pos, line,
        if contig in intervalMap:
            intervals = intervalMap[contig]
            while (offsetMap[contig] < len(intervals)):
                offset = offsetMap[contig]
                #print intervals[offset]
                curStart, curStop = intervals[offset][0:2]
                if debug: print contig, pos, curStart, curStop
                if pos >= curStart:
                    if pos <= curStop:
                        # Inside the current interval: keep the line.
                        #print line,
                        print >> out, line,
                        break
                    else:
                        # Past this interval: move the cursor forward.
                        if debug: print 'Progressing', contig, pos, curStart, curStop
                        offsetMap[contig] += 1
                else:
                    # Before the current interval: nothing to emit yet.
                    break
        return offsetMap
    def allDone( offsetMap, intervalMap ):
        # True once every contig's interval list has been fully consumed.
        def oneDone( contig ):
            return offsetMap[contig] >= len(intervalMap[contig])
        return all( map(oneDone, offsetMap.iterkeys()) )
    i = 0
    offsetMap = dict([(key, 0) for key in intervalMap.keys()])
    out = open(filteredPileupFile, 'w')
    for line in open(unfilteredPileupFile):
        offsetMap = maybeWrite1(line, offsetMap, out)
        i += 1
        # Check for early exit only every 10k lines to keep the scan cheap.
        if i % 10000 == 0 and allDone(offsetMap, intervalMap):
            break
    out.close()
def main():
    """Stage validation inputs and run the GATK ValidatingPileup walker.

    For each line "<reads.bam> <ref.fasta> <region>" of the validate file:
    link the BAM into the data dir, index it, subset it to the region,
    build a samtools pileup (interval-filtered when the region is an
    .interval_list), then launch the GATK comparison job.
    """
    global OPTIONS, ROOT
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--file", dest="validateInfoFile",
                      type="string", default=None,
                      help="File name of the validation set")
    parser.add_option("-d", "--dataDir", dest="dataDir",
                      type="string", default=None,
                      help="Directory to the data files")
    parser.add_option("-o", "--outputDir", dest="outputDir",
                      type="string", default=None,
                      help="Directory to put the output")
    parser.add_option("-q", "--farmQueue", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to submit jobs to. Leave blank for local processing")
    parser.add_option("-e", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores the existing files")
    parser.add_option("-a", "--rebuildAllFiles", dest="rebuildAllFiles",
                      action='store_true', default=False,
                      help="If provided, all intermediate files (BAM and pileups) will be regenerated")
    parser.add_option("-n", "--dontValidate", dest="dontValidate",
                      action='store_true', default=False,
                      help="If provided, won't try to validate, just make sure the validation data files are ready")
    parser.add_option("-g", "--gatk", dest="gatkPath",
                      type="string", default="~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar",
                      help="Path to the gatk")
    parser.add_option("-x", "--args", dest="extraArgs",
                      type="string", default="",
                      help="Extra args to pass to the GATK")
    parser.add_option("-p", "--justPrint", dest="justPrint",
                      action='store_true', default=False,
                      help="Don't actually run GATK, just setup data files")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 0:
        parser.error("incorrect number of arguments")
    if OPTIONS.validateInfoFile == None:
        parser.error("f option is required")
    if OPTIONS.dataDir == None:
        parser.error("d option is required")
    if OPTIONS.outputDir == None:
        parser.error("o option is required")
    # Create the working directories if needed.
    if not os.path.exists(OPTIONS.outputDir):
        os.mkdir(OPTIONS.outputDir)
    if not os.path.exists(OPTIONS.dataDir):
        os.mkdir(OPTIONS.dataDir)
    for line in open(OPTIONS.validateInfoFile):
        if line.strip() == "" or line[0] == '#':
            # comment line
            continue
        # line is of the format:
        # <reads.bam> <ref.fasta> <processingRegion>
        originalReads, ref, region = line.split()
        originalReads = os.path.expanduser(originalReads)
        ref = os.path.expanduser(ref)
        if not os.path.exists(originalReads) or not os.path.exists(ref):
            print 'Input files do not exist!', originalReads, ref
            sys.exit(1)
        head, read_filename = os.path.split(originalReads)
        filebase = os.path.splitext(read_filename)[0]
        # Derived paths inside the data/output directories.
        reads = os.path.join(OPTIONS.dataDir, read_filename)
        readsIndex = reads + '.bai'
        subBAM = os.path.join(OPTIONS.dataDir, filebase + '.selected.bam')
        pileup = os.path.join(OPTIONS.dataDir, filebase + '.selected.pileup')
        validationOutput = os.path.join(OPTIONS.outputDir, filebase + '.validate.output')
        #print 'reads', reads
        # Stage 1: link + index the source BAM inside the data directory.
        if not os.path.exists(reads) or OPTIONS.rebuildAllFiles:
            farm_commands.cmd("ln -s " + os.path.abspath(originalReads) + " " + reads, just_print_commands=OPTIONS.justPrint)
        if not os.path.exists(readsIndex) or OPTIONS.rebuildAllFiles:
            indexBAM(reads)
        # Stage 2: restrict to the requested region.  '*' or an interval
        # list means "whole file" -- just link the BAM and its index.
        if not os.path.exists(subBAM) or OPTIONS.rebuildAllFiles:
            if region == '*' or isIntervalFile(region):
                farm_commands.cmd("ln -s " + os.path.abspath(reads) + " " + subBAM, just_print_commands=OPTIONS.justPrint)
            else:
                cmd = "samtools view -b " + reads + " " + region + " > " + subBAM
                farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
        subBAMIndex = subBAM+'.bai'
        if not os.path.exists(subBAMIndex) or OPTIONS.rebuildAllFiles:
            if region == '*' or isIntervalFile(region):
                farm_commands.cmd("ln -s " + os.path.abspath(readsIndex) + " " + subBAMIndex, just_print_commands=OPTIONS.justPrint)
            else:
                indexBAM(subBAM)
        # Stage 3: build the consensus pileup, interval-filtered when needed.
        if not os.path.exists(pileup) or OPTIONS.rebuildAllFiles:
            if isIntervalFile(region):
                # NOTE(review): despite its name this variable holds the
                # *unfiltered* pileup path, and the samtools command that
                # would create it is commented out below -- so the file must
                # already exist.  Confirm before relying on this path.
                filteredPileup = pileup + ".unfiltered"
                if not os.path.exists(filteredPileup) or OPTIONS.rebuildAllFiles:
                    cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + filteredPileup
                    #farm_commands.cmd(cmd, None, None)
                filterByInterval(readIntervalFile(region), filteredPileup, pileup)
            else:
                cmd = "samtools pileup -cf " + ref + " " + subBAM + " > " + pileup
                farm_commands.cmd(cmd, None, None, just_print_commands=OPTIONS.justPrint)
        # Stage 4: launch the GATK ValidatingPileup comparison job.
        if not OPTIONS.dontValidate and (not os.path.exists(validationOutput) or OPTIONS.ignoreExistingFiles):
            print validationOutput, 'does not exist'
            analysis = "ValidatingPileup"
            cmd = "java -ea -Xmx1024m -jar " + OPTIONS.gatkPath + " -T " + analysis + " -I " + subBAM + " -R " + ref + " -l INFO -S SILENT -U " + OPTIONS.extraArgs
            if isIntervalFile(region):
                cmd += " --intervals " + region
            cmd += " -B pileup,SAMPileup," + pileup
            print cmd
            farm_commands.cmd(cmd, OPTIONS.farmQueue, outputFile=validationOutput, just_print_commands=OPTIONS.justPrint)

View File

@ -1,58 +0,0 @@
#!/util/bin/python
import Walker
from SimpleSAM import *
class WalkLociTest(object, Walker.Walker):
    """Demonstration locus walker: accepts every locus and counts them
    (mapfunc returns 1 per site; reducefunc sums).

    NOTE(review): inheriting from both object and the classic-style
    Walker.Walker is a Python-2-ism -- confirm the intended MRO.
    """
    def __init__(self):
        Walker.Walker.__init__( self, 'byLoci', {}, name = 'WalkLociTest' )
        self.debug = False  # set True for verbose per-locus tracing
    def filterfunc( self, *data ):
        # Accept every locus.
        return True
        #return datum.getPos() % 2 == 0
    def mapfunc( self, ref, chrom, pos, offsets, reads ):
        if self.debug:
            print '------------------------------'
            print ' locus', chrom, pos
            print ' offsets = ', offsets
            print ' reads = ', reads
        #
        # This is the heart of the test walker. You've got the reference, contig,
        # pos, offsets, and reads. The line below generate pile-ups
        #
        def getField(field):
            # Pull each read's per-offset attribute value (e.g. bases, quals).
            return map( lambda r, o: r.__getattribute__(field)[o], reads, offsets )
        def toASCII33( quals ):
            # Phred quality ints -> ASCII-33 (Sanger FASTQ) string.
            return ''.join( map( lambda q: chr(q+33), quals ))
        if False:
            # actually do some useful work
            refbase = ref[chrom][pos-1]
            bases = ''.join(getField('bases'))
            quals = toASCII33(getField('quals'))
            print chrom, pos, refbase, len(reads), all(map(lambda b: b == refbase, bases)), bases, quals
        if self.debug:
            for offset, read in zip( offsets, reads ):
                if self.debug:
                    print ' offset, read', offset, read
                    print read.bases[offset], read.quals[offset]
        #print datum
        return 1
        #return len(datum.getSeq())
        #return datum.getSeq() + '/'
        #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq())
    #reduceDefault = ''
    def reducefunc( self, x, sum ):
        # Accumulate the per-locus counts.
        #print x
        return x + sum

View File

@ -1,88 +0,0 @@
class Walker:
    """Represents a functional walking object to be applied to SAM records.
    Actually useful programs will inherit from this walker object to implement
    one or more of the blank operators provided by this module. Effectly, this
    engine applies a walker object in a standard order to each record in a SAM/BAM
    file:

        walker = MyWalker(args)
        samRecords = parseReadsInFiles( files ):
        results = []
        foreach record in files:
            if walker.filterfunc( record ):
                x = walker.mapfunc( record )
                results.append(x)
        reduced = reduce( walker.reducefunc, results, walker.reduceDefault )

    where:

        filterfunc( data ) -> returns true or false if the data should be
            included or excluded from mapping
        mapfunc( data ) -> applied to each record, locus, or pair of records, depending
            on the walker type
        reducefunc( x, sum ) -> combines partial results x with the accumulating
            results sum across all the data
    """
    def __init__(self, walkerType, options, name = None, desc = None ):
        """Record the walker type/name/desc into the options dict.

        Note: the caller's `options` dict is stored by reference and
        mutated by setOption.
        """
        self.options = options
        self.setOption( 'walkerType', walkerType )
        self.setOption( 'name', name )
        self.setOption( 'desc', desc )
        # print(x) with a single argument is equivalent to the old Python 2
        # print statement, and also valid Python 3.
        print(self.options)
    # Initial accumulator handed to reduce().
    reduceDefault = 0
    #
    # walker types
    #
    def isRecordWalker( self ):
        return self.getOption('walkerType') == 'byRecord'
    def isPairedRecordWalker( self ):
        return self.getOption('walkerType') == 'byRecordPairs'
    def isLociWalker( self ):
        return self.getOption('walkerType') == 'byLoci'
    #
    # Option processing
    #
    def getOption( self, flag ):
        # Returns None (rather than raising) for unknown flags.
        if self.hasOption( flag ):
            return self.options[flag]
        else:
            return None
    def setOption( self, flag, value ):
        self.options[flag] = value
    def hasOption(self, flag):
        return flag in self.options
    def optionEnabled(self, flag):
        # Present AND not explicitly False ('<>' was Python-2-only syntax;
        # '!=' is equivalent).
        return flag in self.options and self.options[flag] != False
    def optionDisabled(self, flag):
        return not self.optionEnabled(flag)
    #
    # Default functions -- subclasses override these.
    #
    def filterfunc( self, datum ):
        # Default: include everything.
        return True
    def mapfunc( self, datum ):
        # Default: identity.
        return datum
    def reducefunc( self, mapResult, sum ):
        # Default: count the mapped results.
        return sum + 1
if __name__ == "__main__":
    # NOTE(review): the original called main(), which is not defined anywhere
    # in this module and raised NameError when run as a script.  The module
    # only provides the Walker base class, so running it directly is a no-op.
    pass

View File

@ -1,87 +0,0 @@
#!/usr/bin/env python
import string, sys
class aln_record:
    "Stores one record of data from a MAQ .aln.txt file"
    # Column names of MAQ's 16-field alignment text output, in order.
    field_names = (
        "read name",
        "chromosome",
        "position",
        "strand",
        "insert size from the outer coorniates of a pair",
        "paired flag",
        "mapping quality",
        "single-end mapping quality",
        "alternative mapping quality",
        "number of mismatches of the best hit",
        "sum of qualities of mismatched bases of the best hit",
        "number of 0-mismatch hits of the first 24bp",
        "number of 1-mismatch hits of the first 24bp on the reference",
        "length of the read",
        "read sequence",
        "sequence quality"
    )
    # Width used to right-align labels in __str__.
    max_field_name_len = max(map(len, field_names))

    def __init__(self, obj, parse = True):
        """Parse one .aln.txt line; parse=False keeps every field a string."""
        self.fields = []
        if isinstance(obj, str):
            self.setValuesFromString( obj, parse )
        else:
            raise TypeError("aln_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line, parse = True ):
        if parse:
            formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
            # Build an explicit list so self.fields stays indexable: the
            # original's bare 2-arg map() is a lazy iterator on Python 3,
            # and on Python 2 it padded short lines with None (crashing int()).
            self.fields = [f(v) for f, v in zip(formats, line.split())]
        else:
            self.fields = line.split()

    def __str__(self):
        # One "label : value" line per field, labels right-aligned.
        s = ""
        for n,v in zip(aln_record.field_names, self.fields):
            s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
        return s
        #return string.join( map( str, self.fields ), ' ')

    # Field accessors; offset() is 0-based, pos() is MAQ's 1-based position.
    def id(self): return self.fields[0]
    def contig(self): return self.fields[1]
    def offset(self): return self.fields[2]-1
    def pos(self): return self.fields[2]
    # Quality of read mapping (only maq gives this field)
    def map_qual(self): return self.fields[6]
    #def offset_end(self): return self.fields[8]
    #def pos_end(self): return self.fields[8]+1
    #def linear_start(self): return self.fields[9]
class aln_file:
    """Iterable over the aln_record entries of a MAQ .aln.txt file."""
    def __init__(self, filename, parse=True):
        self.filename = filename
        self.parse = parse
        # NOTE(review): the handle is held for the object's lifetime and
        # never explicitly closed.
        self.faln = open(self.filename)

    def RecordGenerator(self):
        # Yield one parsed record per line.
        for line in self.faln:
            yield aln_record(line, self.parse)
        # FIX: the original ended with `raise StopIteration`, which inside a
        # generator is a RuntimeError since PEP 479 (Python 3.7).  Falling
        # off the end terminates iteration identically on all versions.

    def __iter__(self):
        return self.RecordGenerator()
if __name__ == "__main__":
    # Smoke test: dump every record of the .aln.txt file named on the
    # command line.
    if len(sys.argv) != 2:
        print "To test aln_file class:\naln_file.py ALN_FILE"
    else:
        count = 0
        for aln in aln_file(sys.argv[1]):
            print aln
            count += 1
            #if count > 5:
            # break

View File

@ -1,87 +0,0 @@
#!/usr/bin/env python
import string, sys
class aln_record:
    "Stores one record of data from a MAQ .aln.txt file"
    # Column names of MAQ's 16-field alignment text output, in order.
    field_names = (
        "read name",
        "chromosome",
        "position",
        "strand",
        "insert size from the outer coorniates of a pair",
        "paired flag",
        "mapping quality",
        "single-end mapping quality",
        "alternative mapping quality",
        "number of mismatches of the best hit",
        "sum of qualities of mismatched bases of the best hit",
        "number of 0-mismatch hits of the first 24bp",
        "number of 1-mismatch hits of the first 24bp on the reference",
        "length of the read",
        "read sequence",
        "sequence quality"
    )
    # Width used to right-align labels in __str__.
    max_field_name_len = max(map(len, field_names))

    def __init__(self, obj, parse = True):
        """Parse one .aln.txt line; parse=False keeps every field a string."""
        self.fields = []
        if isinstance(obj, str):
            self.setValuesFromString( obj, parse )
        else:
            raise TypeError("aln_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line, parse = True ):
        if parse:
            formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str]
            # Build an explicit list so self.fields stays indexable: the
            # original's bare 2-arg map() is a lazy iterator on Python 3,
            # and on Python 2 it padded short lines with None (crashing int()).
            self.fields = [f(v) for f, v in zip(formats, line.split())]
        else:
            self.fields = line.split()

    def __str__(self):
        # One "label : value" line per field, labels right-aligned.
        s = ""
        for n,v in zip(aln_record.field_names, self.fields):
            s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v))
        return s
        #return string.join( map( str, self.fields ), ' ')

    # Field accessors; offset() is 0-based, pos() is MAQ's 1-based position.
    def id(self): return self.fields[0]
    def contig(self): return self.fields[1]
    def offset(self): return self.fields[2]-1
    def pos(self): return self.fields[2]
    # Quality of read mapping (only maq gives this field)
    def map_qual(self): return self.fields[6]
    #def offset_end(self): return self.fields[8]
    #def pos_end(self): return self.fields[8]+1
    #def linear_start(self): return self.fields[9]
class aln_file:
    """Iterable over the aln_record entries of a MAQ .aln.txt file."""
    def __init__(self, filename, parse=True):
        self.filename = filename
        self.parse = parse
        # NOTE(review): the handle is held for the object's lifetime and
        # never explicitly closed.
        self.faln = open(self.filename)

    def RecordGenerator(self):
        # Yield one parsed record per line.
        for line in self.faln:
            yield aln_record(line, self.parse)
        # FIX: the original ended with `raise StopIteration`, which inside a
        # generator is a RuntimeError since PEP 479 (Python 3.7).  Falling
        # off the end terminates iteration identically on all versions.

    def __iter__(self):
        return self.RecordGenerator()
if __name__ == "__main__":
    # Smoke test: dump every record of the .aln.txt file named on the
    # command line.
    if len(sys.argv) != 2:
        print "To test aln_file class:\naln_file.py ALN_FILE"
    else:
        count = 0
        for aln in aln_file(sys.argv[1]):
            print aln
            count += 1
            #if count > 5:
            # break

View File

@ -1,37 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
if __name__ == "__main__":
    # For each output file given on the command line, report the first/last
    # "chr:pos" seen and the fraction of the -s region covered so far.
    usage = "usage: %prog files"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--size", dest="regionSize",
        type="int", default=None, help="")
    (OPTIONS, args) = parser.parse_args()
    def extract(line):
        # Parse "chr:pos ..." -> (chr, int(pos)); (None, None) if the first
        # token is not of that form.
        s = line.split()
        t = s[0].split(":")
        if len(t) == 2:
            chr, pos = t
            return chr, int(pos)
        else:
            return None, None
    for file in args:
        chr, lastPos, startPos, calledBases, progress = None, None, None, None, None
        lastLine = None
        # Scan the whole file, remembering the first parsable position and
        # the final line.
        for line in open(file):
            if startPos == None:
                chr, startPos = extract(line)
            lastLine = line
        if lastLine <> None and startPos <> None:
            lastPos = extract(lastLine)[1]
            calledBases = lastPos - startPos
            # Percent of the -s region covered between first and last call.
            progress = "%.2f" % (float(calledBases) / OPTIONS.regionSize * 100)
        print file, chr, startPos, lastPos, calledBases, progress

View File

@ -1,591 +0,0 @@
#!/usr/bin/env python
import sys, string
import os
import re
from itertools import *
from optparse import OptionParser
from memo import DiskMemoize, time_func
class ref_genome:
    """Reads reference genome in FASTA format into a dict"""
    # NOTE(review): sequences are stored on the *class* attribute chr_offset
    # (45 slots, indexed by contig load order), so every instance shares the
    # most recently loaded genome.
    def __init__(self, ref_genome_file):
        ref_genome.chr_offset = [[] for i in range(45)]
        chr_id = 0
        seq = ""
        for line in open(ref_genome_file):
            if line.startswith(">"):
                # Echo each contig header as a progress indicator.
                print line[1:],
                if line.startswith(">chrM"): # skip first > line
                    continue
                # A new header: store the previous contig's sequence and
                # start accumulating the next one.
                ref_genome.chr_offset[chr_id] = seq
                chr_id += 1
                seq = " " # make it 1 indexed instead of 0 indexed
                #if chr_id > 2:
                # break
            else:
                seq += line.rstrip().upper()
        # Store the final contig.
        ref_genome.chr_offset[chr_id] = seq
    def __getitem__(self, key):
        # Index by contig id (see AffyChr2Index for the name -> id mapping).
        return ref_genome.chr_offset[key]
# Short Affy-style chromosome name -> index into ref_genome.chr_offset
# (MT=0, '1'..'22' numeric, X=23, Y=24).
AffyChr2Index = dict()
for i in range(1,23):
    AffyChr2Index[str(i)] = i
AffyChr2Index['MT'] = 0
AffyChr2Index['X'] = 23
AffyChr2Index['Y'] = 24
class GenotypeCall:
    """One genotype call at a genomic site ('chr:pos') with a confidence LOD.

    Equality and hashing are by site only, so calls at the same position
    compare equal regardless of genotype.
    """
    # NOTE(review): the class-level reference genome is commented out; the
    # refbase()/ref_het_hom() paths only work if GenotypeCall.ref is
    # assigned first.
    #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
    def __init__( self, chr, pos, genotype, snpP, lod ):
        self.chr = chr
        self.pos = int(pos)
        self._site = chr + ':' + str(self.pos)
        self._isSNP = snpP
        # Canonical diploid genotype string, e.g. "A/C": bases upper-cased
        # and sorted so heterozygote order is deterministic.  ('/'.join on a
        # generator replaces the Python-2-only string.join/string.upper.)
        self.genotype = '/'.join(b.upper() for b in sorted(genotype))
        self.lod = lod
    def refbase(self):
        # Requires GenotypeCall.ref (see NOTE above).
        return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos]
    def __hash__(self):
        return hash(self._site)
    def __eq__(self, other):
        return self._site == other._site
    def site(self): return self._site
    def isSNP(self): return self._isSNP
    def ref_het_hom(self):
        """Classify the call: 0 = hom-ref, 1 = het, 2 = hom non-ref."""
        if self.genotype[0] != self.genotype[2]:
            return 1 # het(erozygous non-ref)
        else:
            # homozygous something
            # BUG FIX: the original compared against the *bound method*
            # self.refbase (always unequal to a base character); the call
            # was clearly intended.
            if self.genotype[0] == self.refbase():
                return 0 # ref
            else:
                return 2 # hom(ozygous non-ref)
    def isHET(self): return self.genotype[0] != self.genotype[2]
    def isHOM(self): return self.genotype[0] == self.genotype[2]
    def __str__(self):
        return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod)
# MAQ/IUB one-letter consensus code -> list of constituent bases
# (e.g. 'M' = A/C heterozygote); plain bases map to homozygous pairs.
MAQGenotypeEncoding = {
    'A' : ['A', 'A'],
    'C' : ['C', 'C'],
    'T' : ['T', 'T'],
    'G' : ['G', 'G'],
    "M" : ['A', 'C'],
    'K' : ['G', 'T'],
    'Y' : ['C', 'T'],
    'R' : ['A', 'G'],
    'W' : ['A', 'T'],
    'S' : ['C', 'G'],
    'D' : ['A', 'G', 'T'],
    'B' : ['C', 'G', 'T'],
    'H' : ['A', 'C', 'T'],
    'V' : ['A', 'C', 'G'],
    'N' : ['A', 'C', 'G', 'T'] }
# MAQ contig name ('chr1', 'chrM', ...) -> short Affy-style name ('1', 'MT', ...).
MAQ2STDChr = dict()
for i in range(1,23):
    MAQ2STDChr['chr'+str(i)] = str(i)
MAQ2STDChr['chrM'] = 'MT'
MAQ2STDChr['chrX'] = 'X'
MAQ2STDChr['chrY'] = 'Y'
def convertMAQChr(maqChr):
    """Translate a MAQ contig name ('chr1', 'chrX', ...) into the short
    Affy-style name ('1', 'X', ...); unknown contigs map to '?'."""
    return MAQ2STDChr.get(maqChr, '?')
def convertMAQGenotype( oneBaseCode ):
    # Expand MAQ's one-letter IUB consensus code (e.g. 'M') into its list of
    # underlying bases (e.g. ['A', 'C']); raises KeyError on unknown codes.
    return MAQGenotypeEncoding[oneBaseCode]
def internalReadSNPFile( parse1, filename ):
    """Shared reader: apply parse1 to each line of filename, keeping the
    truthy results (parse1 returns False for filtered-out lines), stopping
    after OPTIONS.debug_lines accepted records."""
    result = []
    snps_extracted = 0
    for snp in imap( parse1, open(filename) ):
        if snp:
            result.append(snp)
            snps_extracted += 1
        if snps_extracted > OPTIONS.debug_lines:
            break
    print len(result),"genotypes extracted"
    return result
def snpMAP(snps):
    """Index a list of genotype calls by their 'chr:pos' site string.

    If two calls share a site, the later one wins (same as the original's
    insertion loop)."""
    return dict((snp.site(), snp) for snp in snps)
def overlappingSites( snps1, snps2 ):
    # Return the set of 'chr:pos' sites present in both call lists,
    # printing summary counts along the way.
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    shared = set(map1.keys()) & set(map2.keys())
    print 'Number of snp1 records', len(map1)
    print 'Number of snp2 records', len(map2)
    print 'Number of shared sites', len(shared)
    # NOTE(review): dumps every snp1 record -- very noisy for large sets.
    print "\n".join(map(str,snps1))
    return shared
def readMAQSNPs(filename):
    """Parse MAQ cns2snp output into GenotypeCalls, dropping calls whose
    consensus quality is below 10 * OPTIONS.lod."""
    # Each line consists of:
    # chromosome
    # position
    # reference base
    # consensus base
    # Phred-like consensus quality
    # read depth
    # the average number of hits of reads covering this position
    # the highest mapping quality of the reads covering the position
    # the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total)
    # the second best call
    # log likelihood ratio of the second best and the third best call
    # and the third best call.
    #
    # Also, note that:
    #
    # What do those "S", "M" and so on mean in the cns2snp output?
    # They are IUB codes for heterozygotes. Briefly:
    #
    # M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T
    def read1(line):
        # Parse the first six columns; returns False for filtered lines.
        formats = [str, int, str, str, int, int]
        vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
        alignQual = vals[4]
        # MAQ quality is Phred-scaled, hence the factor of 10 vs. LOD.
        if alignQual >= (10*OPTIONS.lod):
            # A site is a SNP when consensus differs from the reference base.
            return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 )
        else:
            #print 'Filtering', alignQual, vals
            return False
    return internalReadSNPFile( read1, filename )
# Global option namespace -- presumably populated by an OptionParser in the
# script's entry point (not visible in this chunk); readers consult
# OPTIONS.lod, OPTIONS.debug_lines and OPTIONS.verbose.
OPTIONS = None
def MerlinChr(index):
    """Map Merlin's numeric chromosome index to a name: 0 -> 'MT',
    23 -> 'X', 24 -> 'Y', anything else to its decimal string."""
    special = {0: 'MT', 23: 'X', 24: 'Y'}
    if index in special:
        return special[index]
    return str(index)
def readMerlinSNPs(filename):
    """Parse Merlin caller output into GenotypeCalls, keeping only sites
    classified as SNPs with bestVsRef LOD >= OPTIONS.lod."""
    # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40
    # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20
    # chr:pos ref genotype bestVsRef bestVsNextBest class ...
    def read1(line):
        # Column 0 is "chrIndex:pos"; the genotype string is sorted so the
        # base order is canonical.  Returns False for filtered lines.
        formats = [lambda x: x.split(':'), str, sorted, float, float, str]
        vals = map( lambda f, x: f(x), formats, line.split()[0:6] )
        bestVsRef, bestVsNext = vals[3:5]
        # The class column carries a '-SNP' suffix for variant sites.
        isSNP = vals[5].find('-SNP') <> -1
        if bestVsRef >= OPTIONS.lod and isSNP:
            # Merlin positions are 0-based; +1 converts to 1-based.
            return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef )
        else:
            return False
    return internalReadSNPFile( read1, filename )
def readSNPfile(filename, format):
    """Dispatch to the reader for `format` ('merlin' or 'maq', any case) and
    return its calls as a list; raises Exception for unknown formats."""
    readers = {'merlin': readMerlinSNPs, 'maq': readMAQSNPs}
    key = format.lower()
    if key not in readers:
        raise Exception('Unknown SNP file format ' + format)
    return list(readers[key](filename))
def readAffyFile(filename):
    """Parse an Affy genotype table (header + one call per line) into a
    list of GenotypeCalls, capped at OPTIONS.debug_lines entries."""
    # chrom position genotype probe_set_id dbsnp_id
    # 1 84647761 TC SNP_A-1780419 rs6576700
    # 5 156323558 GG SNP_A-1780418 rs17054099
    def read1(line):
        formats = [str, int, sorted, str, str]
        vals = map( lambda f, x: f(x), formats, line.split() )
        try:
            # Numeric chromosome names pass through unchanged...
            chr = str(int(vals[0]))
        except:
            # ...otherwise assume a MAQ-style name like 'chrX'.
            chr = convertMAQChr(vals[0])
        #print 'CHR', chr, vals[0]
        # Affy calls are treated as truth: isSNP=False, LOD fixed at 100.
        return GenotypeCall( chr, vals[1], vals[2], False, 100 )
    file = open(filename)
    file.readline() # skip header
    #affyData = map( read1, file )
    affyData = []
    for index, line in enumerate(file):
        affyData.append(read1(line))
        if index > OPTIONS.debug_lines:
            break
        if index % 10000 == 0:
            # Progress indicator for large tables.
            print index
    # Give a chance to use list before creating dictionary
    return affyData
    #print "1111111"
    #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) )
def equalSNPs(snp1, snp2):
    """Two calls agree when their canonical genotype strings match."""
    left = snp1.genotype
    right = snp2.genotype
    return left == right
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']]
# counts = dict(states)
#
# def incShared(state, equalP ):
# counts[state] += 1
# if equalP:
# counts[state + '-equal'] += 1
#
# nTestSites = 0
# for i, testSNP in izip( count(), testSet ):
# if includeVector <> None and not includeVector[i]:
# # we are skiping this site
# continue
#
# nTestSites += 1
#
# if testSNP.isSNP():
# counts['test-snp'] += 1
#
# #print testSNP.site()
# if testSNP.site() in truthSet:
# truth = truthSet[testSNP.site()]
# eql = equalSNPs( testSNP, truth )
#
# incShared( 'shared', eql )
# if testSNP.isSNP():
# if truth.isHOM(): incShared( 'hom-snp', eql )
# else: incShared( 'het-snp', eql )
# else:
# if truth.isHOM(): incShared( 'hom-ref', eql )
# else: incShared( 'het-ref', eql )
#
# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, counts
#
# counts['tested'] = nTestedSites
#
# return counts
class ConcordanceData:
    """Tally of genotype agreement between two call sets at shared sites."""

    def __init__(self, name, file1count, file2count):
        self.name = name
        self.nFile1Sites = file1count   # num sites in file 1
        self.nFile2Sites = file2count   # num sites in file 2
        self.nSharedSites = 0           # pairs mapping to the same genome position
        self.nEqualSites = 0            # pairs with identical genotypes

    def inc(self, truthSNP, testSNP):
        """Record one site present in both sets."""
        self.nSharedSites += 1
        if equalSNPs(testSNP, truthSNP):
            self.nEqualSites += 1

    def rate(self):
        """Percent of shared sites with matching genotypes (0 when nothing
        is shared -- the max() guards against division by zero)."""
        denom = max(self.nSharedSites, 1)
        return (100.0 * self.nEqualSites) / denom

    def __str__(self):
        return '%d %d %.2f' % (self.nSharedSites, self.nEqualSites, self.rate())
def concordance( truthSet, testSet, sharedSites = None ):
    """Compare testSet calls against a truthSet dict (site -> call),
    returning (nTestSites, allData, sharedData, uniqueData)."""
    # calculates a bunch of useful stats about the two
    # data genotype call sets above
    #
    # The 2 calls in main work like this:
    # affy, snp1, snp1_snp2_shared
    # affy, snp2, snp1_snp2_shared
    nTestSites = 0
    # Now for each of the calls to concordance, we generate 3 sets:
    # - allData: all SNP1 sites that are also in Affy
    allData = ConcordanceData('all', len(truthSet), len(testSet))
    # - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy
    sharedData = ConcordanceData('shared', len(truthSet), len(testSet))
    # - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy
    uniqueData = ConcordanceData('unique', len(truthSet), len(testSet))
    for i, testSNP in izip( count(), testSet ):
        nTestSites += 1
        if testSNP.site() in truthSet:
            truthSNP = truthSet[testSNP.site()]
            allData.inc( truthSNP, testSNP )
            # When a shared-site set is supplied, split the tally into
            # shared vs unique sites as well.
            if sharedSites <> None:
                if testSNP.site() in sharedSites:
                    sharedData.inc( truthSNP, testSNP )
                else:
                    uniqueData.inc( truthSNP, testSNP )
        # Periodic progress line for long runs.
        if OPTIONS.verbose and nTestSites % 100000 == 0:
            #print nTestSites, nSharedSites, nEqualSites
            print nTestSites, allData, sharedData, uniqueData
    return nTestSites, allData, sharedData, uniqueData
# def concordance( truthSet, testSet, includeVector = None ):
# # calculates a bunch of useful stats about the two
# # data genotype call sets above
#
# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']]
# counts = dict(states)
#
# nTestSites = 0
# nSharedSites = 0
# nEqualSites = 0
# for i, testSNP in izip( count(), testSet ):
# nTestSites += 1
# #print testSNP.site()
# if testSNP.site() in truthSet:
# nSharedSites += 1
# if equalSNPs( testSNP, truthSet[testSNP.site()] ):
# nEqualSites += 1
# #else:
# # print '~', testSNP, truthSet[testSNP.site()]
# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0:
# #print nTestSites, nSharedSites, nEqualSites
# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites
#
# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites]
def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ):
nTestSites, allData, sharedData, uniqueData = results
def print1(data):
print '------------------------------------------------------------'
print 'Concordance results', data.name, 'sites'
print ' -> Genotype file1', filename1
print ' -> Genotype file2', filename2
print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites)
print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites)
print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites))
print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites))
print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites)
print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate())
print1(allData)
if hasSharedSites:
print1( sharedData ) # shared between SNP1 and SNP2
print1( uniqueData ) # unique to SNP1 or to SNP2 only
def dump_shared_snps( affys, snp_list1, snp_list2 ):
print len(affys), len(snp_list1), len(snp_list2)
snps1 = snpMAP(snp_list1)
snps2 = snpMAP(snp_list2)
snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites)
snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites)
affy_sites = set(affys.keys()); print "Affys:",len(affy_sites)
snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites
snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites
print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites)
print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites)
fsnp = open ("snp.tab","w")
print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase"
for site in snp1and2_affy_sites:
snp1 = snps1[site]
snp2 = snps2[site]
affy = affys[site]
print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod),
try:
snp1div2 = snp1.lod / snp2.lod
except ZeroDivisionError:
snp1div2 = 1000
print >>fsnp, "%5.2f" % snp1div2,
print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype,
print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)),
# Calculte meta_lod from the two lods
if snp1.genotype == snp2.genotype:
meta_lod = snp1.lod + snp2.lod
meta_genotype = snp1.genotype
else:
if snp1.lod > snp2.lod:
meta_lod = snp1.lod
meta_genotype = snp1.genotype
else:
meta_lod = snp2.lod
meta_genotype = snp2.genotype
meta_inc = meta_genotype != affy.genotype
print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc),
print >>fsnp, affy.ref_het_hom(),
print >>fsnp, affy.refbase()
def intersection_union_snps( affy, snps1, snps2 ):
    """Interactively plot (via rpy2) the LODs of sites shared by both call
    sets, colored by which caller(s) agree with the Affy truth genotype.

    NOTE(review): the first 'plots' assignment below is immediately
    overwritten, and everything after the bare 'return' is unreachable --
    both look like leftover experiment toggles and are preserved as-is.
    """
    map1 = snpMAP(snps1)
    map2 = snpMAP(snps2)
    # Sites called by both callers but absent from the Affy truth set.
    shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys())
    nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites]
    # Sites called by both callers AND genotyped on the Affy chip.
    shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys())
    #shared = []
    #for site in shared_sites:
    #    shared.append((map1[site], map2[site]))
    #print "Shared:",len( shared )
    #shared_x = [s[0].lod for s in shared]
    #shared_y = [s[1].lod for s in shared]
    both_corr = []
    snp1_corr = []
    snp2_corr = []
    neither_corr = []
    # given two bools telling whether snp1 and snp2 are correct,
    # return the correct object
    #
    #              snp2 incorrect, snp2 correct
    truth_list = [[neither_corr, snp2_corr],  # snp1 incorrect
                  [snp1_corr, both_corr]]     # snp1 correct
    for site in shared_affy_sites:
        snp1_true = equalSNPs(map1[site], affy[site])
        snp2_true = equalSNPs(map2[site], affy[site])
        # The bools index the 2x2 truth table above (False=0, True=1).
        truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) )
    print "Beginning plot..."
    import rpy2.robjects as robj
    robj.r('X11(width=15, height=15)')
    XY_MAX = 25
    plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue"))
    robj.r.plot([0, XY_MAX], [0, XY_MAX], \
                xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \
                ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \
                main = "Shared SNP site LODs", \
                xlim = robj.FloatVector([0,XY_MAX]), \
                ylim = robj.FloatVector([0,min(XY_MAX,25)]), \
                pch = 19, \
                cex = 0.0)
    # Overlay one point cloud per agreement category.
    for xy, color in plots:
        print "X"
        robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3)
        print "Color:",color
        print "Len:",len(xy)
        #print "\n".join(["%25s %25s" % pt for pt in xy])
    #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared])
    #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3)
    raw_input("Press enter to continue")
    return
    # ---- unreachable below this point: older text-only summary ----
    ss1 = set(snps1)
    ss2 = set(snps2)
    print "Shared:",len( ss1.intersection(ss2) )
    print "Snp1 only:",len( ss1.difference(ss2) )
    print "Snp2 only:",len( ss2.difference(ss1) )
    print "Snp1 total:",len( ss1 )
    print "Snp2 total:",len( ss2 )
def count_het_sites(snp_list):
hets = 0
for snp in snp_list:
if snp.isHET():
hets += 1
print hets,"hets,",len(snp_list),"total,",
print "%.1f" % (float(hets)/len(snp_list)*100)
def main(argv):
global OPTIONS, ROOT
usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2"
parser = OptionParser(usage=usage)
parser.add_option("-t", "--truth", dest="affy",
type="string", default=None,
help="Affy truth file")
parser.add_option("-1", "--snp1", dest="snp1",
type="string", default=None,
help="Affy truth file")
parser.add_option("-2", "--snp2", dest="snp2",
type="string", default=None,
help="Affy truth file")
parser.add_option("", "--f1", dest="format1",
type="string", default=None,
help="File type of snpfile1")
parser.add_option("", "--f2", dest="format2",
type="string", default=None,
help="File type of snpfile2")
parser.add_option("-l", "--lod", dest="lod",
type="float", default=5.0,
help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ")
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="Verbose output")
parser.add_option("-d", "--debug_lines", dest="debug_lines",
type='float', default=sys.maxint,
help="Number of input data lines to process for debugging")
(OPTIONS, args) = parser.parse_args()
if len(args) != 0:
parser.error("incorrect number of arguments")
if OPTIONS.affy == None:
parser.error("No affy data specified")
if OPTIONS.format1 == None:
parser.error("First format cannot be none")
if OPTIONS.snp2 <> None and OPTIONS.format2 == None:
parser.error("snp2 file given but format was not specified")
# Load reference genome
#ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
#sys.exit()
if 1:
print "Reading Affy truth data..."
#readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] )
readAffyFile2 = time_func(readAffyFile)
affy_list = readAffyFile2( filename=OPTIONS.affy )
#count_het_sites(affy_list)
#sys.exit()
affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) )
print 'Read affy truth data:'
print ' -> number of genotyped loci', len(affy)
#readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] )
readSNPfile2 = time_func(readSNPfile)
print "Reading SNPs 1 file..."
snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 )
if OPTIONS.snp2 <> None:
print "Reading SNPs 2 file..."
snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 )
dump_shared_snps( affy, snps1, snps2 )
#intersection_union_snps( affy, snps1, snps2 )
#sys.exit()
sharedSites = None
if OPTIONS.snp2 <> None:
sharedSites = overlappingSites( snps1, snps2 )
results1 = concordance( affy, snps1, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True )
results2 = concordance( affy, snps2, sharedSites )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True )
else:
results = concordance( affy, snps1 )
printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites )
# Script entry point.  main() ignores its argv argument and re-parses the
# command line itself via optparse.
if __name__ == "__main__":
    main(sys.argv)

View File

@ -1,81 +0,0 @@
#!/usr/bin/env python
# Creates summary stats from a concordance file - SNPs called and number in dbSNP - and will output
import sys, re, os, farm_commands
# The GATK/Sting build location comes from the environment; bail out early
# with a readable message instead of failing later on a bad jar path.
sting_dir = os.environ.get("STING_DIR")
if sting_dir == None:
    print "You need to set the environment variable STING_DIR to point to your Sting/GATK build"
    sys.exit()
# Exactly one positional argument (the concordance VCF) is required.
if len(sys.argv) < 2:
    print "Usage: PROG CONCORDANCE_FILE"
    sys.exit()
class callset_data:
    """Running tally for one Venn call set: total SNPs, dbSNP hits, Ti/Tv."""
    def __init__(self, snps, in_dbSNP, titv):
        self.snps = snps
        self.in_dbSNP = in_dbSNP
        self.titv = titv
    def inc_snps(self):
        # One more SNP observed in this call set.
        self.snps = self.snps + 1
    def inc_dbsnps(self):
        # One more SNP carrying a dbSNP rsID.
        self.in_dbSNP = self.in_dbSNP + 1
    def __str__(self):
        template = "SNPs: %10d in_dbSNP: %d TiTv: %d"
        return template % (self.snps, self.in_dbSNP, self.titv)
sets = dict()
concordance_filename = os.path.abspath(sys.argv[1])
for line in file(concordance_filename):
fields = line.split("\t")
filter = 0
if not re.search('HybridSelectionVariantFilter', line):
match = re.search('Venn=(\S*)', line) ### TODO: Only works in 2-way venn with ";", remove ";" for N-way Venn - should be changed to work for both automatically
if match:
my_set = match.groups()[0]
if sets.has_key(my_set):
sets[my_set].inc_snps()
if fields[2] != ".":
sets[my_set].inc_dbsnps()
else:
sets[my_set] = callset_data(1, 0, 0)
print "\n".join(["%40s %s" % (k,str(v)) for k,v in sets.items()])
# Second pass: split the concordance VCF into one VCF per Venn set and queue
# a VariantEval run on each split (skipped when the .varianteval file exists).
print "Splitting concordance file\n"
splits_dir = concordance_filename+".splits"
for vennSet in sets.keys():
    print vennSet
    output_head = splits_dir+"_"+vennSet
    vcf_file = splits_dir+"_"+vennSet+".vcf"
    varianteval_file = splits_dir+"_"+vennSet+".varianteval"
    if not os.path.exists(varianteval_file):
        # Copy header lines plus the records annotated with this Venn set.
        fout = open(vcf_file, "w")
        for line in file(concordance_filename):
            if line.startswith("#"):
                fout.write(line)
            else:
                match = re.search('Venn='+vennSet, line)
                if match:
                    fout.write(line)
        # NOTE(review): varianteval_out is assigned but never used below, and
        # fout is never explicitly closed.
        varianteval_out = os.path.basename(concordance_filename)+".vcf"
        # NOTE(review): sting_dir is passed straight after -jar; presumably
        # STING_DIR points at the GATK jar itself -- confirm.
        farm_commands.cmd("java -Xmx4096m -jar "+sting_dir+" -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta "
                          +"-T VariantEval "
                          +"-B eval,VCF,"+vcf_file+" "
                          +"-o "+varianteval_file+" "
                          +"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod",
                          "gsa")

View File

@ -1,80 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from gatkConfigParser import *
import glob
if __name__ == "__main__":
    # Command-line driver: recalibrate base qualities for one BAM via the GATK
    # CountCovariates -> TableRecalibration pipeline, dispatching each step to
    # the farm with LSF job-dependency chaining.
    usage = """usage: %prog [-c config.cfg]* input.bam output.bam"""
    parser = OptionParser(usage=usage)
    parser.add_option("-A", "--args", dest="args",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-m", "--mode", dest="RecalMode",
                      type="string", default="SEQUENTIAL",
                      help="Mode argument to provide to table recalibrator")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-d", "--dir", dest="scratchDir",
                      type="string", default="./",
                      help="Output directory")
    # NOTE(review): "providedm" below is a typo in the user-facing help text.
    parser.add_option("-w", "--wait", dest="initialWaitID",
                      type="string", default=None,
                      help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments")
    config = gatkConfigParser(OPTIONS.configs)
    inputBAM = args[0]
    outputBAM = args[1]
    # Derived names: <scratch>/<output-root>.{init,recal}.recal_data.csv
    rootname = os.path.split(os.path.splitext(outputBAM)[0])[1]
    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'
    initDataFile = covariateInitial + '.recal_data.csv'
    covariateRecal = covariateRoot + '.recal'
    recalDataFile = covariateRecal + '.recal_data.csv'
    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)
    def covariateCmd(bam, outputDir):
        # Build the GATK CountCovariates command line for one BAM.
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add
    def recalibrateCmd(inputBAM, dataFile, outputBAM):
        # Build the GATK TableRecalibration command line.
        return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
    def runCovariateCmd(inputBAM, dataFile, dir, jobid):
        # Dispatch CountCovariates unless its output already exists; returns
        # the farm job id (or None when skipped).
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
            cmd = covariateCmd(inputBAM, dir)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
    #
    # Actually do some work here
    #
    jobid = OPTIONS.initialWaitID
    if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile):
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
    if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
        cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
        jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
    # Final pass: covariates on the recalibrated BAM, for before/after comparison.
    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)

View File

@ -1,24 +0,0 @@
import os.path
from optparse import OptionParser
def main():
global OPTIONS
usage = "usage: %prog [options] files"
parser = OptionParser(usage=usage)
parser.add_option("-c", "--category", dest="category",
type='string', default="Anonymous",
help="If provided, catagory name")
(OPTIONS, args) = parser.parse_args()
if len(args) == 0:
parser.error("incorrect number of arguments")
print '<Category name="%s">' % OPTIONS.category
for arg in args:
path = os.path.abspath(arg)
name = os.path.basename(arg)
print ' <Resource name="%s" path="%s" serverURL="http://www.broad.mit.edu/webservices/igv"/>' % (name, path)
print '</Category>'
# Script entry point.
if __name__ == "__main__":
    main()

View File

@ -1,107 +0,0 @@
import os.path
import sys
from optparse import OptionParser
from vcfReader import *
from itertools import *
import faiReader
def main():
global OPTIONS
usage = "usage: %prog [options] file1 ... fileN"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--f", dest="fai",
type='string', default=None,
help="FAI file defining the sort order of the VCF")
parser.add_option("-o", "--o", dest="output",
type='string', default=None,
help="if provided, output will go here instead of stdout")
parser.add_option("-a", "--assumeSorted", dest="assumeSorted",
action='store_true', default=False,
help="If provided, this assumes the input VCF files are themselves sorted, enabling a simple efficent merge")
parser.add_option("-v", "--verbose", dest="verbose",
action='store_true', default=False,
help="If provided, verbose progress will be enabled")
(OPTIONS, args) = parser.parse_args()
if len(args) == 0:
parser.error("Requires at least 1 VCF to merge")
order = None
if OPTIONS.fai <> None:
if OPTIONS.verbose: print 'reading FAI', OPTIONS.fai
order = faiReader.readFAIContigOrdering(OPTIONS.fai)
#print 'Order', order
if OPTIONS.output != None:
out = open(OPTIONS.output,'w')
else:
out = sys.stdout
if OPTIONS.assumeSorted:
mergeSort(out, args, order)
else:
memSort(out, args, order)
def cmpVCFRecords(order, r1, r2):
    """cmp()-style comparator for two VCF records: order by contig rank (when
    an FAI ordering dict is supplied), then by position.

    Returns a negative, zero, or positive int.  Fix: the deprecated '<>'
    operator is replaced with 'is not None' / '!='.
    """
    if order is not None:
        c1 = order[str(r1.getChrom())]
        c2 = order[str(r2.getChrom())]
        orderCmp = cmp(c1, c2)
        if orderCmp != 0:
            return orderCmp
    # Same contig (or no explicit contig ordering): coordinate order decides.
    return cmp(r1.getPos(), r2.getPos())
def mergeSort(out, args, order):
    """Concatenate pre-sorted VCF files to 'out' in genomic order.

    Reads only the FIRST record of each input to decide the file order, then
    streams each whole file in turn -- assumes every input is itself sorted
    and that the files do not interleave coordinates.
    """
    #print 'MergeSort', args, order
    header = None
    orderMap = []
    for file in args:
        #print file
        openedFile = open(file)
        # Grab the header and first record only, then stop reading this file.
        for header, record, counter in lines2VCF(openedFile, extendedOutput = True, decodeAll = False):
            orderMap.append([record, file])
            break
        openedFile.close()
    #print orderMap
    # Sort the files by the position of their first record.
    sortedOrderMap = sorted(orderMap, key=lambda x: x[0], cmp = lambda r1, r2: cmpVCFRecords(order, r1, r2))
    #print sortedOrderMap
    # NOTE(review): 'header' is whichever file the loop above read LAST; this
    # assumes all inputs carry an equivalent header -- confirm.
    for headerLine in header: print >> out, headerLine
    i = 0
    n = len(sortedOrderMap)
    for file in map( lambda x: x[1], sortedOrderMap):
        if OPTIONS.verbose:
            i += 1
            print 'Processing', file, ':', i, 'of', n
        for record in lines2VCF(open(file), extendedOutput = False, decodeAll = False):
            print >> out, record.format()
def memSort(out, args, order):
    """Load every record from every input VCF, sort them in memory, and write
    the merged VCF to 'out'.

    Bug fix: 'out' is now a parameter.  The function was previously defined as
    memSort(args, order) but called as memSort(out, args, order) from main(),
    and its body referenced an undefined name 'out' (NameError at runtime).
    """
    header = None
    records = []
    for file in args:
        #print file
        for header, record, counter in lines2VCF(open(file), extendedOutput = True, decodeAll = False):
            records.append(record)
    # In-place comparator sort using the shared contig/position ordering.
    records.sort(lambda r1, r2: cmpVCFRecords(order, r1, r2))
    for line in formatVCF(header, records):
        print >> out, line
# Flip to True to run under cProfile and print the hottest call sites.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()

View File

@ -1,332 +0,0 @@
import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import faiReader
import math
import shutil
# Shared GATK invocation prefixes (b36 reference); GATK picks which build
# (stable vs dev) the rest of the pipeline uses.
GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta '
GATK = GATK_STABLE
class CallTarget:
    """One sample (or trio) the pipeline calls SNPs for, plus its settings.

    name            -- sample/callset name, used to derive file names
    hetero          -- heterozygosity prior passed to the caller
    minQ            -- minimum call confidence
    depth           -- depth cutoff used in the GATK_STANDARD filter
    truthGFFName    -- HapMap3 truth GFF basename (defaults to name)
    variantEvalArgs -- extra VariantEval args (e.g. --sampleName)
    otherVCF        -- external call set to union with; when given, unionVCF
                       holds the derived union file name used by later stages

    Fix: the deprecated '<>' operator and '!= None' are replaced with
    identity tests against None.
    """
    def __init__(self, name, hetero, minQ = 50, depth = 120, truthGFFName = None, variantEvalArgs = '', otherVCF = None):
        self.name = name
        self.hetero = hetero
        self.minQ = minQ
        self.depth = depth
        if truthGFFName is not None:
            self.truthGFFName = truthGFFName
        else:
            self.truthGFFName = name
        self.variantEvalArgs = variantEvalArgs
        self.otherVCF = otherVCF
        self.unionVCF = None
        if otherVCF is not None:
            self.unionVCF = self.name + '.gatk.glftrio.union.filtered.vcf'
        # Filled in later by the pipeline stages.
        self.listFile = None
        self.releaseVCF = None
# Expected heterozygosity priors per population (CEU vs YRI).
CEU_HET = 0.79e-3
YRI_HET = 1.0e-3
# Every call target the pipeline knows about: single samples plus the two
# trios (called jointly, evaluated on the child sample via --sampleName).
targets = [
    CallTarget('NA12878', CEU_HET),
    CallTarget('NA12891', CEU_HET),
    CallTarget('NA12892', CEU_HET),
    CallTarget('NA19238', YRI_HET),
    CallTarget('NA19239', YRI_HET),
    CallTarget('NA19240', YRI_HET),
    CallTarget('ceu.trio', CEU_HET, 50, 360, 'NA12878', '--sampleName NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf'),
    CallTarget('yri.trio', YRI_HET, 50, 360, 'NA19240', '--sampleName NA19240', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/YRI_1kg_pilot2.vcf'),
    # CallTarget('NA19240.alltechs.solid_original', YRI_HET, 50, 120, 'NA19240'),
    # CallTarget('NA12878.alltechs.solid_original', CEU_HET, 50, 120, 'NA12878'),
    CallTarget('NA19240.alltechs.solid_recal', YRI_HET, 50, 120, 'NA19240'),
    CallTarget('NA12878.alltechs.solid_recal', CEU_HET, 50, 120, 'NA12878'),
]
# Venn subsets of the GATK/glfTrio union evaluated separately; list entries
# pair a display name with a selector pattern.
# NOTE(review): 'Intersection' is listed twice -- likely unintended.
sets = ['Intersection', 'filteredInBoth', 'gatk', 'gatk-filteredInOther', 'glftrio', 'glftrio-filteredInOther', 'Intersection', ['gatk-unique', 'gatk.*'], ['glftrio-unique', 'glftrio.*']]
def main():
    """Drive the per-sample SNP calling pipeline.

    The single positional argument is a comma-separated list of stage names
    (CALL, MERGE, FILTER, UNION, SELECT_INTERSECT, EVAL, RELEASE, CLEAN).
    For each target the requested stages are chained via farm-job
    dependencies and finally submitted in one batch.
    """
    global OPTIONS
    usage = "usage: %prog stage [options]"
    parser = OptionParser(usage=usage)
    # parser.add_option("-q", "--farm", dest="farmQueue",
    #                   type="string", default=None,
    #                   help="Farm queue to send processing jobs to")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-s", "--sample", dest="useSample",
                      type='string', default=None,
                      help="If provided, only run pipeline for this sample")
    parser.add_option("-c", "--minQ", dest="minQ",
                      type='float', default=None,
                      help="If provided, will actually use this Q threshold for calls")
    parser.add_option("-d", "--dir", dest="dir",
                      type='string', default="",
                      help="If provided, this is the root where files are read and written")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    stages = args[0].split(",")
    allJobs = []
    for callTarget in targets:
        # Run every target, or just the one named by --sample.
        if callTarget.name == OPTIONS.useSample or OPTIONS.useSample == None:
            lastJobs = None
            if OPTIONS.minQ != None:
                callTarget.minQ = OPTIONS.minQ
            for stage in stages:
                print 'STAGE', stage
                target = callTarget.name
                callTarget.listFile = os.path.join("lists", target + ".list")
                # Derived file names for this target's call set.
                unfilteredVCFBaseName = target + '.gatk.ug.vcf'
                unfilteredVCF = os.path.join(OPTIONS.dir, unfilteredVCFBaseName)
                filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf')
                tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCFBaseName + ".scatter")
                if not os.path.exists(tmpdir):
                    os.makedirs(tmpdir)
                if callTarget.unionVCF != None:
                    callTarget.releaseVCF = os.path.join(OPTIONS.dir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")
                print 'Heading into stage', stage, lastJobs
                newJobs = []
                if stage == 'CALL':
                    newJobs = callSNPs(target, callTarget, callTarget.listFile, tmpdir)
                if stage == 'MERGE':
                    newJobs = mergeSNPs(target, lastJobs, unfilteredVCF, tmpdir)
                if stage == 'FILTER':
                    newJobs = filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)
                if stage == 'UNION':
                    newJobs = unionSNPs(callTarget, lastJobs, filteredVCF )
                if stage == 'SELECT_INTERSECT':
                    if ( callTarget.unionVCF != None ):
                        # we are one of the release targets
                        newJobs = finalize1KGRelease(callTarget, lastJobs, os.path.join(OPTIONS.dir, callTarget.unionVCF ), callTarget.releaseVCF)
                if stage == 'EVAL':
                    newJobs = evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF)
                if stage == 'RELEASE':
                    dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2'
                    subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y")
                    releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF )
                if stage == 'CLEAN':
                    shutil.rmtree("intermediates", True)
                    for file in [filteredVCF, unfilteredVCF]:
                        if os.path.exists(file): os.remove(file)
                print 'New jobs'
                for job in newJobs:
                    print ' ', job
                allJobs.append(newJobs)
                # Chain the next stage onto this stage's jobs, if any were made.
                if newJobs != []:
                    lastJobs = newJobs
    print 'EXECUTING JOBS'
    farm_commands.executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry)
# Autosomes plus X in hg18 ('chr1'..'chrX') and b36 ('1'..'X') naming.
hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX']
b36 = [str(i) for i in range(1,23)] + ['X']
def autosomePlusX(name):
    """True when name is an autosome (1-22) or X, in either b36 or hg18 naming."""
    for contigs in (b36, hg18):
        if name in contigs:
            return True
    return False
def partition(faiRecords, maybeChrom, n):
    """Yield (outputVCF, chrom, chrStart, chrEnd, -L arg) bins splitting each
    autosome/X contig from an FAI index into pieces.

    faiRecords -- parsed FAI rows, e.g. ['1', '247249719', '3', '60', '61']
    maybeChrom -- if not None, restrict to this single contig
    n          -- requested split factor per contig

    Fix: the deprecated '<>' operator is replaced with identity/'!=' tests.
    NOTE(review): for n > 1 the loop 'range(1, n)' emits only n-1 bins (a
    single whole-chromosome bin when n == 2); this looks like an off-by-one
    but is preserved as-is so pipeline output is unchanged -- confirm intent.
    """
    # Example FAI rows (name, length, offset, line-bases, line-width):
    # 1       247249719       3         60      61
    # 2       242951149       251370554 60      61
    chromAndSize = [[x[0], int(x[1])] for x in faiRecords if autosomePlusX(x[0])]
    #print chromAndSize
    if maybeChrom is not None:
        chromAndSize = filter(lambda x: x[0] == maybeChrom, chromAndSize)
    def r(chrom, chrStart, chrEnd):
        # Package one bin as (vcf name, coords, GATK -L interval argument).
        outputVCF = 'calls.chr%s.good.%s.vcf' % (chrom, chrStart)
        L = '-L %s:%d-%d' % (chrom, chrStart, chrEnd)
        return outputVCF, chrom, chrStart, chrEnd, L
    for chrom, size in chromAndSize:
        #print 'SIZE', chrom, size
        if n == 1:
            yield r(chrom, 1, size)
        else:
            idealBp = float(size-1) / n
            chrEnd = None
            chrStart = 1
            for i in range(1, n):
                chrEnd = 1 + int(round(idealBp * (i + 1)))
                #print chrom, chrStart, chrEnd, idealBp
                result = r(chrom, chrStart, chrEnd)
                chrStart = chrEnd + 1
                yield result
            # Sanity check: the last bin must end exactly at the contig end.
            if chrEnd != size:
                raise Exception('X')
def callSNPs(target, callTarget, listFile, tmpdir):
    """Queue scatter jobs running the UnifiedGenotyper -- one farm job per
    (contig, interval) bin produced by partition()."""
    # Caller settings: mapping/base-quality floors, SOLiD platform, and the
    # target-specific heterozygosity prior.
    extras = "-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e" % (callTarget.hetero)
    fai = "/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai"
    NWaysParallel = 5
    bins = partition(faiReader.readFAI(fai), None, NWaysParallel)
    jobs = list()
    for outputVCF, chrom, chrStart, chrEnd, L in bins:
        outputVCF = os.path.join(tmpdir, outputVCF)
        #print outputVCF, L, os.path.exists(outputVCF)
        #if not os.path.exists(outputVCF):
        print 'Enqueuing job for', outputVCF, L
        cmd = 'java -Xmx2048m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar ' + \
              '-T UnifiedGenotyper -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -mrl 500000 ' + \
              '-I %s -confidence %d %s -varout %s -vf VCF -L %s -gm JOINT_ESTIMATE' % (listFile, callTarget.minQ, extras, outputVCF, L)
        #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
        jobs.append(farm_commands.FarmJob(cmd, jobName = "CALL_%s_c%s_s%s" % (target, str(chrom), str(chrStart))))
    return jobs
def mergeSNPs(target, lastJobs, snpFile, tmpdir):
    """Queue one job merging the scattered per-interval VCFs into snpFile."""
    #print 'LastJobs = ', lastJobs
    template = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai %s/*.vcf > %s"
    mergeJob = farm_commands.FarmJob(template % (tmpdir, snpFile),
                                     jobName = "MERGE_%s" % (target),
                                     dependencies = lastJobs)
    return [mergeJob]
def filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
    """Queue a GATK VariantFiltration job applying the standard and
    hard-to-validate filter expressions to unfilteredVCF."""
    target = callTarget.name
    #expression = "AB > 0.75 || DP > %s" % depth
    # Two named filters: the standard hard filter (depth cutoff comes from
    # the target) and the MQ0-fraction 'hard to validate' filter.
    expression1 = ['GATK_STANDARD', "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % callTarget.depth]
    expression2 = ['HARD_TO_VALIDATE', "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)"]
    cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s ' % (unfilteredVCF, filteredVCF)
    for name, exp in [expression1, expression2]:
        cmd += '--filterName %s --filterExpression "%s" ' % ( name, exp )
    #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
    return [farm_commands.FarmJob(cmd, jobName = "FILTER_%s" % (target), dependencies = lastJobs)]
def evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF):
    """Queue VariantEval jobs for every flavor of this target's calls:
    filtered, unfiltered, the glfTrio union (plus each Venn subset), and the
    final release VCF when present."""
    evalRoot = os.path.join(OPTIONS.dir, "eval")
    if not os.path.exists(evalRoot):
        os.makedirs(evalRoot)
    def eval1(vcfFullPath, namePostfix = "", args = ""):
        # Build one VariantEval job for vcfFullPath.  Comparison tracks are
        # dbSNP 129 plus the 1kg pilot-1 CEU/YRI call sets, restricted to the
        # autosomes + X; a HapMap3 truth GFF is added when one exists on disk.
        vcf = os.path.split(vcfFullPath)[1]
        out = os.path.join(OPTIONS.dir, "eval", vcf + namePostfix + ".eval")
        cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcfFullPath, out, '\;'.join(map(str, b36)) )
        hapmap3 = os.path.join("../../hapmap3Genotypes", callTarget.truthGFFName + ".b36.gff")
        if os.path.exists(hapmap3):
            cmd += " -B hapmap-chip,GFF,%s %s %s" % (hapmap3, callTarget.variantEvalArgs, args)
        #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
        return farm_commands.FarmJob(cmd, jobName = "EVAL_%s_%s" % (callTarget.name, namePostfix), dependencies = lastJobs)
    jobs = []
    jobs.append(eval1(filteredVCF))
    jobs.append(eval1(unfilteredVCF))
    if callTarget.unionVCF != None:
        jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), ".union"))
        # One extra evaluation per Venn subset of the union ('set' INFO key);
        # list-valued entries pair a display name with a selector pattern.
        for set in sets:
            if type(set) == list:
                name, selector = set
            else:
                name, selector = set, set
            jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), "." + name, " -vcfInfoSelector set=\"" + selector + "\""))
    if callTarget.releaseVCF != None:
        jobs.append(eval1(callTarget.releaseVCF, ".release"))
    return jobs
def unionSNPs(callTarget, lastJobs, filteredVCF ):
    """Queue a GATK VCFCombine job that unions our filtered calls with the
    target's external glfTrio call set.

    Returns [] when the target has no otherVCF configured.  Fix: '!= None'
    replaced with 'is not None'.
    """
    if callTarget.otherVCF is not None:
        cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, callTarget.otherVCF, os.path.join(OPTIONS.dir, callTarget.unionVCF) )
        return [farm_commands.FarmJob(cmd, jobName = "UNION_%s" % (callTarget.name), dependencies = lastJobs)]
    else:
        return []
    # jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry)
def finalize1KGRelease(callTarget, lastJobs, inputVCF, outputVCF):
    """Queue a VCFSelect job extracting the GATK/glfTrio intersection records
    (set == 'Intersection' or 'filteredInBoth') into the release VCF.

    The downstream annotation/filtering steps are currently disabled
    (commented out below).
    """
    commands = []
    lastJob = lastJobs
    # subdir = os.path.join(OPTIONS.dir, "1kg_release")
    # if not os.path.exists(subdir):
    #     os.makedirs(subdir)
    # first, filter out the intersection
    cmd = GATK + '-T VCFSelect -B variant,VCF,%s -o %s -match "(set eq \'Intersection\' || set eq \'filteredInBoth\')" ' % (inputVCF, outputVCF)
    lastJob = farm_commands.FarmJob(cmd, jobName = "SELECT_%s" % (callTarget.name), dependencies = lastJob)
    commands.append(lastJob)
    # call variant annotator to annotate all of the calls
    # annotatedVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.vcf")
    # cmd = GATK + '-T VariantAnnotator -I %s -standard -B variant,VCF,%s -vcf %s -L 1:1-1,000,000 ' % (callTarget.listFile, intersectVCF, annotatedVCF)
    # lastJob = farm_commands.FarmJob(cmd, jobName = "ANNOTATE_%s" % (callTarget.name), dependencies = lastJob)
    # commands.append(lastJob)
    #
    # # filter the calls
    # filteredVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf")
    # commands = commands + filterSNPs(callTarget, lastJob, annotatedVCF, filteredVCF)
    #
    return commands
def releaseSNPs(dir, callTarget, filteredVCF ):
if not os.path.exists(dir):
os.makedirs(dir)
print 'Copying files into ', dir, filteredVCF, callTarget.unionVCF, callTarget.releaseVCF
if not OPTIONS.dry: shutil.copy(filteredVCF, dir)
if callTarget.unionVCF != None:
if not OPTIONS.dry: shutil.copy(os.path.join(OPTIONS.dir, callTarget.unionVCF), dir)
if callTarget.releaseVCF != None:
if not OPTIONS.dry: shutil.copy(callTarget.releaseVCF, dir)
# Script entry point.
if __name__ == "__main__":
    main()
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A
# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered
# if ( $1 == 3.1 ) then
# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf
# endif
#
# if ( $1 == 4 ) then
# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only )
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf
# end
# endif
#
# if ( $1 == 5 ) then
# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710
#
# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf )
# echo $file
# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710
# end
#
# endif

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python
import string
class align_block:
    """One alignment block from a qltout QUERY line: indel count, block
    length, and mismatch count (all coerced to int)."""
    def __init__(self, indel_count, length, mismatches):
        self.indel_count = int(indel_count)
        self.length = int(length)
        self.mismatches = int(mismatches)
    def __str__(self):
        # str.join replaces the deprecated string.join module function.
        return ' '.join( map( str, [self.indel_count, self.length, self.mismatches]) )
class qltout_record:
    """One QUERY alignment record from an Arachne qltout file.

    The first 11 whitespace-delimited columns are fixed typed fields; the
    remainder of the line is consumed three columns at a time as
    align_block entries.  (Python 2 only: two-sequence map, string.join.)
    """

    def __init__(self, obj):
        # Only construction from a raw qltout line (str) is supported.
        self.fields = []        # the 11 fixed leading columns
        self.align_blocks = []  # trailing align_block triplets
        if type(obj) == str:
            self.setValuesFromString( obj );
        else:
            raise TypeError("qltout_record did not recognize type: "+str(type(obj)))

    def setValuesFromString( self, line ):
        """Parse the 11 typed leading columns, then align-block triplets."""
        # column types: the QUERY tag (str) followed by ten ints
        formats = [str, int, int, int, int, int, int, int, int, int, int]
        split_line = line.split()
        # Python 2 two-sequence map: applies formats[i] to split_line[i]
        self.fields = map( lambda f, v: f(v), formats, split_line[:11] )
        next_align_block = 11
        while next_align_block < len(split_line):
            self.align_blocks.append( align_block(*split_line[next_align_block:next_align_block+3]) )
            next_align_block += 3

    def indelled_bases(self):
        """Total indel count summed over all align blocks."""
        return sum([ab.indel_count for ab in self.align_blocks])

    def __str__(self):
        return string.join( map( str, self.fields ), ' ')+' '+string.join( map( str, self.align_blocks ), ' ')

    # Accessors into the fixed columns; offsets are 0-based, pos is 1-based.
    def id(self): return self.fields[1]
    def contig(self): return self.fields[6]
    def offset(self): return self.fields[7]
    def pos(self): return self.fields[7]+1
    def offset_end(self): return self.fields[8]
    def pos_end(self): return self.fields[8]+1
    def linear_start(self): return self.fields[9]
    def map_qual(self): return 100 # This only exists in maq files
    # for ILT and Merlin, we assume the same confidence for all mappings
class qltout_file:
    """Iterable wrapper around a qltout alignment file; iterating yields one
    qltout_record per QUERY line, skipping all other lines."""

    def __init__(self, filename):
        self.filename = filename
        self.fqlt = open(self.filename)

    def RecordGenerator(self):
        """Generate a qltout_record for each QUERY line of the file."""
        for line in self.fqlt:
            if not line.startswith("QUERY"):
                continue
            yield qltout_record(line)
        # FIX: the original ended with 'raise StopIteration', which inside a
        # generator becomes a RuntimeError under PEP 479 (Python 3.7+).  A
        # bare return ends the generator identically on Python 2 as well.
        return

    def __iter__(self):
        return self.RecordGenerator()

View File

@ -1,20 +0,0 @@
#!/bin/tcsh
# Simulate reads against a small reference, then run the basic samtools
# import -> sort -> index -> pileup cycle on the resulting SAM file.
setenv refroot ~/work/refmut/
setenv refname test2
setenv ref "${refroot}/${refname}"
# simulation parameters: coverage depth and number of mutated sites
setenv cov 100
setenv sites 1
setenv x '' # '-x 6696745:6708910'
# testing samtools
./SimulateReads.py --ref ${ref}.fasta --coverage ${cov} -n ${sites} -l 10 -t None ${x}
# running samtools
samtools import ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.ref ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.sam samtest.bam
samtools sort samtest.bam samtestSorted
samtools index samtestSorted.bam
samtools pileup -f ${ref}.fasta samtestSorted.bam

View File

@ -1,728 +0,0 @@
import os.path
import sys
from optparse import OptionParser
from vcfReader import *
#import pylab
import operator
from itertools import *
import math
import random
DEBUG = False
class Range:
    """A possibly-unbounded interval.  Either endpoint may be Range.ANY
    ('*'), meaning unbounded on that side; each side is independently open
    (endpoint excluded) or closed (endpoint included, the default)."""
    ANY = '*'

    def __init__(self, left = ANY, right = ANY, leftOpen = False, rightOpen = False):
        self.left = left
        self.right = right
        self.leftOpen = leftOpen
        self.rightOpen = rightOpen

    def __str__(self):
        # Standard interval notation: [ ] for closed ends, ( ) for open ends.
        lb = '(' if self.leftOpen else '['
        rb = ')' if self.rightOpen else ']'
        return '%s%s, %s%s' % (lb, self.left, self.right, rb)

    __repr__ = __str__

    def dashedString(self):
        """Like str(), but with '-' instead of ', ' (filename friendly)."""
        return str(self).replace(", ", "-")

    def contains(self, v):
        """True if v lies inside the range, honoring ANY and open ends."""
        def side_ok(bound, beyond, is_open):
            # ANY always passes; otherwise v must be strictly beyond the
            # bound, or equal to it when the side is closed.
            return bound == Range.ANY or beyond(v, bound) or (not is_open and v == bound)
        return (side_ok(self.left, operator.__gt__, self.leftOpen)
                and side_ok(self.right, operator.__lt__, self.rightOpen))
class CallCovariate:
    """Pairs a (feature range, quality range) cell with the empirical false
    positive rate estimated for calls falling in that cell."""

    def __init__(self, feature, featureRange, qualRange, FPRate = None):
        self.feature = feature            # VCF field the covariate bins on
        self.featureRange = featureRange  # Range over that field's values
        self.qualRange = qualRange        # Range over QUAL
        self.FPRate = FPRate              # filled in during calibration

    def containsVariant(self, call):
        """True when the call's feature value and QUAL both fall in range."""
        in_feature = self.featureRange.contains(call.getField(self.feature))
        in_qual = self.qualRange.contains(call.getQual())
        return in_feature and in_qual

    def getFPRate(self): return self.FPRate

    def getFeature(self): return self.feature

    def getCovariateField(self):
        """INFO key under which this covariate's recalibrated Q is stored."""
        return self.getFeature() + '_RQ'

    def __str__(self):
        return "[CC feature=%s range=%s qualRange=%s]" % (self.feature, self.featureRange, self.qualRange)
class RecalibratedCall:
    """Wraps a VCF call plus per-feature false-positive-rate estimates and
    combines them into a single joint FP rate for the recalibrated QUAL."""

    def __init__(self, call, features):
        self.call = call
        # feature name -> FP rate; None until recalFeature assigns it
        self.features = dict([[feature, None] for feature in features])

    def recalFeature( self, feature, FPRate ):
        """Record the FP rate for one feature; each may be set only once."""
        assert self.features[feature] == None, "Feature " + feature + ' has value ' + str(self.features[feature]) + ' for call ' + str(self.call) # not reassigning values
        assert FPRate <= 1 and FPRate >= 0
        self.features[feature] = FPRate

    def getFeature( self, feature, missingValue = None, phredScaleValue = False ):
        """FP rate for feature (optionally phred-scaled); missingValue if unset."""
        v = self.features[feature]
        if v == None:
            return missingValue
        elif phredScaleValue:
            return phredScale(v)
        else:
            return v

    def jointFPErrorRate(self):
        """Combine the per-feature TP rates in log10 space into one joint
        FP rate, capped near 1e-3 ("approximation from het of 0.001").
        (Python 2 only: dict.itervalues and the <> operator.)"""
        logTPRates = [math.log10(1-r) for r in self.features.itervalues() if r <> None]
        logJointTPRate = reduce(lambda x, y: x + y, logTPRates, 0)
        logJointTPRate = min(logJointTPRate, 1e-3 / 3) # approximation from het of 0.001
        jointTPRate = math.pow(10, logJointTPRate)
        return 1 - jointTPRate

    def featureStringList(self):
        # e.g. "DP=Q30,AB=Q17"; '*' marks features never recalibrated
        return ','.join(map(lambda feature: '%s=Q%d' % (feature, self.getFeature(feature, '*', True)), self.features.iterkeys()))

    def __str__(self):
        return '[%s: %s => Q%d]' % (str(self.call), self.featureStringList(), phredScale(self.jointFPErrorRate()))
def readVariants( file, maxRecords = None, decodeAll = True, downsampleFraction = 1, filter = None, minQScore = -1, maxQScore = 10000000, mustBeVariant = False, skip = None ):
    """Stream VCF records from 'file', applying filter/QUAL/skip/downsample
    cuts; returns (header, lazy iterator of records).

    NOTE: the parameter 'filter' shadows the builtin; when None it defaults
    to (not OPTIONS.unfiltered).  (Python 2 only: <>, ifilter/imap.)"""
    if filter == None:
        filter = not OPTIONS.unfiltered
    f = open(file)
    header, columnNames, lines = readVCFHeader(f)
    nLowQual = 0
    counter = 0
    def parseVariant(args):
        # NOTE(review): 'global nLowQual, counter' cannot rebind the
        # *enclosing function's* locals, so the module-level nLowQual stays
        # untouched and the low-qual report below never fires -- confirm.
        global nLowQual, counter
        header1, VCF, counter = args
        counter += 1
        if filter and not VCF.passesFilters() or ( False and mustBeVariant == True and not VCF.isVariant() ): # currently ignore mustBeVariant
            return None
        elif VCF.getQual() <= minQScore or VCF.getQual() > maxQScore:
            # outside the accepted QUAL window
            return None
        elif skip <> None and counter % skip <> 0:
            # keep only every skip-th record
            return None
        elif random.random() <= downsampleFraction:
            return VCF
        else:
            return None
    variants = ifilter(None, imap(parseVariant, islice(lines2VCF(lines, header=header, columnNames = columnNames, extendedOutput = True, decodeAll = decodeAll), maxRecords)))
    if nLowQual > 0:
        print '%d snps filtered due to QUAL < %d' % (nLowQual, minQScore)
    return header, variants
def selectVariants( variants, selector = None ):
    """Return the variants passing selector; with no selector, return the
    input unchanged."""
    if selector is None:
        return variants
    return filter(selector, variants)
def titv(variants):
    """Transition/transversion ratio of a call set (denominator floored
    at 1 to avoid division by zero)."""
    transitions = sum(1 for v in variants if VCFRecord.isTransition(v))
    transversions = len(variants) - transitions
    return transitions / (1.0 * max(transversions, 1))
def dbSNPRate(variants):
    """Fraction of variants already known to dbSNP (denominator floored
    at 1)."""
    known = sum(1 for v in variants if VCFRecord.isKnown(v))
    return float(known) / max(len(variants), 1)
def gaussian(x, mu, sigma):
    """Normal density N(mu, sigma**2) evaluated at x."""
    scale = 1 / math.sqrt(2 * math.pi * sigma**2)
    power = -1 * ( x - mu )**2 / (2 * sigma**2)
    return scale * math.exp(power)
# if target = T, and FP calls have ti/tv = 0.5, we want to know how many FP calls
# there are in N calls with ti/tv of X.
#
def titvFPRateEstimate(variants, target):
    """Estimate the FP rate of a call set from its ti/tv ratio vs. target.

    Returns (observed ti/tv ratio, estimated FP rate).  The model assumes
    false positives have an expected ti/tv of 0.5 (random substitutions).
    (Python 2 only: print statements; reads globals OPTIONS/DEBUG.)"""
    titvRatio = titv(variants)
    # f <- function(To,T) { (To - T) / (1/2 - T) + 0.001 }
    def theoreticalCalc():
        # Linear interpolation: ti/tv >= target means 0% FP, ti/tv of 0.5
        # means 100% FP; the TP rate is clamped by maxQScoreForCovariate.
        if titvRatio >= target:
            FPRate = 0
        else:
            FPRate = (titvRatio - target) / (0.5 - target)
        FPRate = min(max(FPRate, 0), 1)
        TPRate = max(min(1 - FPRate, 1 - dephredScale(OPTIONS.maxQScoreForCovariate)), dephredScale(OPTIONS.maxQScoreForCovariate))
        if DEBUG: print 'FPRate', FPRate, titvRatio, target
        assert FPRate >= 0 and FPRate <= 1
        return TPRate
    # gaussian model (alternative; disabled below)
    def gaussianModel():
        LEFT_HANDED = True
        sigma = 1 # old value is 5
        constant = 1 / math.sqrt(2 * math.pi * sigma**2)
        exponent = -1 * ( titvRatio - target )**2 / (2 * sigma**2)
        TPRate = gaussian(titvRatio, target, sigma) / gaussian(target, target, sigma)
        if LEFT_HANDED and titvRatio >= target:
            TPRate = 1
        TPRate -= dephredScale(OPTIONS.maxQScoreForCovariate)
        if DEBUG: print 'TPRate', TPRate, constant, exponent, dephredScale(OPTIONS.maxQScoreForCovariate)
        return TPRate
    FPRate = 1 - theoreticalCalc()
    #FPRate = 1 - gaussianModel()
    nVariants = len(variants)
    if DEBUG: print ':::', nVariants, titvRatio, target, FPRate
    return titvRatio, FPRate
def phredScale(errorRate):
    """Convert an error probability to a Phred score, capped at Q100
    (rates below 1e-10 are clamped to avoid log10(0))."""
    clamped = max(errorRate, 1e-10)
    return -10 * math.log10(clamped)
def dephredScale(qscore):
    """Convert a Phred score back to an error probability."""
    return 10.0 ** (float(qscore) / -10)
def frange6(*args):
    """A float range generator: frange6(end), frange6(start, end) or
    frange6(start, end, step), mirroring range() semantics for floats.

    Raises ValueError for a zero step and TypeError for a bad arity.
    """
    start = 0.0
    step = 1.0
    l = len(args)
    if l == 1:
        end = args[0]
    elif l == 2:
        start, end = args
    elif l == 3:
        start, end, step = args
        if step == 0.0:
            # FIX: 'raise ValueError, "msg"' is Python-2-only syntax;
            # calling the exception class is valid on both 2 and 3.
            raise ValueError("step must not be zero")
    else:
        raise TypeError("frange expects 1-3 arguments, got %d" % l)
    v = start
    while True:
        if (step > 0 and v >= end) or (step < 0 and v <= end):
            # FIX: ending with 'return' instead of 'raise StopIteration'
            # keeps PEP 479 / Python 3.7+ generator semantics correct.
            return
        yield v
        v += step
def compareFieldValues( v1, v2 ):
    """cmp()-style three-way comparison that orders mixed-type values by
    their type first, then by value within a type.

    (Python 2 only: the builtin cmp() and the <> operator were removed
    in Python 3.)"""
    if type(v1) <> type(v2):
        # different types: order deterministically by type
        c = cmp(type(v1), type(v2))
    else:
        c = cmp(v1, v2)
    return c
def calculateBins(variants, field, minValue, maxValue, partitions):
    """Extract 'field' from every variant and delegate to the value binner."""
    field_values = [v.getField(field) for v in variants]
    return calculateBinsForValues(field_values, field, minValue, maxValue, partitions)
def calculateBinsForValues(values, field, minValue, maxValue, partitions):
    """Group values into roughly equal-count bins for covariate binning.

    Returns a list of [left, right, count] entries.  Adjacent distinct
    values are merged into the current bin until it would exceed
    len(values)/partitions entries; a fresh bin is also forced at the first
    numeric value after non-numeric placeholders.  minValue/maxValue are
    accepted but not used by the current implementation."""
    sortedValues = sorted(values)
    # record the smallest numeric value for recal.log printing
    captureFieldRangeForPrinting(field, sortedValues)
    targetBinSize = len(values) / (1.0*partitions)
    uniqBins = groupby(sortedValues)
    # [distinct value, multiplicity] pairs in sorted order
    binsAndSizes = map(lambda x: [x[0], len(list(x[1]))], uniqBins)
    def bin2Break(bin): return [bin[0], bin[0], bin[1]]
    bins = [bin2Break(binsAndSizes[0])]
    for bin in binsAndSizes[1:]:
        curLeft = bins[-1][0]
        curSize = bin[1]
        prevSize = bins[-1][2]
        if curSize + prevSize > targetBinSize or (not isNumber(curLeft) and isNumber(bin[0])):
            # current bin is full (or we just crossed from non-numeric to
            # numeric values): start a new bin at this value
            bins.append(bin2Break(bin))
        else:
            # absorb this value into the current bin
            bins[-1][1] = bin[0]
            bins[-1][2] += curSize
    return bins
def fieldRange(variants, field):
    """Compute (min, max, validated bins) for one VCF field across variants.

    Bin count comes from the --partitions option; validateBins raises on
    overlapping bins."""
    values = map(lambda v: v.getField(field), variants)
    minValue = min(values)
    maxValue = max(values)
    bins = calculateBins(variants, field, minValue, maxValue, OPTIONS.partitions)
    validateBins(bins)
    return minValue, maxValue, bins
def validateBins(bins):
    """Sanity check that no two bins overlap; raises Exception on overlap.

    Each bin is (left, right, count).  For every pair of distinct bins we
    verify that neither endpoint of bin1 lies strictly inside bin2; pairs
    involving an unbounded (None) endpoint are skipped.
    """
    for left1, right1, count1 in bins:
        for left2, right2, count2 in bins:
            def contains2(x):
                # strictly inside the open interval (left2, right2)
                return left2 < x and x < right2
            if left1 != left2 and right1 != right2:
                if None in [left1, left2, right1, right2]:
                    pass # unbounded bins cannot be compared; assume ok
                elif contains2(left1) or contains2(right1):
                    # BUG FIX: the original tested contains2(right2), which
                    # is always False (right2 is never strictly inside its
                    # own interval), so overlaps via bin1's right edge were
                    # never detected.  Test right1 instead.
                    raise Exception("Bad bins", left1, right1, left2, right2)
def printFieldQualHeader():
    """Print the calibration-table column header to stdout and, when open,
    to the RECAL_LOG file.  Truth-comparison columns are appended when a
    truth set was loaded.  (Python 2 only: print >> syntax.)"""
    more = ""
    if TRUTH_CALLS <> None:
        more = CallCmp.HEADER
    def p(stream):
        if stream <> None:
            print >> stream, ' %20s %20s left right %15s nVariants nNovels titv titvNovels dbSNP Q' % ("category", "field", "qRange"), more
    p(sys.stdout)
    p(RECAL_LOG)
def printFieldQual( category, field, cc, variants ):
    """Print one calibration-table row (plus truth-comparison columns when
    TRUTH_CALLS is set) to stdout and RECAL_LOG.
    (Python 2 only: print >> syntax.)"""
    more = ""
    if TRUTH_CALLS <> None:
        callComparison, theseFPs = sensitivitySpecificity(variants, TRUTH_CALLS)
        more = str(callComparison)
    novels = selectVariants(variants, VCFRecord.isNovel)
    def p(stream):
        if stream <> None:
            print >> stream, ' %20s %20s %15s %15s %8d %8d %2.2f %2.2f %3.2f %3d' % (category, field, binString(field, cc.featureRange), cc.qualRange.dashedString(), len(variants), len(novels), titv(variants), titv(novels), dbSNPRate(variants) * 100, phredScale(cc.FPRate)), more
    p(sys.stdout)
    p(RECAL_LOG)
# field name -> smallest numeric value seen, used by binString for plotting
FIELD_RANGES = dict()

def captureFieldRangeForPrinting(field, sortedValues):
    """Finds the minimum float value in sortedValues for convenience printing in recal.log"""
    floatValues = filter(isNumber, sortedValues)
    if floatValues <> []:
        # sortedValues is sorted, so the first numeric value is the minimum
        FIELD_RANGES[field] = floatValues[0]
def isNumber(x):
    """True for int/long/float values.  (Python 2 only: 'long' exists
    only there.)"""
    return isinstance(x, (int, long, float))
def binString(field, cc):
    """Format a feature Range's endpoints for the calibration table.

    With --plottable, non-numeric endpoints (e.g. Range.ANY) are replaced
    by a value just below the field's smallest observed number so that the
    log rows remain plottable."""
    epsilon = 1e-2
    left, right = cc.left, cc.right
    leftStr = str(left)
    rightStr = "%5s" % str(right)
    if OPTIONS.plottableNones and not isNumber(left) and not isNumber(right) and field in FIELD_RANGES:
        left = right = FIELD_RANGES[field] - epsilon
    if OPTIONS.plottableNones and not isNumber(left) and isNumber(right):
        left = right - epsilon
    if isNumber(left): leftStr = "%.4f" % left
    if isNumber(right): rightStr = "%.4f" % right
    return '%12s %12s' % (leftStr, rightStr)
#
#
#
def recalibrateCalls(variants, fields, callCovariates):
    """Generator: apply matching covariates to each variant and rewrite QUAL.

    For every covariate containing the variant, the per-feature phred score
    is stored under FEATURE_RQ; the combined joint FP rate replaces QUAL,
    and the original QUAL is preserved in the OQ field."""
    def phred(v): return phredScale(v)
    for variant in variants:
        recalCall = RecalibratedCall(variant, fields)
        originalQual = variant.getField('QUAL')
        for callCovariate in callCovariates:
            if callCovariate.containsVariant(variant):
                FPR = callCovariate.getFPRate()
                recalCall.recalFeature(callCovariate.getFeature(), FPR)
                recalCall.call.setField(callCovariate.getCovariateField(), phred(FPR))
        recalCall.call.setField('QUAL', phred(recalCall.jointFPErrorRate()))
        recalCall.call.setField('OQ', originalQual)
        yield recalCall.call
#
#
#
def optimizeCalls(variants, fields, titvTarget):
    """Derive covariates from the variants, then recalibrate those same
    calls with them.  Returns (lazy recalibrated-call generator, list of
    CallCovariate)."""
    callCovariates = calibrateFeatures(variants, fields, titvTarget, category = "covariates", useBreaks=True)
    recalCalls = recalibrateCalls(variants, fields, callCovariates)
    return recalCalls, callCovariates
def printCallQuals(field, recalCalls, titvTarget, info = ""):
    """Report per-bin and cumulative calibration tables for recalCalls over
    one field.  (Python 2 only: print statements.)"""
    print '--------------------------------------------------------------------------------'
    print info
    calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = False, forcePrint = True, prefix = "OPT-", printHeader = False, category = "optimized-calls" )
    print 'Cumulative'
    calibrateFeatures(recalCalls, [field], titvTarget, printCall = True, cumulative = True, forcePrint = True, prefix = "OPTCUM-", printHeader = False, category = "optimized-calls" )
def all( p, l ):
    """True iff predicate p holds for every element of l (vacuously True
    for an empty l).

    NOTE: shadows the builtin all() for the rest of this module."""
    for item in l:
        if not p(item):
            return False
    return True
def mapVariantBins(variants, field, cumulative = False, breakQuals = [Range()]):
    """Yield (CallCovariate, variants-in-bin) for the cross product of the
    field's value bins and the supplied QUAL ranges.

    When cumulative, each bin's right edge is opened to ANY so the bins
    nest.  NOTE: the mutable default breakQuals=[Range()] is shared across
    calls -- harmless here because it is never mutated."""
    minValue, maxValue, featureBins = fieldRange(variants, field)
    bins = [(x,y) for x in featureBins for y in breakQuals]
    def variantsInBin(featureBin, qualRange):
        right = featureBin[1]
        if cumulative:
            right = Range.ANY
        cc = CallCovariate(field, Range(featureBin[0], right), qualRange)
        return cc, selectVariants(variants, lambda v: cc.containsVariant(v))
    return starmap( variantsInBin, bins )
def qBreaksRanges(qBreaks, useBreaks):
    """Turn a comma-separated QUAL break string into a list of half-open
    Ranges covering (*, b1), [b1, b2), ..., [bn, *).  With no breaks given
    (or useBreaks False) everything falls into one catch-all Range."""
    if qBreaks is None or not useBreaks:
        return [Range()] # include everything in a single range
    cutpoints = [float(b) for b in qBreaks.split(',')]
    lefts = chain([Range.ANY], cutpoints)
    rights = chain(cutpoints, [Range.ANY])
    return [Range(lo, hi, rightOpen = True) for lo, hi in zip(lefts, rights)]
def calibrateFeatures(variants, fields, titvTarget, printCall = False, cumulative = False, forcePrint = False, prefix = '', printHeader = True, category = None, useBreaks = False ):
    """Estimate an FP rate for each (field bin x QUAL range) cell.

    Returns the list of CallCovariate cells whose bins held more than
    --minVariantsPerBin variants (or everything with forcePrint), printing
    one table row per accepted cell.  (Python 2 only: print, <>.)"""
    covariates = []
    if printHeader: printFieldQualHeader()
    for field in fields:
        if DEBUG: print 'Optimizing field', field
        titv, FPRate = titvFPRateEstimate(variants, titvTarget)
        # note: the local name 'titv' shadows the module-level titv() here
        for cc, selectedVariants in mapVariantBins(variants, field, cumulative = cumulative, breakQuals = qBreaksRanges(OPTIONS.QBreaks, useBreaks and field <> 'QUAL')):
            if len(selectedVariants) > max(OPTIONS.minVariantsPerBin,1) or forcePrint:
                titv, FPRate = titvFPRateEstimate(selectedVariants, titvTarget)
                cc.FPRate = FPRate
                covariates.append(cc)
                printFieldQual( category, prefix + field, cc, selectedVariants )
            else:
                print 'Not calibrating bin', cc, 'because it contains too few variants:', len(selectedVariants)
    return covariates
class CallCmp:
    """Counts of true positives, false positives, and false negatives from
    comparing a call set against a truth set, plus derived statistics."""

    HEADER = "TP FP FN FNRate Sensitivity PPV"

    def __init__(self, nTP, nFP, nFN):
        self.nTP = nTP
        self.nFP = nFP
        self.nFN = nFN

    def FNRate(self):
        """FN / (TP + FN), the miss rate (denominator floored at 1)."""
        denom = max(self.nTP + self.nFN, 1)
        return (1.0 * self.nFN) / denom

    def sensitivity(self):
        """TP / (TP + FN) (denominator floored at 1)."""
        denom = max(self.nTP + self.nFN, 1)
        return (1.0 * self.nTP) / denom

    def PPV(self):
        """TP / (TP + FP), positive predictive value (denominator floored
        at 1)."""
        denom = max(self.nTP + self.nFP, 1)
        return (1.0 * self.nTP) / denom

    def __str__(self):
        return '%6d %6d %6d %.3f %.3f %.3f' % (
            self.nTP, self.nFP, self.nFN,
            self.FNRate(), self.sensitivity(), self.PPV())
def variantInTruth(variant, truth):
    """Return the truth record at the variant's locus, or False when the
    locus is absent from the truth dict."""
    return truth.get(variant.getLoc(), False)
def isVariantInSample(t, sample):
    """True when the truth record's genotype for 'sample' is anything other
    than hom-ref "0/0"."""
    genotype = t.getField(sample)
    return genotype != "0/0"
def variantsInTruth(truth):
    """Count truth records with a variant genotype for OPTIONS.useSample.

    fixme (from original): assumes OPTIONS.useSample is configured."""
    return len([t for t in truth if isVariantInSample(t, OPTIONS.useSample)])
def sensitivitySpecificity(variants, truth):
    """Classify each call as TP or FP against the truth dict; count FNs
    from the truth records that were never matched.

    Side effects: sets TP=1/0 on each call and FN=0 on matched truth
    records.  Returns (CallCmp, list of FP calls).
    (Python 2 only: itervalues.)"""
    nTP, nFP = 0, 0
    FPs = []
    for variant in variants:
        t = variantInTruth(variant, truth)
        isTP, isFP = False, False
        if OPTIONS.useSample or OPTIONS.onlyAtTruth:
            if t: # we have a site
                # TP when the call agrees with the sample's genotype status
                isTP = (isVariantInSample(t, OPTIONS.useSample) and t.isVariant()) or (not isVariantInSample(t, OPTIONS.useSample) and not t.isVariant())
                isFP = not isTP
        else:
            # site-level mode: any truth record at this locus counts as TP
            isTP = t
            isFP = not t
        if isTP:
            t.setField("FN", 0)
            variant.setField("TP", 1)
            nTP += 1
        elif isFP:
            nFP += 1
            variant.setField("TP", 0)
            FPs.append(variant)
    nRef = 0 # len(filter(lambda x: not x.isVariant(), truth.itervalues()))
    nFN = variantsInTruth(truth.itervalues()) - nTP - nRef
    return CallCmp(nTP, nFP, nFN), FPs
def markTruth(calls):
    """Initialize TP=0 on every truth record (skipped in per-sample mode).
    (Python 2 only: itervalues.)"""
    if not OPTIONS.useSample:
        for variant in calls.itervalues():
            variant.setField("TP", 0) # set the TP field to 0
def compareCalls(calls, truthCalls):
    """Print truth-comparison tables (per-bin and cumulative) over the
    QUAL and OQ fields.

    truthCalls is not referenced in this body; the actual comparison
    happens inside printFieldQual via the global TRUTH_CALLS."""
    def compare1(name, cumulative):
        for field in ["QUAL", "OQ"]:
            for cc, selectedVariants in mapVariantBins(calls, field, cumulative = cumulative):
                printFieldQual("truth-comparison-" + name, field, cc, selectedVariants )
    print 'PER BIN nCalls=', len(calls)
    compare1('per-bin', False)
    print 'CUMULATIVE nCalls=', len(calls)
    compare1('cum', True)
def randomSplit(l, pLeft):
    """Randomly partition l into two lists, preserving order; each element
    lands in the first list with probability pLeft (one random draw per
    element)."""
    first, second = [], []
    for elt in l:
        if random.random() < pLeft:
            first.append(elt)
        else:
            second.append(elt)
    return first, second
def setup():
    """Parse the command line into the global OPTIONS and return the
    positional arguments (the input VCF, optionally one more)."""
    global OPTIONS, header
    usage = "usage: %prog files.list [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-f", "--f", dest="fields",
                        type='string', default="QUAL",
                        help="Comma-separated list of fields (either in the VCF columns of as INFO keys) to use during optimization [default: %default]")
    parser.add_option("-t", "--truth", dest="truth",
                        type='string', default=None,
                        help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
    parser.add_option("-l", "--recalLog", dest="recalLog",
                        type='string', default="recal.log",
                        help="VCF formated truth file. If provided, the script will compare the input calls with the truth calls. It also emits calls tagged as TP and a separate file of FP calls")
    parser.add_option("-u", "--unfiltered", dest="unfiltered",
                        action='store_true', default=False,
                        help="If provided, unfiltered calls will be used in comparisons [default: %default]")
    parser.add_option("", "--plottable", dest="plottableNones",
                        action='store_true', default=False,
                        help="If provided, will generate fake plottable points for annotations with None values -- doesn't effect the behavior of the system just makes it easy to plot outputs [default: %default]")
    parser.add_option("", "--onlyAtTruth", dest="onlyAtTruth",
                        action='store_true', default=False,
                        help="If provided, we only consider TP/FP/FN at truth sites[default: %default]")
    parser.add_option("-p", "--partitions", dest="partitions",
                        type='int', default=25,
                        help="Number of partitions to use for each feature. Don't use so many that the number of variants per bin is very low. [default: %default]")
    parser.add_option("", "--maxRecordsForCovariates", dest="maxRecordsForCovariates",
                        type='int', default=2000000,
                        help="Derive covariate information from up to this many VCF records. For files with more than this number of records, the system downsamples the reads [default: %default]")
    parser.add_option("-m", "--minVariantsPerBin", dest="minVariantsPerBin",
                        type='int', default=10,
                        help="Don't include any covariates with fewer than this number of variants in the bin, if such a thing happens. NEEDS TO BE FIXED")
    parser.add_option("-M", "--maxRecords", dest="maxRecords",
                        type='int', default=None,
                        help="Maximum number of input VCF records to process, if provided. Default is all records")
    parser.add_option("-q", "--qMin", dest="minQScore",
                        type='int', default=-1,
                        help="The minimum Q score of the initial SNP list to consider for selection [default: %default]")
    parser.add_option("-Q", "--qMax", dest="maxQScore",
                        type='int', default=1000000,
                        help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
    parser.add_option("", "--QBreaks", dest="QBreaks",
                        type='string', default=None,
                        help="Breaks in QUAL for generating covarites [default: %default]")
    parser.add_option("", "--maxQScoreForCovariate", dest="maxQScoreForCovariate",
                        type='int', default=60,
                        help="The maximum Q score allowed for both a single covariate and the overall QUAL score [default: %default]")
    parser.add_option("-o", "--outputVCF", dest="outputVCF",
                        type='string', default="recal.vcf",
                        help="If provided, a VCF file will be written out to this file [default: %default]")
    parser.add_option("", "--FNoutputVCF", dest="FNoutputVCF",
                        type='string', default=None,
                        help="If provided, VCF file will be written out to this file [default: %default]")
    parser.add_option("", "--titv", dest="titvTarget",
                        type='float', default=None,
                        help="If provided, we will optimize calls to the targeted ti/tv rather than that calculated from known calls [default: %default]")
    parser.add_option("-b", "--bootstrap", dest="bootStrap",
                        type='float', default=None,
                        help="If provided, the % of the calls used to generate the recalibration tables. [default: %default]")
    parser.add_option("-k", "--skip", dest="skip",
                        type=int, default=None,
                        help="If provided, we'll only take every nth record [default: %default]")
    parser.add_option("-s", "--useSample", dest="useSample",
                        type='string', default=False,
                        help="If provided, we will examine sample genotypes for this sample, and consider TP/FP/FN in the truth conditional on sample genotypes [default: %default]")
    parser.add_option("-r", "--dontRecalibrate", dest="dontRecalibrate",
                        action='store_true', default=False,
                        help="If provided, we will not actually do anything to the calls, they will just be assessed [default: %default]")
    (OPTIONS, args) = parser.parse_args()
    if len(args) > 2:
        parser.error("incorrect number of arguments")
    return args
def assessCalls(file):
    """Load calls from a VCF (downsampling when it exceeds the covariate
    record budget) and report ti/tv statistics.

    Returns (header, list of calls, ti/tv target); the target defaults to
    the ti/tv of the known (dbSNP) calls when --titv was not given.
    (Python 2 only: print statements, <>.)"""
    print 'Counting records in VCF', file
    numberOfRecords = 1
    downsampleFraction = 1
    if OPTIONS.maxRecords <> None:
        numberOfRecords = quickCountRecords(open(file))
        if OPTIONS.maxRecords < numberOfRecords:
            numberOfRecords = OPTIONS.maxRecords
    downsampleFraction = min(float(OPTIONS.maxRecordsForCovariates) / numberOfRecords, 1)
    header, allCalls = readVariants(file, OPTIONS.maxRecords, downsampleFraction=downsampleFraction, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
    allCalls = list(allCalls)
    print 'Number of VCF records', numberOfRecords, ', max number of records for covariates is', OPTIONS.maxRecordsForCovariates, 'so keeping', downsampleFraction * 100, '% of records'
    print 'Number of selected VCF records', len(allCalls)
    titvtarget = OPTIONS.titvTarget
    if titvtarget == None:
        # no explicit target: use the ti/tv of the known calls
        titvtarget = titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv all ', titv(allCalls)
    print 'Ti/Tv known', titv(selectVariants(allCalls, VCFRecord.isKnown))
    print 'Ti/Tv novel', titv(selectVariants(allCalls, VCFRecord.isNovel))
    return header, allCalls, titvtarget
def determineCovariates(allCalls, titvtarget, fields):
    """Build covariates from (optionally bootstrapped) calls.

    With --bootstrap p, covariates are trained on a random fraction p of
    the calls and evaluated on the remainder.  Returns the covariates."""
    if OPTIONS.bootStrap:
        callsToOptimize, recalEvalCalls = randomSplit(allCalls, OPTIONS.bootStrap)
    else:
        callsToOptimize = allCalls
    recalOptCalls, covariates = optimizeCalls(callsToOptimize, fields, titvtarget)
    printCallQuals("QUAL", list(recalOptCalls), titvtarget, 'OPTIMIZED CALLS')
    if OPTIONS.bootStrap:
        recalibatedEvalCalls = recalibrateCalls(recalEvalCalls, fields, covariates)
        printCallQuals("QUAL", list(recalibatedEvalCalls), titvtarget, 'BOOTSTRAP EVAL CALLS')
    return covariates
def writeRecalibratedCalls(file, header, calls):
    """Write 'calls' as VCF to 'file' (no-op when file is falsy), logging
    progress every 10000 records.  (Python 2 only: print >> syntax.)"""
    if file:
        f = open(file, 'w')
        i = 0
        for line in formatVCF(header, calls):
            if i % 10000 == 0: print 'writing VCF record', i
            i += 1
            print >> f, line
        f.close()
def readTruth(truthVCF):
    """Read the truth VCF into a dict keyed by locus string (getLoc);
    records at a duplicate locus collapse to the last one seen."""
    print 'Reading truth file', truthVCF
    rawTruth = list(readVariants(truthVCF, maxRecords = None, decodeAll = True, mustBeVariant = True)[1])
    truth = dict( [[v.getLoc(), v] for v in rawTruth])
    print 'Number of raw and passing filter truth calls', len(rawTruth), len(truth)
    return truth
def evaluateTruth(header, callVCF, truth, truthVCF):
    """Re-read the recalibrated calls, compare them against the truth set,
    rewrite callVCF with TP annotations, and optionally write FN truth
    records to --FNoutputVCF.

    NOTE(review): writeRecalibratedCalls rewrites callVCF in place after
    annotation -- confirm this overwrite is intended."""
    print 'Reading variants back in from', callVCF
    header, calls = readVariants(callVCF)
    calls = list(calls)
    print '--------------------------------------------------------------------------------'
    print 'Comparing calls to truth', truthVCF
    print ''
    compareCalls(calls, truth)
    writeRecalibratedCalls(callVCF, header, calls)
    def isFN(v):
        # a sample variant in the truth that was never matched (no FN field)
        return isVariantInSample(v, OPTIONS.useSample) and not v.hasField("FN")
    if truth <> None and OPTIONS.FNoutputVCF:
        f = open(OPTIONS.FNoutputVCF, 'w')
        for line in formatVCF(header, filter( isFN, truth.itervalues())):
            print >> f, line
        f.close()
# Module-level state shared with the table-printing helpers.
TRUTH_CALLS = None
RECAL_LOG = None

def main():
    """Entry point: read calls, derive covariates, recalibrate the full
    call set, and optionally compare against a truth VCF.
    (Python 2 only: print statements, <>, iteritems.)"""
    global TRUTH_CALLS, RECAL_LOG
    args = setup()
    fields = OPTIONS.fields.split(',')
    truthVCF = None
    if OPTIONS.truth <> None:
        truthVCF = OPTIONS.truth
        TRUTH_CALLS = readTruth(truthVCF)
    if OPTIONS.recalLog <> None:
        # log the full option set as comments at the top of recal.log
        RECAL_LOG = open(OPTIONS.recalLog, "w")
        print >> RECAL_LOG, "# optimized vcf", args[0]
        print >> RECAL_LOG, "# truth vcf", truthVCF
        for key, value in OPTIONS.__dict__.iteritems():
            print >> RECAL_LOG, '#', key, value
    header, allCalls, titvTarget = assessCalls(args[0])
    if not OPTIONS.dontRecalibrate:
        covariates = determineCovariates(allCalls, titvTarget, fields)
        print OPTIONS
        # re-read the input without downsampling so every call is recalibrated
        header, callsToRecalibate = readVariants(args[0], OPTIONS.maxRecords, minQScore = OPTIONS.minQScore, maxQScore = OPTIONS.maxQScore, skip = OPTIONS.skip)
        RecalibratedCalls = recalibrateCalls(callsToRecalibate, fields, covariates)
        writeRecalibratedCalls(OPTIONS.outputVCF, header, RecalibratedCalls)
    else:
        # assessment-only mode: just print calibration tables
        printFieldQualHeader()
        printCallQuals("QUAL", allCalls, titvTarget)
        OPTIONS.outputVCF = args[0]
    if truthVCF <> None:
        evaluateTruth(header, OPTIONS.outputVCF, TRUTH_CALLS, truthVCF)
# Set PROFILE = True to run main() under cProfile and print the hot spots.
PROFILE = False
if __name__ == "__main__":
    if PROFILE:
        import cProfile
        cProfile.run('main()', 'fooprof')
        import pstats
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
        p.sort_stats('time').print_stats(10)
        p.sort_stats('time', 'cum').print_stats(.5, 'init')
    else:
        main()

View File

@ -1,49 +0,0 @@
import sys
from optparse import OptionParser
def getRsIDSet(dbSNPIds, idIndex):
    """Read the file at dbSNPIds and return the frozenset of rsIDs found in
    whitespace-delimited column idIndex of each line.

    FIX: the original opened the file and never closed it (handle leak);
    the handle is now closed deterministically via try/finally."""
    ids = set()
    fh = open(dbSNPIds)
    try:
        for line in fh:
            ids.add(line.split()[idIndex])
    finally:
        fh.close()
    return frozenset(ids)
def main():
    """Filter dbSNP.to.match down to rows whose rsID (5th whitespace
    column) also appears in dbSNP.in.rsids, writing matches to dbSNP.out.
    (Python 2 only: print statements.)"""
    global OPTIONS
    usage = "usage: %prog [options] dbSNP.in.rsids dbSNP.to.match dbSNP.out"
    parser = OptionParser(usage=usage)
    parser.add_option("-v", "--verbose", dest="verbose",
                        action='store_true', default=False,
                        help="")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("incorrect number of arguments")
    dbSNPIds = args[0]
    dbSNPMatch = args[1]
    dbSNPOut = args[2]
    idIndex = 4  # 0-based whitespace column holding the rsID
    rsSet = getRsIDSet(dbSNPIds, idIndex)
    print 'rsID set has %d elements' % len(rsSet)
    count = 0
    matched = 0
    out = open(dbSNPOut, 'w')
    for line in open(dbSNPMatch):
        count += 1
        rsID = line.split()[idIndex]
        if rsID in rsSet:
            matched += 1
            out.write(line)
    print 'Processed %d lines, matching %d elements, excluding %d' % ( count, matched, count - matched )
    out.close()

if __name__ == "__main__":
    main()

View File

@ -1,287 +0,0 @@
#!/usr/bin/env python
"""Starts from TGTC list of fastb, qualb, and qltout files and creates merged SAM/BAM files"""
import os, commands, sys
from farm_commands import cmd
# Global list of failed conversions to be reported at end of execution
failed_list = []

class FailedConversion:
    """Record of one lane that could not be converted; announces itself
    on stdout at construction time."""

    def __init__(self, lane, error_str):
        self.lane = lane
        self.error_str = error_str
        # single-argument parenthesized print: identical output on Python 2
        print(self)

    def __str__(self):
        return self.error_str + " while attempting conversion of " + str(self.lane)
class lane:
    """One lane of sequencing data described by a TGTC line (fastb, qualb,
    qltout paths), plus logic to locate the reference dictionary and
    metrics files needed to convert it to SAM/BAM.

    NOTE: the class body scans /slxa/pipelineOutput/farm at import time to
    build the flowcell -> pipeline-directory map."""
    bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list";
    pipe_dirs_head = "/slxa/pipelineOutput/farm"
    pipe_dirs = os.listdir(pipe_dirs_head)
    # flowcell barcode (token after the first '_') -> pipeline directory
    pipe_dirs_dict = dict( [(dir.split("_")[1], dir) for dir in pipe_dirs if "_" in dir] )

    def __init__(self, line):
        # line: "flowcell.date.lane fastb qualb qltout [group]"
        fields = line.split(" ")
        if len(fields) == 5:
            self.fdl, self.fastb, self.qualb, self.qltout, group = fields
        else:
            self.fdl, self.fastb, self.qualb, self.qltout = fields
        self.qltout = self.qltout.rstrip()
        self.fqltout = os.path.splitext(self.qltout)[0]+".fais_filtered.qltout"
        self.flowcell, self.date, self.lane = self.fdl.split(".")
        self.flowlane = self.flowcell+"."+self.lane
        self.path_head = os.path.splitext(self.qltout)[0]
        self.sam = self.path_head+".sam"
        self.bam = self.path_head+".bam"
        self.head = ".".join(os.path.basename(self.path_head).split(".")[:2]) # should amount to FLOWCELL.LANE

    def pipe_dir(self):
        """Pipeline output dir for this flowcell (KeyError when unknown)."""
        dir = lane.pipe_dirs_dict[self.flowcell]
        return lane.pipe_dirs_head+"/"+dir

    def set_default_refdict(self):
        # fall back to the standard Hg18 reference dictionary
        self.refdict = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict";

    def set_default_metrics(self):
        # fall back to a generic "unknown" metrics file
        self.metrics = "/broad/1KG/legacy_data/tgtc2sam/unknown.metrics"

    def refdict_metrics(self):
        "Ensures that a reference dictionary and metrics are available"
        try:
            pipe_lane_dir = self.pipe_dir()
        except KeyError:
            # unknown flowcell: use defaults for both files
            self.set_default_refdict()
            self.set_default_metrics()
            return True
        self.metrics = pipe_lane_dir+"/"+str(self.flowlane)+".metrics"
        if os.path.exists(self.metrics):
            # derive the refdict from the reference= line in the metrics file
            line = commands.getoutput("grep '^reference=' "+pipe_lane_dir+"/"+self.flowlane+".metrics")
            if line != "":
                reference_fasta = line.split("=")[1]
                self.refdict = os.path.splitext(reference_fasta)[0]+'.dict'
                if "PhiX" in self.refdict:
                    # PhiX control lanes cannot be converted; record failure
                    failed_list.append(FailedConversion(self, "PhiX dictionary missing"))
                    return False
            else:
                self.set_default_refdict()
            return True # We were able to generate refdict and metrics files
        else:
            self.set_default_refdict()
            self.set_default_metrics()
            return True # We were able to generate refdict file and substituted unknown.metrics as the metrics file

    def __str__(self):
        return self.fdl
def usage():
    """Print command-line help for tgtc_sam.py and exit with status 1.
    (Python 2 only: bare print statements.)"""
    print "Usage: tgtc_sam.py <command> <command_args>..."
    print
    print " Works from tgtc files which contain fastb, qualb, qltout filenames corresponding to one lane of data"
    print " Commands and command arguments:"
    print " convert tgtc_list - given a tgtc file converts files to SAM and BAM"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " [clean] - optional 3rd argument that will create SAM/BAM files"
    print " regardless of whether they already exist"
    print
    print " merge tgtc output_file - merges converted BAM files"
    print " tgtc - file with lines of fastb, qualb, qltout filenames"
    print " output_file - path and name of merged BAM file"
    print
    print " lists list_of_tgtc_lists - run over a list of tgtc lists given in a file"
    print " list_of_tgtc_files - file with tgtc filenames in it"
    print
    print " Global options:"
    print " print_only: do not run commands, just print what they would be "
    sys.exit(1)
def make_lane_list(tgtc_file):
all_lanes = [lane(l) for l in open(tgtc_file) if not l.startswith("#")]
lanes = []
for ln in all_lanes:
if ln.refdict_metrics(): # if we were able to generate refdict and metrics
# append this lane to the lanes list that we will work with
lanes.append(ln)
unpaired = [l for l in lanes if ".end1." not in l.qualb and ".end2." not in l.qualb]
paired1 = [l for l in lanes if ".end1." in l.qualb]
paired2 = [l for l in lanes if ".end2." in l.qualb]
print "Unpaired lanes:",len(unpaired)
print "Paired 1 lanes:",len(paired1)
print "Paired 2 lanes:",len(paired2)
assert len(paired1) == len(paired2)
return [(u, None) for u in unpaired] + zip(paired1, paired2)
def convert_lanes(tgtc_file, recreate_all=False, just_print_commands=False):
lanes_done = 0
lane_list = make_lane_list(tgtc_file)
print "Beginning processing of these lanes (unpaired and paired):"
print "\n".join(["%s %s" % (p1,p2) for p1,p2 in lane_list])
for p1, p2 in lane_list:
if recreate_all or not os.path.exists(p1.sam) or not os.path.getsize(p1.sam):
# Filer reads with > 4 errors
cmd_str = "\"FilterAlignsILTStyle QLTIN="+p1.qltout+" QLTOUT="+p1.fqltout
if p2 == None:
# This is an UNPAIRED LANE
print "Processing unpaired lane",p1
cmd_str += (" && Qltout2SAM"+
" HEAD="+p1.head+
" SAM="+p1.sam+
" QLTOUT="+p1.fqltout+
" FASTB="+p1.fastb+
" QUALB="+p1.qualb+
" REFDICT="+p1.refdict+
" METRICS="+p1.metrics+
" ALIGNER=PairFinder"+
" TMPDIR=/broad/hptmp/andrewk")
else:
# This is a PAIRED LANE
print "Processing paired lanes",p1,p2
if p1.flowcell != p2.flowcell or p1.lane != p2.lane:
print "### WARNING ### paired lanes ",p1,p2,"do not match! Skipping."
continue
# Filter 2nd Qltout file too
cmd_str += " && FilterAlignsILTStyle QLTIN="+p2.qltout+" QLTOUT="+p2.fqltout
cmd_str += (" && QltoutPair2SAM"+
" HEAD="+p1.head+
" SAM="+p1.sam+
" QLTOUT1="+p1.fqltout+" QLTOUT2="+p2.fqltout+
" FASTB1="+p1.fastb+" FASTB2="+p2.fastb+
" QUALB1="+p1.qualb+" QUALB2="+p2.qualb+
" REFDICT="+p1.refdict+
" METRICS="+p1.metrics+
" ALIGNER=PairFinder"+
" TMPDIR=/broad/hptmp/andrewk")
# Make the BAM file
cmd_str += " && /seq/dirseq/samtools/current/samtools import "+lane.bam_ref_list+" "+p1.sam+" "+p1.bam+"\""
# Now remove one or both SAM files if all previous commands returned 0 for success
cmd_str += " && rm -f "+p1.sam
if p2 != None: cmd_str += " && rm -f "+p2.sam
status = cmd(cmd_str, "long", output_head=p1.path_head, just_print_commands=just_print_commands)
if status == 0:
lanes_done += 1
else:
failed_list.append("Unkown failure")
#if lanes_done + len(failed_list) > 0:
#break
failed = len(failed_list)
lanes_total = lanes_done+failed
percent_done = lanes_done / lanes_total if lanes_done != 0 else 0
print "Lane conversion success: %d/%d (%.1f%%)" % (lanes_done, lanes_total, percent_done)
def merge_bams(tgtc_file, outfile, just_print_commands=False):
input_lane_list = make_lane_list(tgtc_file)
#lane_list = [lp for lp in lane_list if lp[0].flowcell =="30LLT"]
missing_files = False
lane_list = []
for p1,p2 in input_lane_list:
if "adapted" not in p1.qualb or (p2 != None and "adapted" not in p2.qualb):
print "NOT ADAPTED", p1.qualb, p2.qualb
if not os.path.exists(p1.bam):
#print "Missing file:",p1.bam
missing_files = True
else:
lane_list.append( (p1,p2) )
#return
if not missing_files:
if not os.path.exists(outfile):
print len(lane_list),"lanes / lane pairs to merge"
if not os.path.exists(os.path.dirname(outfile)):
os.makedirs(os.path.dirname(outfile))
cmd("java -cp /home/radon01/depristo/dev/workspace/Libraries/bin"+
" edu.mit.broad.picard.sam.MergeSamFiles"+
" "+" ".join(["I="+p1.bam for p1,p2 in lane_list])+
" O="+outfile+
" TMP_DIR=/broad/hptmp/andrewk/"+
" && /seq/dirseq/samtools/current/samtools index "+outfile+
" && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
"long", outfile, just_print_commands=just_print_commands)
# Original samtools merge command below that was used for rev. 1 merging
#cmd("/seq/dirseq/samtools/current/samtools merge "+outfile+
# " "+" ".join([p1.bam for p1,p2 in lane_list])+
# " && /seq/dirseq/samtools/current/samtools index "+outfile+
# " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100",
# "long", outfile)
else:
# Convert the lanes that had missing outfiles
#convert_lanes(tgtc_file)
pass
def process_tgtc_lists(tgtc_list_file):
"Works through a list of tgtc files and merges them to produce the output"
lists_done = 0
for line in open(tgtc_list_file):
if line[0] == '#':
continue
fields = line.split()
if len(fields) != 2:
continue
head, tgtc_list = fields
bam = head+".bam"
if True: #not os.path.exists(bam):
print "Processing tgtc_list_file",tgtc_list,"with goal of producing",bam
merge_bams(tgtc_list, bam)
print
else:
print bam,"already exists\n"
lists_done += 1
#if lists_done > 0:
# break
if __name__ == "__main__":
if len(sys.argv) < 2:
usage()
operation = sys.argv[1]
# Check for print commands only option
if "print_only" in sys.argv:
just_print_commands = True
sys.argv.remove("print_only")
else:
just_print_commands = False
if operation == "convert":
if len(sys.argv) < 3:
usage()
tgtc_file = sys.argv[2]
# Run convert_lanes and recreate_all == True if clean option was given as 3rd arg
convert_lanes(tgtc_file, len(sys.argv) > 3 and sys.argv[3] == "clean", just_print_commands=just_print_commands)
elif operation == "merge":
if len(sys.argv) < 4:
usage()
tgtc_file = sys.argv[2]
outfile = sys.argv[3]
merge_bams(tgtc_file, outfile, just_print_commands=just_print_commands)
elif operation == "lists":
tgtc_list_file = "/broad/1KG/legacy_data/tgtc2sam/tgtc_lists_2_legacy_bams"
if len(sys.argv) >= 3:
tgtc_list_file = sys.argv[2]
process_tgtc_lists(tgtc_list_file)
else:
print "Didn't understand operation value: "+operation
usage()