deleting accidentally committed junk

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4464 348d0f76-0448-11de-a6fe-93d51630548a
2010-10-08 15:13:01 +00:00 · 2010-10-08 15:13:01 +00:00 · 39e373af6e
parent b3d81984aa
commit 39e373af6e
14 changed files with 1940 additions and 5 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
@ -88,7 +88,7 @@ public class VariantContextAdaptors {
                return null;
            Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);

-            if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
+            if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
                // add the reference allele
                List<Allele> alleles = new ArrayList<Allele>();
                alleles.add(refAllele);
--- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java
@ -45,6 +45,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
 import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
 import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.interval.IntervalUtils;
+import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
+import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.exceptions.StingException;
 import org.broadinstitute.sting.utils.sam.AlignmentUtils;
@ -84,6 +87,17 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
 //    @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false)
 //    boolean FORMAT_VCF = false;

+    @Argument(fullName = "genotype_intervals", shortName = "genotype",
+            doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false)
+    public String genotypeIntervalsFile = null;
+
+    @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false,
+            doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+
+                "if the list turns out to be unsorted, it will throw an exception.  "+
+                "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+
+                "to sort and keep it in memory (increases memory usage!).")
+    protected boolean GENOTYPE_NOT_SORTED = false;
+
 	@Argument(fullName="somatic", shortName="somatic",
 			doc="Perform somatic calls; two input alignment files (-I option) must be specified. Calls are performed from the second file (\"tumor\") against the first one (\"normal\").", required=false)
 	boolean call_somatic = false;
@ -167,6 +181,11 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
 	private SAMRecord lastRead;
    private byte[] refBases;
    private ReferenceDataSource refData;
+    private Iterator<GenomeLoc> genotypeIntervals = null;
+    private GenomeLocSortedSet traverseIntervals = null; // these are the traversal intervals passed with -L option (if any)
+
+    // the current interval in the list of intervals, for which we want to do full genotyping
+    private GenomeLoc currentGenotypeInterval = null;

    // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt"

@ -272,6 +291,22 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
                    normalSamples.add(rg.getSample());
                }
            }
+            if ( genotypeIntervalsFile != null ) {
+
+                traverseIntervals = getToolkit().getIntervals();
+
+                if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) {
+                    // prepare to read intervals one-by-one, as needed (assuming they are sorted).
+                    genotypeIntervals = new IntervalFileMergingIterator(
+                        new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
+                } else {
+                    // read in the whole list of intervals for cleaning
+                    GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(
+                        IntervalUtils.parseIntervalArguments(Arrays.asList(genotypeIntervalsFile)), IntervalMergingRule.OVERLAPPING_ONLY);
+                    genotypeIntervals = locs.iterator();
+                }
+                currentGenotypeInterval = genotypeIntervals.hasNext() ? genotypeIntervals.next() : null;
+            }

        }

--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java
@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Aug 3, 2010
+ * Time: 4:10:42 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+/** A simple utility class that encapsulates information about a single alignment (offset, strand, overlap, mismatch count).
+ *
+ */
+public class AlignmentInfo {
+    private int offset;
+    private int overlap;
+    private int mm ;
+    private Assembly a = null;
+
+    private static int RIDICULOUSLY_LARGE_NUMBER = 1000000000;
+
+    public AlignmentInfo() {
+        offset = 0;
+        mm = RIDICULOUSLY_LARGE_NUMBER;
+        overlap = -1;
+        a = null;
+    }
+
+    public AlignmentInfo(int mm, int offset, boolean isRc, int overlap, Assembly a) {
+        this.offset = (isRc ? (-offset-1) : offset );
+        this.overlap = overlap;
+        this.mm = mm;
+        this.a = a;
+    }
+
+    boolean isAligned() { return mm < RIDICULOUSLY_LARGE_NUMBER; }
+
+    public boolean isNegativeStrand() { return offset < 0; }
+    public Assembly getAssembly() { return a; }
+    public int getOffset() { return ( offset < 0 ? (-offset-1) : offset ); }
+    public int getMismatchCount() { return mm; }
+    public int getOverlap() { return overlap; }
+    public double getMismatchRate() { return isAligned() ? ((double)mm)/overlap : 1.0; }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java
@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+public class AlignmentList implements Iterable<AlignmentInfo> {
+        private int best_mm = 1000000000;
+        private int next_best_mm = 1000000000;
+        private List<AlignmentInfo> als = null;
+        private int next_best_count = 0;
+        private int best_overlap = 0;
+
+        private AlignmentStrategy strategy = null;
+
+        public AlignmentList(AlignmentStrategy s) {
+            this.strategy = s;
+            best_mm = 1000000000;
+            next_best_mm = 1000000000;
+            best_overlap = 0;
+            als = new ArrayList<AlignmentInfo>(1);
+        }
+
+        public boolean isAligned() {
+            return best_mm < 1000000000;
+        }
+
+        public List<AlignmentInfo> getAlignments() { return als; }
+
+        public int size() { return als.size(); }
+
+        public Iterator<AlignmentInfo> iterator() { return als.iterator(); }
+
+ //       public void tryAdd(int mm, int offset, boolean isRc, int overlap) {
+ //           tryAdd(new AlignmentInfo(mm,offset,isRc,overlap));
+ //       }
+
+        public void tryAdd(AlignmentInfo ai) {
+            AlignmentStrategy.Action a = strategy.action(ai,this) ;
+            switch ( a ) {
+                case DISCARD: break;
+                case REPLACE_BEST:
+                    next_best_mm = best_mm;
+                    next_best_count = size();
+                    als.clear();
+                    als.add(ai);
+                    best_mm = ai.getMismatchCount();
+                    best_overlap = ai.getOverlap();
+                    break;
+                case ADD_BEST:
+                    als.add(ai);
+                    if ( ai.getMismatchCount() < best_mm ) best_mm = ai.getMismatchCount();
+                    if ( ai.getOverlap() > best_overlap) best_overlap = ai.getOverlap();
+                    break;
+                case REPLACE_NEXTBEST:
+                    next_best_mm = ai.getMismatchCount();
+                    next_best_count = 1;
+                    break;
+                case ADD_NEXTBEST:
+                    next_best_count++;
+                    if ( ai.getMismatchCount() < next_best_mm ) next_best_mm = ai.getMismatchCount();
+                    break;
+                default: throw new StingException("Unrecognized action requested: "+a);
+            }
+        }
+
+        public void tryAddAll(AlignmentList al) {
+            for( AlignmentInfo ai : al) {
+                tryAdd(ai);
+            }
+        }
+
+        public int getBestMMCount() { return best_mm; }
+        public int getBestOverlap() { return best_overlap; }
+        public int getBestHitCount() { return als.size() ; }
+        public int getNextBestHitCount() { return next_best_count; }
+        public int getNextBestMMCount() { return next_best_mm; }
+//        public int getOverlap() { return overlap; }
+//        public int getOffset() { return offset; }
+//        public boolean isNegativeStrand() { return rc; }
+
+//        public double getMismatchRate() { return isAligned() ? ((double)best_mm)/overlap : 1.0 ; }
+
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java
@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Aug 3, 2010
+ * Time: 2:53:43 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public interface AlignmentStrategy {
+    enum Action {
+        DISCARD,
+        REPLACE_BEST,
+        ADD_BEST,
+        REPLACE_NEXTBEST,
+        ADD_NEXTBEST
+    };
+
+    public Action action(AlignmentInfo alignment, AlignmentList currentList);
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java
@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import org.broadinstitute.sting.utils.collections.PrimitivePair;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.broadinstitute.sting.utils.Utils;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+import net.sf.samtools.util.StringUtil;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Aug 3, 2010
+ * Time: 2:20:22 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class Assembly {
+        private byte[] consensus;
+        private short[] coverage;
+        private short[] mismatches;
+        private short [][] base_counts;
+
+        private boolean debug = false;
+        private List<String> seq_ids;
+        private List<byte []> seqs;
+        private List<Integer> seq_offsets;
+
+        private KmerIndex lookup; // assembled consensus sequence is indexed here
+
+        private int hookedAt = -1; // if set, specifies start on the ref of the assembled consensus sequence
+
+        private static List<PrimitivePair.Int> EMPTY_KMER_LIST = new ArrayList<PrimitivePair.Int>(0);
+
+        private int K = 15;
+
+        private AlignmentStrategy strategy = null;
+    
+    /** Creates new assembly seeded with the specified sequence; default key length (15) is used.
+     *
+     * @param seq
+     * @param id
+     */
+        public Assembly(final byte[] seq, String id) {
+            this(15,seq,id);
+        }
+
+    /** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained
+     * lookup index tables.
+     * @param K
+     * @param seq
+     * @param id
+     */
+        public Assembly(int K, final byte[] seq, String id) {
+            this.K = K;
+            seq_ids = new ArrayList<String>();
+            seq_offsets = new ArrayList<Integer>();
+            seqs = new ArrayList<byte[]>();
+            seq_ids.add(id);
+            seq_offsets.add(0);
+            seqs.add(seq);
+            consensus = Arrays.copyOf(seq,seq.length);
+            coverage = new short[seq.length];
+            Arrays.fill(coverage,(short)1);
+            mismatches = new short[seq.length]; // filled with 0's
+            base_counts = new short[4][seq.length];
+            for ( int i = 0 ; i < seq.length ; i++ ) {
+                int j = BaseUtils.simpleBaseToBaseIndex(seq[i]);
+                if ( j != -1) base_counts[j][i] = 1;
+            }
+            lookup = new KmerIndex(K,seq);
+            strategy = new DefaultAlignmentStrategy();
+        }
+
+    /** Creates new assembly seeded with the specified sequence; default key length (15) is used and the position on the
+     * reference of the entire assembly is set (as assemblly grows, position on the ref will be updated properly).
+     *
+     * @param seq
+     * @param id
+     */
+        public Assembly(final byte[] seq, String id, int posOnRef) {
+            this(seq,id);
+            hookedAt = posOnRef;
+        }
+
+    /** Creates new assembly seeded the specified sequence and sets kmer (key) length K for the internally maintained
+     * lookup index tables. Parameter <code>posOnRef</code> specifies the (initial) position of the entire assembly on the
+     * ref; as the assembly grows, the position on ref will be updated properly.
+     * @param K
+     * @param seq
+     * @param id
+     */
+        public Assembly(int K, final byte[] seq, String id, int posOnRef) {
+            this(K,seq,id);
+            hookedAt = posOnRef;
+        }
+
+    /** Returns total number of sequences currently held by this assembly.
+     * 
+     * @return
+     */
+        public int getNumSeqs() { return seqs.size() ; }
+
+        /** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
+         * the sequence to the consensus even if it aligns! This methods returns a list of alternative
+         * best alignments found (according to the strategy used) in a newly allocated AlignmentList object.
+         * @param seq sequence to align to this consensus
+         * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
+         * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
+         * along the assembled consensus sequence)
+         * @return a newly allocated alignment list; returned list can be empty if no alignments are found
+         */
+        public AlignmentList align(final byte[] seq, boolean tryRC) {
+            return align(seq,tryRC,null);
+        }
+
+        /** Attempts to align <code>seq</code> to this assembly's consensus. Does NOT add
+         * the sequence to the consensus even if it aligns! This method uses existing list of alignments
+         * (which can contain alignments to a different assembly) and updates it as necessary if a better alignment
+         * (or multiple better alignments) than the one(s) already held in the list is found. Reference to the
+         * <i>same</i> alignment list object is returned: this method modifies it's argument. If alignment list argument
+         * is <code>null</code>, new alignment list object will be allocated and returned by this method.
+         *
+         * @param seq sequence to align to this consensus
+         * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise
+         * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided,
+         * along the assembled consensus sequence)
+         * @return a newly allocated alignment list; returned list can be empty if no alignments are found
+       */
+        public AlignmentList align(final byte[] seq, boolean tryRC, AlignmentList a) {
+            if ( debug ) System.out.println("Assembly:: aligning sequence of length "+seq.length+"; tryRC="+tryRC+"; K="+K);
+
+            List<PrimitivePair.Int> fw_kmers = KmerIndex.toKeyOffsetList(K,seq);
+
+            if ( debug ) {
+                for( PrimitivePair.Int kmer: fw_kmers) {
+                    System.out.println("id="+kmer.getFirst()+" seq="+new String(KmerIndex.idToSeq(K,kmer.getFirst()))+" offset on seq="+kmer.getSecond());
+                }
+            }
+
+            byte [] rc_seq = (tryRC ? BaseUtils.simpleReverseComplement(seq) : null );
+            List<PrimitivePair.Int> rc_kmers = (tryRC ? KmerIndex.toKeyOffsetList(K,rc_seq) : EMPTY_KMER_LIST );
+
+            if ( a == null ) a = new AlignmentList(strategy);
+
+            // i is the position on the sequence seq or on its reverse complement
+            for(PrimitivePair.Int kmer : fw_kmers ) {
+
+                List<Integer> offsets = lookup.getOffsets(kmer.first);
+                if ( offsets != null ) {
+                    // kmer present in consensus sequence
+                    for ( int s : offsets ) {      // s=offset of the current kmer on the assembled consensus
+                        int trial_offset = s - kmer.second; // offset of the seq on the assembled consensus suggested by current kmer/offset
+                        int trial_mm = countMismatches(seq,trial_offset,a.getNextBestMMCount());
+                        a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,false,overlap(trial_offset,seq.length),this));
+                    }
+                }
+            }
+
+            for ( PrimitivePair.Int kmer : rc_kmers ) {
+
+                List<Integer> offsets = lookup.getOffsets(kmer.first);
+                if ( offsets != null ) {
+                    // kmer present in consensus sequence
+                    for ( int s : offsets ) {
+                        int trial_offset = s - kmer.second;
+                        int trial_mm = countMismatches(rc_seq,trial_offset,a.getNextBestMMCount());
+                        a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,true,overlap(trial_offset,seq.length),this));
+                    }
+                }
+            }
+            return a;
+        }
+
+        public void setDebug(boolean d) { this.debug = d; lookup.setDebug(d);}
+
+        public int numSequences() { return seq_ids.size(); }
+
+        private int overlap(int offset, int seq_length ) {
+            return Math.min(consensus.length,offset+seq_length)-Math.max(0,offset);
+        }
+
+        private int countMismatches(final byte seq[], int offset, int cutoff) {
+            int mm = 0;
+
+            int i ;
+            if ( offset >= 0 ) i = 0;
+            else { i = (-offset); offset = 0; }
+            for ( ; i < seq.length && offset < consensus.length ; i++ , offset++ ) {
+                if ( seq[i] != consensus[offset] ) {
+                    mm++;
+                    if ( mm > cutoff ) break;
+                }
+            }
+
+            return mm;
+        }
+
+        public byte[] getConsensus() { return consensus; }
+
+        public int getPosOnRef() { return hookedAt; }
+
+        public int getConsensusLength() { return consensus.length; }
+
+        public List<Integer> getOffsets() { return seq_offsets; }
+        public int getOffset(int i) { return seq_offsets.get(i); }
+
+        public List<String> getSeqIds() { return Collections.unmodifiableList(seq_ids); }
+
+    /** Adds specified sequence to this assembly according to the provided
+     * alignment information. Will properly update consensus sequence of this assembly
+     * and all associated information (mismatches, base counts etc)
+     * @param seq
+     * @param id
+     * @param a
+     */
+        public void add(final byte[] seq, String id, AlignmentInfo a) {
+
+            if ( ! a.isAligned() ) throw new StingException("Can not add sequence to the assembly: provided alignment is empty");
+
+            seq_ids.add(id);
+
+            int offset = a.getOffset();
+            int oldConsensusLength = consensus.length;
+
+            byte [] seq_to_add = ( a.isNegativeStrand() ? BaseUtils.simpleReverseComplement(seq) : seq);
+
+            seqs.add(seq_to_add);
+
+            int pos_on_seq = 0;
+            int pos_on_cons = 0;
+
+            int leftExtension = 0; // how many bases we added to the consensus on the left
+            int rightExtension = 0; // how many bases we added to the consensus on the right
+
+            if ( offset < 0 ) {
+                // if sequence sticks out to the left of the current consensus:
+
+                leftExtension = -offset;
+                for(int i = 0 ; i < seq_offsets.size() ; i++ ) {
+                    // we are going to extend consensus to the left, so we need to update all current offsets:
+                    seq_offsets.set(i,seq_offsets.get(i)+leftExtension);
+                }
+
+                if ( hookedAt > 0 ) hookedAt -= leftExtension;
+                // extend consensus and associated arrays to the left :
+
+                consensus = Utils.extend(consensus,offset,(byte)0); // remember, offset is negative here, extending to the left
+                coverage = Utils.extend(coverage,offset,(short)1) ;
+                mismatches = Utils.extend(mismatches,offset,(short)0);
+                for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],offset,(short)0);
+
+                for ( int j = 0 ; j < -offset ; j++ ) {
+                    consensus[j] = seq_to_add[j];
+                    int b = BaseUtils.simpleBaseToBaseIndex(seq_to_add[j]);
+                    if ( b != -1 ) base_counts[b][j]=1;
+                }
+
+                pos_on_seq = pos_on_cons = -offset;
+
+                offset = 0;
+            }
+            if ( offset > 0 ) pos_on_cons = offset;
+
+            seq_offsets.add(offset);
+
+            boolean consensus_changed = false;
+
+            for ( ; pos_on_seq < seq_to_add.length && pos_on_cons < consensus.length ; pos_on_seq++, pos_on_cons++ ) {
+                coverage[pos_on_cons]++;
+                final byte base = seq_to_add[pos_on_seq];
+                final int b = BaseUtils.simpleBaseToBaseIndex(base);
+                if ( b != -1 ) {
+                    // if base on seq is not a regular base, there is nothing to do;
+                    // otherwise count mismatches and optionally update consensus if current base tips the balance
+                    base_counts[b][pos_on_cons]++;
+                    int maxcount = 0;
+                    int maxb = -1;
+                    for ( int j = 0 ; j < 4 ; j++ ) {
+                        if ( base_counts[j][pos_on_cons] > maxcount ) {
+                            maxcount = base_counts[j][pos_on_cons];
+                            maxb = j;
+                        }
+                    }
+                    // we are guaranteed here that maxb != -1 since we just added one regular base (the current one)
+                    // few lines above...
+                    byte newbase = BaseUtils.baseIndexToSimpleBase(maxb);
+                    if ( newbase != consensus[pos_on_cons] ) { // need to change the consensus base (will recompute mismatches)
+                        consensus[pos_on_cons] = newbase;
+                        consensus_changed = true;
+                        mismatches[pos_on_cons] = 0;
+                        for ( int i = 0 ; i < 4 ; i++ ) {
+                             if ( i == maxb ) continue;
+                             mismatches[pos_on_cons] += base_counts[i][pos_on_cons];
+                        }
+                    } else { // consensus base did not change; just increment mismatches if current sequence's base differs from consensus
+                        if ( base != consensus[pos_on_cons]) mismatches[pos_on_cons]++;
+                    }
+                }
+
+            }
+
+            // Last step: if sequence sticks out of current consensus on the right, we need to extend the latter:
+
+            if ( pos_on_seq < seq_to_add.length ) {
+                // sequence sticks out of consensus to the right
+                rightExtension = seq_to_add.length - pos_on_seq;
+                consensus = Utils.extend(consensus,rightExtension,(byte)0);
+                coverage = Utils.extend(coverage,rightExtension,(short)1);
+                mismatches = Utils.extend(mismatches,rightExtension,(short)0);
+                for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],rightExtension,(short)0);
+                for ( ; pos_on_seq < seq_to_add.length ; pos_on_seq++, pos_on_cons++ ) {
+                    byte base = seq_to_add[pos_on_seq];
+                    consensus[pos_on_cons] = base;
+                    int b = BaseUtils.simpleBaseToBaseIndex(base);
+                    if ( b != -1 ) base_counts[b][pos_on_cons] = base;
+                }
+            }
+
+            // finally, the new sequence we just added could have mismatches that tip some consensus bases into new values;
+            // let's catch those cases:
+
+
+            for ( int i = 0 ; i < consensus.length ; i++ ) {
+                byte cons_base = consensus[i];
+                int b = BaseUtils.simpleBaseToBaseIndex(cons_base);
+            }
+
+            // there is probably a better way, but for now we just recompute the whole lookup table when consensus
+            // changes somewhere in the middle (if we want to be samrt we need to identify just the kmers that changed
+            // and find/change them in the lookup table).
+            if ( consensus_changed ) {
+                lookup.clear();
+                lookup.index(consensus);
+            } else {
+                if ( leftExtension > 0 || rightExtension > 0 ) lookup.updateIndex(consensus,leftExtension,oldConsensusLength);
+            }
+
+
+        }
+
+        public String toAlignmentString(boolean mismatchesOnly, boolean printNames) {
+
+            int maxNameLength = 0;
+            int spacing=3;
+
+            if ( printNames ) {
+                for ( String n : seq_ids ) if ( n.length() > maxNameLength ) maxNameLength++;
+            }
+
+            StringBuilder b = new StringBuilder();
+            if ( printNames ) b.append(Utils.dupString(' ',maxNameLength+spacing));
+            b.append(new String(consensus));
+            b.append('\n');
+
+            for ( int j = 0; j < seqs.size() ; j++ ) {
+                int offset = seq_offsets.get(j);
+                byte [] seq = seqs.get(j);
+
+                if ( printNames ) {
+                    b.append(seq_ids.get(j));
+                    b.append(Utils.dupString(' ',maxNameLength-seq_ids.get(j).length()+spacing));
+                }
+
+                for ( int i = 0 ; i < offset ; i++ ) b.append(' ');
+
+                for ( int i = 0 ; i < seq.length ; i++ ) {
+
+                    byte base = seq[i];
+                    if ( mismatchesOnly && base == consensus[i+offset] ) {
+                        b.append('.');
+                    } else b.append((char)base);
+                }
+                b.append('\n');
+            }
+            return b.toString();
+        }
+
+
+        public static void testMe(String [] argv ) {
+            byte [] seq1 =      "ACGTTGCGTGGTTCACTGCAGTAACTGACTGATGCA".getBytes();
+            byte [] seq2 =           "GCGTGGTTTACTGCAGTAACTGACTGATGCAACGTGTTTG".getBytes();
+            byte [] seq3 = "GGNTGACGTTGCGTGGTTTACTGCAGTAACTGACT".getBytes();
+            byte [] seq4 =      "NNNTTNCGTGGTTTACTGCAGTAACTGACTGATGCA".getBytes();
+
+            Assembly a = new Assembly(seq1,"1");
+
+            AlignmentList al = a.align(seq2,false);
+            if ( al.isAligned() ) System.out.println("seq 2 aligned");
+            else System.out.println("seq 2 did NOT align");
+
+            if ( al.size() == 1 ) a.add(seq2,"2",al.getAlignments().get(0));
+            else System.out.println("Multiple alignments found for seq 2");
+
+            al = a.align(seq3,false);
+            if ( al.isAligned() ) System.out.println("seq 3 aligned");
+            else System.out.println("seq 3 did NOT align");
+
+            if ( al.size() == 1 ) a.add(seq3,"3",al.getAlignments().get(0));
+            else System.out.println("Multiple alignments found for seq 3");
+
+            al = a.align(seq4,false);
+            if ( al.isAligned() ) System.out.println("seq 4 aligned");
+            else System.out.println("seq 4 did NOT align");
+
+            if ( al.size() == 1 ) a.add(seq4,"4",al.getAlignments().get(0));
+            else System.out.println("Multiple alignments found for seq 4");
+
+            System.out.println(a.toAlignmentString(true, true));
+
+        }
+
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java
@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+import java.util.List;
+import java.util.LinkedList;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Sep 13, 2010
+ * Time: 5:53:33 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class AssemblyGraph {
+
+    private List<Assembly> sources;
+
+    public AssemblyGraph(Assembly a) {
+        sources = new LinkedList<Assembly>();
+        sources.add(a);
+    }
+
+    /** Initializes assembly from the single specified read, and sets this assembly as the root of this
+     * assembly graph
+     * @param r read; must be aligned, otherwise exception will be thrown
+     * @param K index (Kmer) size of the assembly that will be initialized with the read r
+     */
+    public AssemblyGraph(SAMRecord r, int K) {
+        if (AlignmentUtils.isReadUnmapped(r))
+            throw new StingException("Can not initialize assembly graph with unaligned read");
+        sources = new LinkedList<Assembly>();
+        sources.add( new Assembly(K,r.getReadBases(),r.getReadName(), r.getAlignmentStart()) );
+    }
+
+    public void add(SAMRecord r) {
+        
+    }
+
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java
@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Aug 3, 2010
+ * Time: 4:53:01 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class DefaultAlignmentStrategy implements AlignmentStrategy {
+
+    public Action action(AlignmentInfo alignment, AlignmentList currentList) {
+        if ( currentList.size() == 0 ) return Action.REPLACE_BEST;
+
+        if ( alignment.getMismatchCount() > currentList.getNextBestMMCount() ) return Action.DISCARD;
+
+        if ( alignment.getMismatchCount() < currentList.getBestMMCount() ) return Action.REPLACE_BEST;
+        if ( alignment.getMismatchCount() == currentList.getBestMMCount() ) return Action.ADD_BEST;
+        if ( alignment.getMismatchCount() < currentList.getNextBestMMCount() ) return Action.REPLACE_NEXTBEST;
+        if ( alignment.getMismatchCount() == currentList.getNextBestMMCount() ) return Action.ADD_NEXTBEST;
+
+        throw new StingException("Unexpected case found and left unprocessed");
+//        return null;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java
@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import org.broadinstitute.sting.utils.collections.PrimitivePair;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+import org.broadinstitute.sting.utils.BaseUtils;
+
+import java.util.List;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Map;
+
+/**
+ * Created by IntelliJ IDEA.
+* User: asivache
+* Date: Aug 3, 2010
+* Time: 1:31:15 PM
+* To change this template use File | Settings | File Templates.
+*/
+public class KmerIndex {
+   private HashMap<Integer,List<Integer> > lookup;
+   private int K = -1;
+   private int mask = 0;
+   private boolean debug = false;
+
+    /**
+     *  Translates sequence <code>seq</code> into the list of all valid kmers paired
+     * with their offsets on that sequence. Valid kmer is a kmer that contains only ACGT bases; if <code>seq</code>
+     * contains any other symbols, no kmers overlapping with such symbols will be generated. This method
+     * returns a linear (possibly gapped if non-ACGT symbols are present) representation of the sequence as its kmers
+     * with corresponding offsets, NOT a lookup index. If a specific kmer occurs on the sequence N times,
+     * the returned list will have N occurences of this kmer, each paired with one unique location on the sequence.
+     * Kmers themselves are represented as integer kmer ids here, see #idToSeq() if string (ACGT bases) representation
+     * of kmers is needed. Empty list if returned if no valid kmers are found on the sequence (i.e. too many non-ACGT bases)
+     *
+     * @param K key (kmer) length
+     * @param seq sequence to translate into kmer/offset representation
+     * @return list of kmer/offsets
+     */
+    public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte seq[]) {
+       return toKeyOffsetList(K,seq,0,seq.length);
+    }
+
+    /** Same as #toKeyOffsetList(int K, byte [] seq) (see docs), except that this method is not static and
+     * uses key length K associated with the specific instance of the KmerIndex class.
+     * @param seq
+     * @return
+     */
+    public List<PrimitivePair.Int> toKeyOffsetList(byte [] seq) {
+        return toKeyOffsetList(this.K,seq);
+    }
+
+    /** Returns an ordered sequence of overlapping (1-shift in the ideal case) k-mers of length K found in the subsequence of
+     * length <code>length</code> of sequence <code>seq</code> starting at position <code>start</code>. All returned kmers
+     * are fully subsumed by the interval [start, start+length) on the sequence <code>seq</code> (no partial overlaps).
+     * Each kmer is paired with its offset on the (full length) <code>seq</code> in the returned list.
+     * Note that only k-mers on the forward strand are returned. You need to manually rc the string and
+     * call toKeyOffsetList() again to get rc k-mers. If sequence contains any other symbols than ACGT, all k-mers
+     * that would overlap with those symbols will be skipped (not present in the returned list). See also
+     * #toKeyOffsetList(int K, byte [] seq) which translates the whole sequence <code>seq</code>.
+     *
+     * @param K index key (k-mer) length
+     * @param seq sequence to compute k-mers from
+     * @param start compute kmers for subsequence [start, start+length) of <code>seq</code>
+     * @param length compute kmers for subsequence [start, start+length) of <code>seq</code>
+     * @return a list of pairs (kmer,offset_on_the_seq) for each valid kmer (i.e. kmer that does not overlap with
+     * non-ACGT bases); if no valid kmers exist, the returned list will be empty.
+     */
+     public static List<PrimitivePair.Int> toKeyOffsetList(int K, byte[] seq, int start, int length) {
+
+       if ( length < K ) throw new StingException("Can not index sequence that is shorter than key length: total seq="+seq.length+"; start="+start+"; length="+length);
+
+       int mask = 0;
+       if ( length > K ) {
+           for ( int i = 0; i < K ; i++ ) {
+               mask <<= 2;
+               mask |= 0x03;
+           }
+       }
+
+       int key = 0;
+       int i ;
+       final int final_pos = start+length; // first base *after* the last position we want to index
+
+       ArrayList<PrimitivePair.Int> l = new ArrayList<PrimitivePair.Int>(length-K+1);
+
+       PrimitivePair.Int firstK = toFirstKey(K,seq,start,final_pos);
+       if ( firstK == null ) {
+           // ooops, too many non-ACGT bases, we were not able to find a single valid k-mer on the whole sequence!
+           return l;
+       }
+
+       l.add(firstK);
+
+       start = firstK.getSecond();
+       i = start + K; // i points to the first base after the returned kmer firstK
+       key = firstK.getFirst();
+
+       // now let's try recomputing next kmers in an efficient way: we reuse previous kmer and add only the new last base.
+       // This will break if we encounter a non-ACGT base, in which case we will have to start over.
+
+       for ( start++ ; i < final_pos ; i++, start++ ) {
+           int d = BaseUtils.simpleBaseToBaseIndex(seq[i]);
+           if ( d == -1 ) {
+               // ooops, we ran into a bad base; let's jump over it completely and reinitialize the key
+               // (since all kmers overlapping with the current base are invalid)
+               firstK = toFirstKey(K,seq,i+1,final_pos);
+               if ( firstK == null ) break; // no more valid kmers
+               l.add(firstK);
+               start = firstK.getSecond();
+               i = start+K; // points to the base right after the Kmer we just found
+               key = firstK.getFirst(); // reset key to the new kmer we just found
+           } else {
+               // the base is good, so we can compute our new kmer very efficiently using the old one:
+               key <<= 2;
+               key &= mask;
+               key += d;
+               l.add(new PrimitivePair.Int(key,start));
+           }
+       }
+       return l;
+   }
+
+    /** Non-static version of #toKeyOffsetList(int K, byte [] seq, int start, int length) (see docs), which
+     * uses key length K associated with this instance of the KmerIndex object.
+     * @param seq
+     * @param start
+     * @param length
+     * @return
+     */
+   public List<PrimitivePair.Int> toKeyOffsetList(byte[] seq, int start, int length) {
+       return toKeyOffsetList(this.K,seq,start,length);
+   }
+
+
+    /** Computes index (key) of the first valid kmer in the interval [start,stop) of the sequence seq. Kmer is valid
+     * if it contains only valid (ACGT) bases. Returns key and actual offset of first such kmer found, or null
+     * if such kmer does not exist (i.e. if seq does not contain a continuous span of ACGT bases at least K bases long).
+      * @param K
+     * @param seq
+     * @param start
+     * @param stop
+     * @return
+     */
+   private static PrimitivePair.Int toFirstKey(int K, byte[] seq, int start, int stop) {
+       int d = -1;
+       int key = 0 ;
+       while ( d == -1 && start < stop - K + 1) {
+           key = 0;
+           for ( int i = start ; i < start+K; i++ ) {
+               key <<= 2;
+               d = BaseUtils.simpleBaseToBaseIndex(seq[i]);
+               if ( d == -1) {
+                   // ooops, non-ACGT base found, abort and start over. Next kmer that
+                   // have a chance to be valid (contain only ACGT bases) can start only after the current position:
+                   start = i+1;
+                   break;
+               }
+               key += d;
+           }
+       } // got the first key
+
+       if ( d != -1 ) return new PrimitivePair.Int(key,start);
+       else return null;
+   }
+
+    /** Creates an empty kmer index table with specified key length
+     *
+     * @param K
+     */
+   public KmerIndex(final int K) {
+       if ( K > 16 ) throw new StingException("Lookup keys longer than 16 bases are currently not supported");
+       if ( K % 2 == 0 ) throw new StingException("Even keys require additional processing of palindromes, currently not supported. Please use odd key.");
+       this.K = K;
+
+       mask = 0;
+       for ( int i = 0; i < K; i++ ) {
+           mask <<= 2;
+           mask |= 0x03;
+       } // got the first key
+
+       lookup = new HashMap<Integer,List<Integer>>();
+   }
+
+    /** Builds kmer index table with key length <code>K</code> for the sequence <code>seq</code>.
+     *
+     * @param K
+     * @param seq
+     */
+   public KmerIndex(final int K, final byte[] seq) {
+       this(K);
+
+       if ( seq.length < K ) throw new StingException("Sequence is shorter than requested lookup index key length");
+
+       addToIndex(toKeyOffsetList(K,seq,0,seq.length));
+   }
+
+    public void setDebug(boolean d) { this.debug = d; }
+
+    /** Clears current lookup index table completely (but preserves the key length previously set).
+     *
+     */
+   public void clear() { lookup.clear(); }
+
+    /** Builds complete index for the sequence seq. This method can be used only when lookup table is
+     * empty (i.e. use #clear() first), otherwise an exception will be thrown.
+     * @param seq
+     */
+   public void index(final byte[] seq) {
+       if ( ! lookup.isEmpty() ) {
+           throw new StingException("Can not index new sequence: lookup table is already non-empty");
+       }
+       addToIndex(toKeyOffsetList(K,seq,0,seq.length));
+   }
+
+    /**
+     * Updates existing index. It is assumed that the sequence that was already indexed by this KmerIndex object is
+     * the exact subsequence of length <code>old_length</code> of the new sequence <code>seq</code>, starting at
+     * position <code>old_start</code>. No checks are performed, so it is the responsibility of the caller to ensure
+     * that this is indeed the case, otherwise the index will be inconsistent. Since the old sequence is a part
+     * of the new one, this method will keep all the already computed kmers (and update their offsets as needed),
+     * and compute and add kmers/offsets for all the novel bases added to the sequence <code>seq</code> compared
+     * to the old, already indexed subsequnce. If <code>old_length</code> is less than
+     * K (i.e. old sequence could not be and was not indexed at all), the new sequence <code>seq</code> will
+     * be fully indexed from start to end.
+      * @param seq
+     * @param old_start already indexed subsequence starts at this position in <code>seq</code>
+     * @param old_length length of the already indexed subsequence
+     */
+   public void updateIndex(final byte[] seq, final int old_start, final int old_length) {
+
+       if ( old_length < K ) {
+           if ( ! lookup.isEmpty())
+               throw new StingException("It is claimed that old indexed sequence is shorter than K (i.e. it could not be indexed), but index is non empty");
+           addToIndex( toKeyOffsetList(K,seq,0,seq.length));
+           return;
+       }
+
+       if ( old_start > 0 ) {
+           // update positions of previously indexed k-mers:
+           for ( Map.Entry<Integer,List<Integer>> e : lookup.entrySet() ) {
+               List<Integer> l = e.getValue();
+               for ( int i = 0 ; i < l.size(); i++ ) l.set(i,l.get(i)+old_start);
+           }
+           // take care of additional k-mers appearing *before* the already indexed subsequence:
+           // if already indexed subsequence starts at 'start', the first k-mer from that sequence
+           // ends at start+K-1 (inclusive) and it is obviously already indexed. So the last k-mer we want to index now ends at
+           // start+K-2 (inclusive), the length of [0,start+K-2] interval that we need to index is
+           // start+K-1.
+           addToIndex( toKeyOffsetList(K,seq,0,old_start+K-1) );
+       }
+
+       // the last k-mer we already indexed ends at start+length-1 (inclusive); so it starts at start+length-1-(K-1)=start+length-K.
+       // Hence, the first k-mer that is not indexed yet starts at start+length-K+1. The length of the subsequence that
+       // we need to index, [start+length-K+1,seq.length) is seq.length - start - length +K - 1
+
+       int pos = old_start+old_length-K+1;
+       addToIndex( toKeyOffsetList(K,seq,pos,seq.length-pos) );
+
+
+   }
+
+    /** Convenience shortcut: takes the list of keys/offsets and pushes offsets into the lookup index for the keys that
+     * do exist already, or first creates the new entry and then pushes the offset for keys that are novel. This method
+     * is quiet: if <code>keys</code> is <code>null</code> or an empty list, it does nothing.
+     * @param keys
+     */
+   private void addToIndex(final List<PrimitivePair.Int> keys ) {
+       if ( keys == null ) return;
+       for ( PrimitivePair.Int key: keys ) {
+           List<Integer> l = lookup.get(key.getFirst());
+           if ( l == null ) {
+               l = new ArrayList<Integer>();
+               lookup.put(key.getFirst(),l);
+           }
+           l.add(key.getSecond());
+       }
+
+   }
+
+    /**
+     * Converts kmer (integer key) of length K into its sequence representation. Returns a sequence (over ACGT alphabet)
+     * of length K that corresponds to the specified key.
+     * @param K
+     * @param kmer
+     * @return
+     */
+   public static byte [] idToSeq(int K, int kmer) {
+       byte [] seq = new byte[K];
+       for ( int i = K-1; i >=0 ; i-- ) {
+           seq[i] = BaseUtils.baseIndexToSimpleBase(kmer & 0x3);
+           kmer >>= 2;
+       }
+       return seq;
+   }
+
+    /** Returns all offsets for the specified kmer (key) on the sequence indexed by this KmerIndex object. Returns
+     * null if specified kmer is not present on the indexed sequence.
+     * @param key
+     * @return
+     */
+   public List<Integer> getOffsets(int key) { return lookup.get(key); }
+//   public List<Integer> getOffsets(byte[] seq) {
+//       if ( seq.length != K ) throw new StingException("Can not perform direct lookup of a sequence with length different from key size");
+//
+//       return getOffsets( toKey(seq) ) ;
+//   }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java
@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.utils;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.CigarElement;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import org.broadinstitute.sting.utils.exceptions.StingException;
+
+/**
+ * Created by IntelliJ IDEA.
+* User: asivache
+* Date: Aug 6, 2010
+* Time: 6:18:01 PM
+* To change this template use File | Settings | File Templates.
+*/
+public class ReadPair {
+
+    public enum PairType {
+        UNKNOWN,
+        BOTH_UNMAPPED,
+        ONE_UNMAPPED,
+        PROPER,
+        LEFT,
+        RIGHT,
+        OUTER,
+        INTER
+    };
+
+
+    private SAMRecord end1 = null;
+    private SAMRecord end2 = null;
+    private PairType pType = PairType.UNKNOWN;
+    private int leftStart = -1;
+    private int rightStart = -1;
+    private SAMRecord leftRead = null;
+    private SAMRecord rightRead = null;
+
+
+
+    /** Creates an empty read pair object */
+    public ReadPair() {}
+
+    /** Creates a read pair objects initialized with the specified read */
+    public ReadPair(SAMRecord read) {
+        addRead(read);
+    }
+
+    /** Returns name of the paired read (it is assumed that both individual reads in the pair share same name).
+     *
+     * @return
+     */
+    public String getName() { return ( end1 != null ? end1.getReadName() : (end2 != null ? end2.getReadName() : null) ); }
+
+    /** Returns true if both ends are recorded in this read pair object. Note that because SAM records carry 
+     * mate information, a pair can be (partially) initialized from one end. This method verifies that this is not the case
+     * and both records are actually present.
+     * @return
+     */
+    public boolean hasBothEnds() { return end1 != null && end2 != null ; }
+
+    /** Returns true if this pair object was initialized with at least one end. Since SAM records carry mate information,
+     * it is sometimes sufficient to have only one read (fragment end) actually recorded in the pair object, at which
+     * point some useful information can be retrieved for the pair already.
+     * @return
+     */
+    public boolean hasAnyData() { return end1 != null || end2 != null ; }
+
+    /** Returns true if both ends in the pair are mapped. The pair object must be at least partially initialized (i.e.
+     * it has to hold a reference to at least one end of the pair), otherwise an exception will be thrown.
+     * @return
+     */
+    public boolean bothEndsMapped() {
+        if ( pType == PairType.UNKNOWN ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
+
+        if ( pType == PairType.BOTH_UNMAPPED || pType == PairType.ONE_UNMAPPED ) return false;
+        return true;
+    }
+
+    /** Returns true if both ends in the pair are mapped uniquely. This method requires both ends being already registered
+     * in this pair object (i.e. hasBothEnds() is true), otherwise an exception will be thrown.
+     * @return
+     */
+    public boolean bothEndsUniquelyMapped() {
+        if ( ! hasBothEnds() ) throw new StingException("Can not determine if both ends are uniquely mapped until both ends are recorded");
+        return bothEndsMapped() && end1.getMappingQuality() > 0 && end2.getMappingQuality() > 0;
+    }
+
+    /** Returns true if this pair is in proper orientation, i.e. ---> <--- on the same contig */
+    public boolean isProper() { return pType == PairType.PROPER; }
+
+    /* Returns true if this pair is in outer orientation, i.e. <--- ---> on the same chromosome */
+    public boolean isOuter() { return pType == PairType.OUTER; }
+
+    /** Returns left (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
+     * onto the same contig, otherwise an exception will be thrown.
+      * @return
+     */
+    public SAMRecord getLeftRead() {
+        if ( ! bothEndsMapped() || pType == PairType.INTER )
+            throw new StingException("Left read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
+        if ( leftRead == null )
+            throw new StingException("Left read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
+        return leftRead;
+    }
+
+    /** Returns right (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map
+     * onto the same contig, otherwise an exception will be thrown.
+      * @return
+     */
+    public SAMRecord getRightRead() {
+        if ( ! bothEndsMapped() || pType == PairType.INTER )
+            throw new StingException("Right read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName());
+        if ( rightRead == null )
+            throw new StingException("Right read is not recorded. Maybe we have not seen it yet? Pair: "+getName());
+        return rightRead;
+    }
+
+    public SAMRecord getEnd1() { return end1; }
+    public SAMRecord getEnd2() { return end2; }
+
+    public PairType getPairType() { return pType ; }
+
+    public void addRead(SAMRecord r) {
+        if ( ! r.getReadPairedFlag() ) throw new StingException("Read "+r.getReadName() +" is unpaired");
+        if ( r.getFirstOfPairFlag() ) {
+            if ( end1 != null ) throw new StingException("Read "+r.getReadName()+" is first of pair and the pair already has first read recorded");
+            end1 = r;
+            if ( end2 != null && ! end1.getReadName().equals(end2.getReadName()) )
+                    throw new StingException("The pair already has read "+end2.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
+        } else {
+            if ( r.getSecondOfPairFlag() ) {
+                if ( end2 != null ) throw new StingException("Read "+r.getReadName()+" is second of pair and the pair already has second read recorded");
+                end2 = r;
+                if ( end1 != null && ! end1.getReadName().equals(end2.getReadName()) )
+                        throw new StingException("The pair already has read "+end1.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" );
+            } else {
+                throw new StingException("The read "+r.getReadName()+" is marked as paired, but the first/second of pair flag is not set");
+            }
+        }
+        setPairInfo(r);
+    }
+
+    /** If pair type has not been set yet, then sets it to <code>t</code>. Otherwise (pair type already set),
+     *  just checks if the pair type is <code>t</t>. If it is, the method returns quietly; if it is not (inconsistency detected),
+     *  throws an exception. 
+     *
+     */
+    private void setCheckPairType(PairType t) {
+        if ( pType != PairType.UNKNOWN ) {
+            if ( pType != t )
+                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
+        } else pType = t;
+    }
+
+    private void setCheckLeftStart(int pos) {
+        if ( leftStart >= 0  ) {
+            if ( leftStart != pos )
+                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
+        } else leftStart = pos;
+    }
+
+    private void setCheckRightStart(int pos) {
+        if ( rightStart >= 0  ) {
+            if ( rightStart != pos )
+                throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information");
+        } else rightStart = pos;
+    }
+
+    private void setPairInfo(SAMRecord read) {
+
+        setCheckPairType(getPairType(read));
+
+        // there is nothing left to do unless both ends are mapped onto the same contig:
+        if ( pType == PairType.INTER ) return;
+
+        if ( pType == PairType.ONE_UNMAPPED ) {
+            // set putative left or right read depending on the orientation of the only mapped mate
+            if ( ! AlignmentUtils.isReadUnmapped(read ) ) {
+                // we can set left/right read only if it is the current read that is mapped; if we have the
+                // unmapped mate, skip and wait for the mapped read to come!
+                if ( read.getReadNegativeStrandFlag() ) {
+                    setCheckRightStart(read.getAlignmentStart());
+                    if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
+                    rightRead = read;
+                } else {
+                    setCheckLeftStart(read.getAlignmentStart());
+                    if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
+                    leftRead = read;                    
+                }
+            }
+            return;
+        }
+
+        // we are here if both ends are mapped and they map onto the same contig
+        if ( read.getAlignmentStart() < read.getMateAlignmentStart() ) { //left/right = read/mate
+
+            setCheckLeftStart(read.getAlignmentStart());
+            setCheckRightStart(read.getMateAlignmentStart());
+
+            if ( leftRead != null ) throw new StingException("Left read was already set for the pair");
+            leftRead = read;
+        } else {
+            // left/right = mate/read
+
+            setCheckLeftStart(read.getMateAlignmentStart());
+            setCheckRightStart(read.getAlignmentStart());
+
+            if ( rightRead != null ) throw new StingException("Right read was already set for the pair");
+            rightRead = read;
+        }
+    }
+
+    /** Returns pair type that describes this read and its mate. The alignment information for both the read itself
+     * and its mate is taken from the read's sam record passed as the argument, so the mate information is expected to be
+     * correctly set!
+     * @param read
+     * @return
+     */
+    public static PairType getPairType(SAMRecord read) {
+
+        if ( AlignmentUtils.isReadUnmapped(read) ) {
+            if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.BOTH_UNMAPPED;
+            else return PairType.ONE_UNMAPPED;
+        }
+
+        return getWouldBePairType(read,read.getReferenceIndex(),read.getAlignmentStart(),read.getReadNegativeStrandFlag());
+    }
+
+    /** Returns pair type that would describe this read and its mate, if this read mapped onto refId:start in orientation
+     * given by rc (forward is rc=false, reverse is rc=true). The read's alignment information (if any,
+     * unmapped reads are allowed) present in the SAM record is completely ignored by this method,
+     * only mate's information is used.
+     * @param read
+     * @param refId
+     * @param start
+     * @param rc
+     * @return
+     */
+    public static PairType getWouldBePairType(SAMRecord read, int refId, int start, boolean rc) {
+
+
+        if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.ONE_UNMAPPED ;
+
+        // both read and mate are mapped:
+
+        if ( refId != read.getMateReferenceIndex() ) return PairType.INTER;
+
+        // both read and its mate map onto the same chromosome
+
+        if ( start < read.getMateAlignmentStart() ) { //left/right = read/mate
+
+            if ( rc ) {
+                if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
+                else return PairType.OUTER;
+            } else {
+                if ( read.getMateNegativeStrandFlag() ) return PairType.PROPER;
+                else return PairType.RIGHT;
+            }
+        } else {
+             // left/right = mate/read
+
+             if ( rc ) {
+                 if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT;
+                 else return PairType.PROPER;
+             } else {
+                 if ( read.getMateNegativeStrandFlag() ) return PairType.OUTER;
+                 else return PairType.RIGHT;
+             }
+        }
+    }
+
+    public int getLeftStart() {
+        if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
+        return leftStart;
+    }
+
+    public int getRightStart() {
+        if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied");
+        return rightStart;
+    }
+
+    public int getFragmentSize() {
+        if ( ! hasBothEnds() ) throw new StingException("Can not determine fragment size: pair object does not have both ends yet");
+        if ( ! bothEndsMapped() ) throw new StingException("Can not determine fragment size: both ends must be mapped");
+        if ( pType != PairType.PROPER ) throw new StingException("The pais is not in proper orientation, can not determine fragment size");
+
+        return getFragmentSize(leftRead,rightRead);
+    }
+
+    /** Given a read (that must belong to this pair), returns the other end in the pair if it is already
+     * recorded, or null otherwise.
+     * @param read
+     * @return
+     */
+    public SAMRecord getOtherEnd(SAMRecord read) {
+        if ( read.getFirstOfPairFlag() ) return end2;
+        else {
+            if ( read.getSecondOfPairFlag() ) return end1;
+        }
+        return null;
+    }
+
+    public static int getFragmentSize(SAMRecord left, SAMRecord right) {
+
+        if ( left == null || right == null ||
+                AlignmentUtils.isReadUnmapped(left) || AlignmentUtils.isReadUnmapped(right) ) {
+            throw new StingException("No read (null) or unmapped read provided: fragment size is not defined");
+        }
+        if ( left.getReferenceIndex() != right.getReferenceIndex() ) {
+            throw new StingException("Left/right reads map onto different contigs: fragment size is not defined");
+        }
+
+        int fragment_length = left.getReadLength(); // fragment is at least as long as the left read, duh!
+        int leftEnd = left.getAlignmentEnd();
+        int rightStart = right.getAlignmentStart();
+
+        if ( rightStart > leftEnd ) {
+            // if reads are not overlapping, fragment length is lengths of both reads plus the distance (gap) between
+            // the reads. Note that if the sequence between the reads happens to have insirtions or deletions,
+            // our estimation of the actual distance between the reads (on the fragment) is incorrect, but we
+            // can not do better given just those reads. This estimation is, in particular, incorrect
+            // for left reads ending with 'I' and/or right reads starting with 'I'
+            //
+            //    left               right
+            //  -------->...gap...<--------     fragment = left+gap+right
+
+            return left.getReadLength() + right.getReadLength() + (rightStart - leftEnd-1);
+        }
+
+        // if we are here, the reads do overlap; fragment length is lengths of the two reads less the overlap.
+        // in this case we can compute the actual overlap between the reads (on the fragment) taking into
+        // account indels, if any
+        //
+        //      left    ****     right
+        //     ------------>                   ****=overlap; fragment = left+right - overlap
+        //              <--------------
+        //
+        // with deletion:
+        //
+        //      left    **   **     right
+        //     -----------ddd->                   ****=overlap; fragment = left+right - overlap
+        //              <-ddd-------------         note that overlap != leftEnd - rightStart+1
+        //                                         instead, overlap = leftEnd-rightStart+1- length(D)
+        // with insertion:
+        //
+        //     left      *******    right        ******* = overlap; fragment = left+right - overlap
+        //    -------------iii->                   note that overlap != leftEnd - rightStart +1
+        //               <-iii--------------       instead, overlap = leftEnd - rightStart +1 + length(I)
+        //                                         (since 'i' bases are NOT on the ref)
+
+        int posOnRef = rightStart;
+//            int posOnRightRead = 0;
+
+        int overlap = leftEnd - rightStart + 1 ;
+
+        for(CigarElement ce : left.getCigar().getCigarElements() ) {
+            switch(ce.getOperator()) {
+                case S:
+                case H:
+//                      posOnRightRead+=ce.getLength();
+                    break;
+                case I:
+                    overlap += ce.getLength();
+                    break;
+                case D:
+                case N:
+                    overlap -= ce.getLength();
+                case M:
+                    posOnRef += ce.getLength();
+                    break;
+                default:
+            }
+            if ( posOnRef > leftEnd ) break; // we need to examine only overlapping part of the reads
+        }
+        return left.getReadLength() + right.getReadLength() - overlap;
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java
@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.oneoffprojects.walkers;
+
+import org.broadinstitute.sting.gatk.walkers.WalkerName;
+import org.broadinstitute.sting.gatk.walkers.Requires;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
+import org.broadinstitute.sting.oneoffprojects.utils.ReadPair;
+import org.broadinstitute.sting.oneoffprojects.utils.AlignmentInfo;
+import org.broadinstitute.sting.oneoffprojects.utils.Assembly;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+import net.sf.samtools.SAMRecord;
+
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Sep 13, 2010
+ * Time: 12:45:42 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+@WalkerName("DetectWGA")
+@Requires(value={DataSource.REFERENCE_BASES})
+
+public class DetectWGAWalker extends ReadWalker<Integer,Integer> {
+
+    private int TIP_LENGTH = 10;
+    private int TIP_MM_THRESHOLD = 1;
+    private double TIP_AV_QUAL_THRESHOLD = 15.0;
+    
+    private boolean DEBUG = true;
+
+    Map<String, ReadPair> pairCache = null;
+    Map<String,MathUtils.RunningAverage> fragmentSizeMap = null;   // by library
+    private ReferenceDataSource refData;
+    private byte[] refBases;
+
+
+   
+
+    @Override
+    public void initialize() {
+        refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile);
+        pairCache = new HashMap<String, ReadPair>();
+        fragmentSizeMap = new HashMap<String,MathUtils.RunningAverage>();
+    }
+
+
+    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+
+        if ( ! read.getReadPairedFlag() ) return 0; // for now!!
+
+        // read is paired
+
+        cacheReadAsPair(read);
+
+        /*
+        // if the read is already mapped (uniquely), we check if it may have the "colored tip" artifact on either side:
+        if ( AlignmentUtils.isReadUniquelyMapped(read) ) {
+
+            TipInfo tips = countTipMismatches(read,TIP_LENGTH);
+
+            if ( tips.leftMM() >= TIP_MM_THRESHOLD || tips.rightMM() >= TIP_MM_THRESHOLD ) {
+                if ( DEBUG ) {
+                    out.println("  Read "+read.getReadName()+ " has "+tips.leftMM()+"/"+tips.rightMM()+" mismatches in the tips");
+                    out.println("  Pair orientation: "+pair.getPairType());
+                }
+                // try adding read to existing assemblies:
+                AlignmentInfo al = alignToAllAddToBest(read,Math.min(3,tips.leftMM()+tips.rightMM())-1);
+                if ( al == null ) {
+                    if ( tips.leftMM() >= TIP_MM_THRESHOLD && tips.leftQ() >= TIP_AV_QUAL_THRESHOLD ||
+                        tips.rightMM() >= TIP_MM_THRESHOLD && tips.rightQ() >= TIP_AV_QUAL_THRESHOLD ) {
+                        if ( DEBUG ) out.println("   Initialized new assembly.") ;
+                        Assembly a = new Assembly(read.getReadBases(),read.getReadName(),read.getAlignmentStart());
+                        tryAndAddUnmapped(a); // see if we got unmapped reads that would align nicely
+                        assemblies.add(a);
+                    }
+                }
+            }
+            return 1;
+        }
+*/
+
+
+
+        return null;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+    /**
+     * Provide an initial value for reduce computations.
+     *
+     * @return Initial value of reduce.
+     */
+    public Integer reduceInit() {
+        return null;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+    /**
+     * Reduces a single map with the accumulator provided as the ReduceType.
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return accumulator with result of the map taken into account.
+     */
+    public Integer reduce(Integer value, Integer sum) {
+        return null;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+    /** little helper: if we already cached the pair object for this read, just add the read to that object; if we did not - instantiate
+     * new pair objetc first and register it in the map, then add the read; this method also updates other cache(s)/trackers as needed, e.g.
+     * fragment size map
+     */
+    private void cacheReadAsPair(SAMRecord read) {
+        ReadPair pair = pairCache.get( read.getReadName() );
+        if ( pair == null ) {
+            pair = new ReadPair(read);
+            pairCache.put(read.getReadName(),pair);
+        }
+
+        pair.addRead(read);
+
+        // if it's a good pair, add its fragment size to the stats:
+        if ( pair.hasBothEnds() && pair.bothEndsMapped() && pair.isProper() ) {
+            String lib = read.getReadGroup().getLibrary();
+            MathUtils.RunningAverage fSize = fragmentSizeMap.get(lib);
+            if ( fSize == null ) {
+                fSize = new MathUtils.RunningAverage();
+                fragmentSizeMap.put(lib,fSize);
+            }
+            fSize.add(pair.getFragmentSize());
+        }
+
+    }
+
+    private TipInfo countTipMismatches(SAMRecord read, int tip_length) {
+
+            AlignmentUtils.MismatchCount left_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,0,tip_length);
+
+            int right_start = read.getReadLength()-tip_length;
+            AlignmentUtils.MismatchCount right_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,right_start,read.getReadLength()-right_start);
+
+            return new TipInfo(left_mm,right_mm);
+    }
+
+    class TipInfo {
+        AlignmentUtils.MismatchCount left_mm;
+        AlignmentUtils.MismatchCount right_mm;
+        double left_avQ;
+        double right_avQ;
+
+        public TipInfo(AlignmentUtils.MismatchCount l,AlignmentUtils.MismatchCount r) {
+            left_mm = l;
+            right_mm = r;
+            left_avQ = (l.numMismatches ==0 ? 0 : ((double)l.mismatchQualities)/l.numMismatches );
+            right_avQ = (r.numMismatches ==0 ? 0 : ((double)r.mismatchQualities)/r.numMismatches );
+        }
+
+        public int leftMM() { return left_mm.numMismatches; }
+        public int rightMM() { return right_mm.numMismatches; }
+        public double leftQ() { return left_avQ; }
+        public double rightQ() { return right_avQ; }
+    }
+
+}
--- a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java
+++ b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java
@ -363,7 +363,13 @@ public class PairMaker extends CommandLineProgram {
            }
        }
        if ( best.size() == 0 ) return null; // no unique alignment
-        if ( best.size() > 1 ) throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
+        if ( best.size() > 1 ) {
+            for ( SAMRecord r : best ) {
+                System.out.println("READ "+r.getReadName()+" mapQ="+r.getMappingQuality()+" at="+r.getReferenceName()+
+                        ":"+r.getAlignmentStart()+"("+(r.getReadNegativeStrandFlag()?"-":"+")+") cig="+r.getCigarString());
+            }
+            throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq);
+        }

        return best.get(0);
    }
--- a/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java
+++ b/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java
@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.interval;
+
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
+
+import java.util.Iterator;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: asivache
+ * Date: Oct 7, 2010
+ * Time: 2:40:02 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+/** This class provides an adapter to Iterator<GenomeLoc> that returns only (parts of) underlying iterator's
+ * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return
+ * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by
+ * another interval iterator, it should return sorted and NON-overlapping intervals.
+ *
+  */
+public class OverlappingIntervalIterator implements Iterator<GenomeLoc> {
+    PushbackIterator<GenomeLoc> iter = null;
+    PushbackIterator<GenomeLoc> boundBy = null;
+
+    GenomeLoc prefetchedOverlap = null;
+    GenomeLoc currentBound = null;
+    GenomeLoc currentInterval = null;
+
+    /** Creates new overlapping iterator that will internally traverse <code>intervals</code> and return only
+     * overlaps of those with set of intervals returned by <code>boundBy</code>.
+      * @param intervals
+     * @param boundBy
+     */
+    public OverlappingIntervalIterator(Iterator<GenomeLoc> intervals, Iterator<GenomeLoc> boundBy) {
+        this.iter = new PushbackIterator(intervals);
+        this.boundBy = new PushbackIterator(boundBy);
+
+        if ( iter.hasNext() && boundBy.hasNext() ) {
+            GenomeLoc currentInterval = iter.next(); // load first interval
+            GenomeLoc currentBound = boundBy.next(); // load first bounding interval
+            fetchNextOverlap();
+        }
+    }
+
+    /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found
+     * until the end of the either of the two streams, leaves prefetchedOverlap set to null
+     */
+    private void fetchNextOverlap() {
+
+        prefetchedOverlap = null;
+
+        while ( prefetchedOverlap == null ) {
+
+            if ( currentInterval.isBefore(currentBound) ) {
+                if ( ! iter.hasNext() ) break; // no more intervals left; we are done
+                currentInterval = iter.next();
+                continue;
+            }
+
+            if ( currentInterval.isPast(currentBound) ) {
+                if ( ! boundBy.hasNext() ) break; // we are past the last available bounding interval, we are done!
+                currentBound = boundBy.next();
+                continue;
+            }
+
+            // we are at this point only if currentInterval overlaps with currentBound
+
+            prefetchedOverlap = currentInterval.intersect(currentBound);
+
+            // we still do not know if we are done with either current interval or current bound, because
+            // two special situations are possible:
+            //
+            //     1) next interval overlaps with              2) current interval also overlaps with
+            //        the same bounding interval;                  next bounding interval; note that
+            //        note that in this case next                  in this case next bound necessarily
+            //        interval necessarily starts before           starts before the next interval
+            //        the next bound
+            //
+            //   curr. int     next int.                              curr. int
+            //     -----        ------                        --------------------------
+            //      -------------------                     ---------         -------------
+            //           curr. bound                       curr. bound          next bound
+
+            // To solve this issue we update either only currentInterval or only currentBound to their next value,
+            // whichever comes first; the rest of the traversal to the next overlap will be performed on the next invocation of
+            // fetchNextOverlap().
+
+            if ( ! iter.hasNext() ) {
+                
+            }
+            GenomeLoc nextInterval = iter.next();
+            GenomeLoc nextBound = boundBy.next();
+
+            if ( nextInterval.startsBefore(nextBound)) {
+                currentInterval = nextInterval;
+                boundBy.pushback(nextBound); // in case next interval overlaps with the current bound
+            } else {
+                currentBound = nextBound;
+                iter.pushback(nextInterval); // in case current interval also overlaps with the next bound
+            }
+        }
+
+    }
+
+    /**
+     * Returns <tt>true</tt> if the iteration has more elements. (In other
+     * words, returns <tt>true</tt> if <tt>next</tt> would return an element
+     * rather than throwing an exception.)
+     *
+     * @return <tt>true</tt> if the iterator has more elements.
+     */
+    public boolean hasNext() {
+        return false;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+    /**
+     * Returns the next element in the iteration.
+     *
+     * @return the next element in the iteration.
+     * @throws java.util.NoSuchElementException
+     *          iteration has no more elements.
+     */
+    public GenomeLoc next() {
+        return null;  //To change body of implemented methods use File | Settings | File Templates.
+    }
+
+    /**
+     * Removes from the underlying collection the last element returned by the
+     * iterator (optional operation).  This method can be called only once per
+     * call to <tt>next</tt>.  The behavior of an iterator is unspecified if
+     * the underlying collection is modified while the iteration is in
+     * progress in any way other than by calling this method.
+     *
+     * @throws UnsupportedOperationException if the <tt>remove</tt>
+     *                                       operation is not supported by this Iterator.
+     * @throws IllegalStateException         if the <tt>next</tt> method has not
+     *                                       yet been called, or the <tt>remove</tt> method has already
+     *                                       been called after the last call to the <tt>next</tt>
+     *                                       method.
+     */
+    public void remove() {
+        throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator");
+        //To change body of implemented methods use File | Settings | File Templates.
+    }
+}
--- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@ -41,7 +41,7 @@ import java.util.Arrays;

 public class AlignmentUtils {

-    private static class MismatchCount {
+    public static class MismatchCount {
        public int numMismatches = 0;
        public long mismatchQualities = 0;
    }
@ -98,13 +98,13 @@ public class AlignmentUtils {
        return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
     }

-    private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
+    public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
        return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
    }

    // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single
    // todo -- high performance implementation.  We can do a lot better than this right now
-    private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
+    public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
        MismatchCount mc = new MismatchCount();

        int readIdx = 0;
@ -387,6 +387,15 @@ public class AlignmentUtils {
        return true;
    }

+    /** Returns true is read is mapped and mapped uniquely (Q>0).
+     * 
+     * @param read
+     * @return
+     */
+    public static boolean isReadUniquelyMapped(SAMRecord read) {
+        return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0;
+    }
+
    /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from
     * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base
     * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array