Porting SAM pileup ROD to Tribble as a case study.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3356 348d0f76-0448-11de-a6fe-93d51630548a
2010-05-13 00:22:59 +00:00 · 2010-05-13 00:22:59 +00:00 · 3e9ad4bbd0
parent 6839c194cb
commit 3e9ad4bbd0
6 changed files with 543 additions and 927 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
@ -1,693 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.refdata;
-
-
-import java.io.FileNotFoundException;
-import java.util.*;
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-
-import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
-
-import net.sf.picard.reference.ReferenceSequenceFileWalker;
-import net.sf.samtools.util.StringUtil;
-
-/**
- *  This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
- *
- * Example format:<br>
- *   for SNP:<br>
- *      [chr]       [pos] [ref]  [consensus allele(s)]    [consensus confidence]  [snp confidence]   [max mapping qual]  [num reads in the pile] <br>
- *     chrX       466     T                     Y                                   170                                 170                          88                              32 ... (piles of read bases  and quals follow) <br>
- *     <br>
- *   for indel: <br>
- *    [chr]         [pos]     [always *]  [consensus alleles]    [consensus conf.?]        [indel conf.?]     [max mapping qual]     [num reads in the pile]  [indel]   [always *?] [reads haveindel]  [reads may have indel] [other reads?]<br>     
- *     chrX       141444        *           +CA/+CA                               32                                       468            255                                         25                                        +CA     *                         5                                   2                                   12             <br>
- * User: asivache
- * Date: Apr 03, 2009
- * Time: 2:58:33 PM
- * To change this template use File | Settings | File Templates.
- */
-
-
-public class SAMPileupRecord {
-    private static final int NO_VARIANT = -1;
-    private static final int SNP_VARIANT = 0;
-    private static final int INSERTION_VARIANT = 1;
-    private static final int DELETION_VARIANT = 2;
-    private static final int INDEL_VARIANT = 3;
-
-    // allocate once and don't ever bother creating them again:
-    private static final String baseA = new String("A");
-    private static final String baseC = new String("C");
-    private static final String baseG = new String("G");
-    private static final String baseT = new String("T");
-    private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
-
-    protected GenomeLoc loc;       // genomic location of this genotyped site
-    // Reference sequence chromosome or scaffold
-    // Start and stop positions in chrom
-
-    protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
-    protected String refBases;        // the reference base sequence according to NCBI; single base for point mutations, deleted bases for  deletions, empty string for insertions
-    protected String observedString; // stores the actual string representation of observed alleles (for point mutations stores as A/C, not in extended alphabet!!)
-
-    protected String pileupQuals;     // the read base qualities
-    protected String pileupBases;     // the read bases themselves
-
-    protected List<String> observedAlleles = null;    // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion
-    protected int varType = NO_VARIANT;
-    protected int ploidy = 2; // how many allelic variants we have?
-    protected int nNonref = 0; // number of non-reference alleles observed
-    protected int eventLength = 0; // number of inserted or deleted bases
-
-    protected double consensusScore = 0;
-    protected double variantScore = 0;
-
-    private char fldDelim = '\t'; 
-    private String name = emptyStr;
-    // ----------------------------------------------------------------------
-    //
-    // Constructors
-    //
-    // ----------------------------------------------------------------------
-   public SAMPileupRecord(String name) {
-    	this.name = name; // keeps track name
-   }
-
-   public boolean parseLine(final String line) {
-//       0          1             2         3                  4         5            6         7
-//*     chrX     466           T         Y                170      170       88      32 ... (piles of read bases  and quals follow)
-//*    chrX    141444     *     +CA/+CA       32       468     255     25     +CA     *       5       2       12      6
-	   try {
-	
-		   int[] pos = Utils.indexOfAll(line, fldDelim );
-		   String contig = line.substring(0,pos[0] ); // field 0
-		   long start = Long.parseLong(line.substring(pos[0]+1, pos[1])) ; // field 1 
-
-		   refBaseChar = Character.toUpperCase(line.charAt(pos[1]+1)); // field 2 ( single char)
-
-		   observedString = line.substring(pos[2]+1, pos[3]).toUpperCase(); // field 3
-		   observedAlleles = new ArrayList<String>(2);
-
-		   consensusScore = Double.parseDouble(line.substring(pos[3]+1,pos[4]));
-		   variantScore = Double.parseDouble(line.substring(pos[4]+1, pos[5]));
-
-		   if ( refBaseChar == '*' ) {
- 	
-			   parseIndels(observedString) ;
-			   if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.createGenomeLoc(contig, start, start+eventLength-1);
-			   else loc = GenomeLocParser.createGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
-		   } else {
-			   parseBasesAndQuals(line,pos[7]+1,pos[8], pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) );
-//			   parseBasesAndQuals(line.substring(pos[7]+1,pos[8]), line.substring(pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) ) );
-			   // if the variant is a SNP or a reference base (i.e. no variant at all)
-			   if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
-     
-			   refBases = line.substring(pos[1]+1, pos[2]).toUpperCase();
-			   eventLength = 1;
-			   //loc = new GenomeLoc(contig, start, start+1);
-			   loc = GenomeLocParser.createGenomeLoc(contig, start, start);
-
-			   char ch = observedString.charAt(0);
-
-			   switch ( ch ) {
-			   case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
-			   case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
-			   case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
-			   case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
-			   case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
-			   case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
-			   case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
-			   case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
-			   case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
-			   case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
-			   }
-			   if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
-			   else {
-				   // 	we know that at least one allele is non-ref;
-				   // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
-				   // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
-				   varType = SNP_VARIANT;
-				   if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
-						   observedAlleles.get(1).charAt(0) == refBaseChar ||
-						   observedAlleles.get(0) == observedAlleles.get(1)
-				   ) nNonref = 1;
-				   else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
-			   }
-		   }
-	   } catch ( RuntimeException e ) {
-		   System.out.printf("  Exception caught during parsing BasicPileup line: %s%n", line);
-		   throw e;
-	   }
-	   return true;
-	   //      	if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant :  ("+refBaseChar+") -->"+toMediumString());
-   }
-
-   
-    /** Parses a line from SAM pileup file into this SAMPileupRecord object.. 
-     * @param parts line from SAM pileup file, split on delimiter character (tab)
-     */
-    public boolean parseLine(final String[] parts) {
-//    	          0          1             2         3                  4         5            6         7
-//    	*     chrX     466           T         Y                170      170       88      32 ... (piles of read bases  and quals follow)
-//    	 *    chrX    141444     *     +CA/+CA       32       468     255     25     +CA     *       5       2       12      6
-        try {
-        	
-            String contig = parts[0];
-            long start = Long.parseLong(parts[1]) ;
-
-            refBaseChar = Character.toUpperCase(parts[2].charAt(0));
-
-            parts[3] = parts[3].toUpperCase();
-            observedString = parts[3];
-            observedAlleles = new ArrayList<String>(2);
-
-        	consensusScore = Double.parseDouble(parts[4]);
-            variantScore = Double.parseDouble(parts[5]);
-
-            if ( refBaseChar == '*' ) {
-            	
-                parseIndels(parts[3]) ;
-                if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.parseGenomeLoc(contig, start, start+eventLength-1);
-                else loc = GenomeLocParser.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
-            }
-            else {
-                parseBasesAndQuals(parts[8], parts[9]);
-                // if the variant is a SNP or a reference base (i.e. no variant at all)
-                if ( parts[3].length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
-                
-                refBases = parts[2].toUpperCase();
-                eventLength = 1;
-                //loc = GenomeLoc.parseGenomeLoc(contig, start, start+1);
-                loc = GenomeLocParser.parseGenomeLoc(contig, start, start);
-
-                char ch = parts[3].charAt(0);
-
-                switch ( ch ) {
-                    case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
-                    case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
-                    case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
-                    case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
-                    case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
-                    case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
-                    case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
-                    case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
-                    case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
-                    case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
-                }
-                if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
-                else {
-                    // we know that at least one allele is non-ref;
-                    // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
-                    // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
-                    varType = SNP_VARIANT;
-                    if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
-                            observedAlleles.get(1).charAt(0) == refBaseChar ||
-                            observedAlleles.get(0) == observedAlleles.get(1)
-                            ) nNonref = 1;
-                    else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
-                }
-            }
-        } catch ( RuntimeException e ) {
-            System.out.printf("  Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
-            throw e;
-        }
-        return true;
-  //      if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant :  ("+refBaseChar+") -->"+toMediumString());
-    }
-
-
-    private void parseIndels(String genotype) {
-        String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
-
-        // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
-        // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
-        // from what we have recorded for the deletion allele (e.g. "-CAC")
-        boolean hasRefAllele = false;
-
-        for ( int i = 0 ; i < obs.length ; i++ ) {
-            if ( obs[i].length() == 1 && obs[i].charAt(0) == '*'  ) {
-                hasRefAllele = true;
-                observedAlleles.add(emptyStr);
-                continue; 
-            }
-
-            String varBases = obs[i].toUpperCase();
-
-            switch ( obs[i].charAt(0) )  {
-                case '+':
-                    if ( varType != NO_VARIANT && varType != INSERTION_VARIANT )  varType = INDEL_VARIANT;
-                    else varType = INSERTION_VARIANT;
-                    refBases = emptyStr;
-                    break;
-                case '-' :
-                    if ( varType != NO_VARIANT && varType != DELETION_VARIANT )  varType = INDEL_VARIANT;
-                    else varType = DELETION_VARIANT;
-                    refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
-                    break;
-                default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
-            }
-            observedAlleles.add(varBases);
-            eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
-        }
-        if ( hasRefAllele ) {
-            // we got at least one ref. allele (out of two recorded)
-            if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
-                nNonref = 0; // no observations of non-reference allele at all
-                refBases = emptyStr;
-            } else {
-                nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
-            }
-        } else {
-            // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
-            // fail to set data in a consistent way.
-            if ( observedAlleles.get(0).equals(observedAlleles.get(1))  ) nNonref = 1;
-            else nNonref = 2;
-        }
-        // DONE with indels
-
-    }
-
-    private void parseBasesAndQuals(final String bases, final String quals)
-    {
-        //System.out.printf("%s%n%s%n", bases, quals);
-
-        // needs to convert the base string with it's . and , to the ref base
-        StringBuilder baseBuilder = new StringBuilder();
-        StringBuilder qualBuilder = new StringBuilder();
-        boolean done = false;
-        for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
-            //System.out.printf("%d %d%n", i, j);
-            char c = (char)bases.charAt(i);
-
-            switch ( c ) {
-                case '.':   // matches reference
-                case ',':   // matches reference
-                    baseBuilder.append(refBaseChar);
-                    qualBuilder.append((char)quals.charAt(j++));
-                    break;
-                case '$':   // end of read
-                    break;
-                case '*':   // end of indel?
-                    j++;
-                    break;
-                case '^':   // mapping quality
-                    i++;
-                    break;
-                case '+':   // start of indel
-                case '-':   // start of indel
-                    final Pattern regex = Pattern.compile("([0-9]+).*");             // matches case 1
-                    final String rest = bases.substring(i+1);
-                    //System.out.printf("sub is %s%n", rest);
-                    Matcher match = regex.matcher(rest);
-                    if ( ! match.matches() ) {
-                        if ( refBaseChar != '*' )
-                            throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
-                        done = true;
-                    }
-                    else {
-                        String g = match.group(1);
-                        //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
-                        int l = Integer.parseInt(g);
-                        i += l + g.length();    // length of number + that many bases + +/- at the start (included in the next i++)
-                        //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
-                    }
-                    break;
-                default:   // non reference base
-                    baseBuilder.append(c);
-                    qualBuilder.append((char)quals.charAt(j++));
-            }
-        }
-        
-        pileupBases = baseBuilder.toString();
-        pileupQuals = qualBuilder.toString();
-    }
-
-    private void parseBasesAndQuals(final String line, int basesStart, int basesStop, int qualsStart, int qualsStop)
-    {
-        //System.out.printf("%s%n%s%n", bases, quals);
-
-        // needs to convert the base string with it's . and , to the ref base
-        StringBuilder baseBuilder = new StringBuilder();
-        StringBuilder qualBuilder = new StringBuilder();
-        boolean done = false;
-        for ( int i = basesStart, j = qualsStart; i < basesStop && ! done; i++ ) {
-            //System.out.printf("%d %d%n", i, j);
-            char c = (char)line.charAt(i);
-
-            switch ( c ) {
-                case '.':   // matches reference
-                case ',':   // matches reference
-                    baseBuilder.append(refBaseChar);
-                    qualBuilder.append((char)line.charAt(j++));
-                    break;
-                case '$':   // end of read
-                    break;
-                case '*':   // end of indel?
-                    j++;
-                    break;
-                case '^':   // mapping quality
-                    i++;
-                    break;
-                case '+':   // start of indel
-                case '-':   // start of indel
-                    final Pattern regex = Pattern.compile("([0-9]+).*");             // matches case 1
-                    final String rest = line.substring(i+1,basesStop);
-                    //System.out.printf("sub is %s%n", rest);
-                    Matcher match = regex.matcher(rest);
-                    if ( ! match.matches() ) {
-                        if ( refBaseChar != '*' )
-                            throw new RuntimeException("Bad pileup format: " + line.substring(basesStart, basesStop) + " at position " + (i-basesStart));
-                        done = true;
-                    }
-                    else {
-                        String g = match.group(1);
-                        //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
-                        int l = Integer.parseInt(g);
-                        i += l + g.length();    // length of number + that many bases + +/- at the start (included in the next i++)
-                        //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
-                    }
-                    break;
-                default:   // non reference base
-                    baseBuilder.append(c);
-                    qualBuilder.append((char)line.charAt(j++));
-            }
-        }
-        
-        pileupBases = baseBuilder.toString();
-        pileupQuals = qualBuilder.toString();
-    }
-
-    
-    public GenomeLoc getLocation()  { return loc; }
-    public String getQualsAsString()        { return pileupQuals; }
-    
-    /** Returns reference base for point genotypes or '*' for indel genotypes, as a char.
-     * 
-     */
-    public char getRef()            { return refBaseChar; }
-    public int size()               { return pileupQuals.length(); }
-    
-    /** Returns pile of observed bases over the current genomic location.
-     *  
-     */
-    public String getBasesAsString()        { return pileupBases; }
-
-    /** Returns formatted pileup string for the current genomic location as
-     * "location: reference_base observed_base_pile observed_qual_pile"
-     */
-    public String getPileupString()
-    {
-        return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString());
-    }
-
-    public ArrayList<Byte> getBasesAsArrayList()        { throw new StingException("Not implemented"); }
-    public ArrayList<Byte> getQualsAsArrayList()        { throw new StingException("Not implemented"); }
-
-    /**
-     * Gets the bases in byte array form.
-     * @return byte array of the available bases.
-     */
-    public byte[] getBases() {
-        return StringUtil.stringToBytes(getBasesAsString());
-    }
-
-    /**
-     * Gets the Phred base qualities without ASCII offset.
-     * @return Phred base qualities.
-     */
-    public byte[] getQuals() {
-        byte[] quals = StringUtil.stringToBytes(getQualsAsString());
-        for(int i = 0; i < quals.length; i++) quals[i] -= 33;
-        return quals;
-    }
-
-    /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single
-     * character (reference base). For indel genotypes, the string is empty for insertions into
-     * the reference, or consists of deleted bases for deletions.
-     *
-     * @return reference allele, forward strand
-     */
-    public String getFWDRefBases() {
-        return refBases;
-    }
-
-        
-    public List<String> getFWDAlleles()  {
-        return observedAlleles;
-    }
-
-    // ----------------------------------------------------------------------
-    //
-    // What kind of variant are we?
-    //
-    // ----------------------------------------------------------------------
-    public boolean isSNP() { return varType == SNP_VARIANT ; }
-    public boolean isInsertion() { return varType == INSERTION_VARIANT; }
-    public boolean isDeletion() { return varType == DELETION_VARIANT ; }
-    public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; }
-    public boolean isReference()  { return varType == NO_VARIANT; }
-
-    public boolean isHom() {
-    	// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
-    	if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) ); 
-    	return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
-    }
-    
-    public boolean isHet() {
-    	// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
-    	if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) ); 
-    	return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
-    }
-
-    // ----------------------------------------------------------------------
-    //
-    // formatting
-    //
-    // ----------------------------------------------------------------------
-    public String toString() {
-        return String.format("%s\t%d\t%d\t%s\t%s\t%s",
-                getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
-    }
-
-    public String toSimpleString() {
-        return String.format("%s:%s", name, observedString);
-    }
-
-    public String toMediumString() {
-
-        String s = null;
-        if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
-        else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
-        if ( isSNP() ) s += ": SNP";
-        else {
-            if ( isInsertion() ) s += ": Insertion";
-            else {
-                if ( isDeletion() ) s+= ": Deletion";
-                else {
-                    if ( isIndel() ) s+=": Indel";
-                    else s+=": Reference";
-                }
-            }
-        }
-        return s;
-    }
-
-    public String repl() {
-        return String.format("REPL not implemented yet");
-    }
-    
-    
-    public double getVariantConfidence() {
-        return variantScore;
-    }
-
-
-    public boolean isBiallelic() {
-        return nNonref  < 2;
-    }
-
-    public double getConsensusConfidence() {
-        return consensusScore;
-    }
-
-    public int length() {
-        return eventLength;
-    }
-
-	public boolean isIndelGenotype() {
-		return refBaseChar == '*';
-	}
-
-
-	public boolean isPointGenotype() {
-		return ! isIndelGenotype();
-	}
-
-	/** Implements method required by GenotypeList interface. If this object represents
-	 * an indel genotype, then it returns itself through this method. If this object is a
-	 * point genotype, this method returns null.
-	 * @return
-	 */
-	public SAMPileupRecord getIndelGenotype() {
-		if ( isIndelGenotype() ) return this;
-		else return null;
-	}
-
-	/** Implements method required by GenotypeList interface. If this object represents
-	 * a point genotype, then it returns itself through this method. If this object is an
-	 * indel genotype, this method returns null.
-	 * @return
-	 */
-	public SAMPileupRecord getPointGenotype() {
-		if ( isPointGenotype() ) return this;
-		else return null;
-	}
-
-	/** Returns true if this object \em is an indel genotype (and thus
-	 * indel genotype is what it only has).
-	 * @return
-	 */
-	public boolean hasIndelGenotype() {
-		return isIndelGenotype();
-	}
-
-	/** Returns true if this object \em is a point genotype (and thus
-	 * point genotype is what it only has.
-	 * @return
-	 */
-	public boolean hasPointGenotype() {
-		return isPointGenotype();
-	}
-	
-	public int compareTo(ReferenceOrderedDatum o) {
-		return getLocation().compareTo(o.getLocation());
-	}
-
-	protected static class pileupFileIterator implements Iterator<SAMPileupRecord> {
-//		private String fieldDelimiter = new String("\t");
-		private String rodName = null;
-		private XReadLines parser = null;
-		private String lastProcessedLine = null;
-		
-//		private long z = 0;
-//		private long t = 0;
-		
-		pileupFileIterator(String name, java.io.File f) {
-			try {
-				parser = new XReadLines(f);
-			} catch ( FileNotFoundException e ) {
-				Utils.scareUser("Couldn't open file: " + f);
-			}
-			rodName = name;
-		}
-		
-		public boolean hasNext() {
-			return parser.hasNext();
-		}
-
-        /**
-         * @return the next element in the iteration.
-         * @throws NoSuchElementException - iterator has no more elements.
-         */
-		public SAMPileupRecord next() {
-             if (!this.hasNext()) throw new NoSuchElementException("SAMPileupRecord next called on iterator with no more elements");
-//			 if ( z == 0 ) t = System.currentTimeMillis();
-	          lastProcessedLine = parser.next();
-	          SAMPileupRecord n = new SAMPileupRecord(rodName);
-	           //System.out.printf("Line is %s%n", line);
-//	          String parts[] = lastProcessedLine.split(fieldDelimiter);
-//	          n.parseLine(parts);
-
-	          n.parseLine(lastProcessedLine);
-//	          if ( ++z == 1000000 ) { System.out.println(rodName + ": 1,000,000 records read in "+((double)(System.currentTimeMillis()-t)/1000.0) +" s");  z = 0; }
-	          return n ;
-		}
-
-
-		public void remove() {
-			throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
-		}
-		
-	}
-	
-	
-	public static Iterator<SAMPileupRecord> createIterator(String name, java.io.File f) {
-		return new pileupFileIterator(name,f);
-	}
-	
-
-    
-	public static void main(String argv[]) {
-//		String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
-		String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
-		
-		Iterator<SAMPileupRecord> it = createIterator("test-normal", new java.io.File(testFile));
-		
-		ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
-              //new java.io.File(  "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
-              new java.io.File(  "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
-                     );
-
-		if ( reference.getSequenceDictionary() == null ) {
-			System.out.println("No reference sequence dictionary found. Abort.");
-			System.exit(1);
-		}
-
-		GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary());
-		
-		int counter = 0;
-		
-		while ( it.hasNext() && counter < 430 ) {
-			SAMPileupRecord p = it.next();
-			
-			System.out.print(p.getLocation().toString());
-			System.out.print('\t');
-			
-			if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
-			else {
-				if ( p.isSNP() ) { System.out.print("SNP"); }
-				else { 
-					if ( p.isIndel() ) { System.out.print("Indel"); }
-					else { System.out.print("REF"); }
-				}
-			}
-			
-			System.out.print('\t');
-			System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
-			System.out.print('\t');
-			System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
-			counter++;
-		}
-	}
-
-
-
-
-}
--- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import org.broad.tribble.FeatureCodec;
+import org.broad.tribble.Feature;
+import org.broad.tribble.exception.CodecLineParsingException;
+import org.broad.tribble.util.ParsingUtils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType;
+
+/**
+ * A Tribble encoder / decoder for SAM pileup data. 
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class SAMPileupCodec implements FeatureCodec {
+    // the number of tokens we expect to parse from a dbSNP line
+    private static final int expectedTokenCount = 10;
+    private static final char fldDelim = '\t';
+
+    // allocate once and don't ever bother creating them again:
+    private static final String baseA = "A";
+    private static final String baseC = "C";
+    private static final String baseG = "G";
+    private static final String baseT = "T";
+    private static final String emptyStr = ""; // we will use this for "reference" allele in insertions
+
+    /**
+     * Return the # of header lines for this file.
+     *
+     * @param f the file
+     * @return 0 in this case, we assume no header lines.  The DbSNP file may have a
+     *         header line beginning with '#', but we can ignore that in the decode function.
+     */
+    public int headerLineCount(File f) {
+        // we don't require a header line, but it may exist.  We'll deal with that above.
+        return 0;
+    }
+
+    public Feature decode(String line) {
+//       0          1             2         3                  4         5            6         7
+//*     chrX     466           T         Y                170      170       88      32 ... (piles of read bases  and quals follow)
+//*    chrX    141444     *     +CA/+CA       32       468     255     25     +CA     *       5       2       12      6
+        String[] tokens = new String[expectedTokenCount];
+
+        // split the line
+        int count = ParsingUtils.split(line,tokens,fldDelim);
+
+        // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
+        if (count != expectedTokenCount)
+            throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
+                                                "(expected = " + expectedTokenCount + ", saw = " + count + " on " +
+                                                "line = " + line + ")");
+
+        SAMPileupFeature feature = new SAMPileupFeature(tokens[0],
+                                                0,
+                                                0);
+
+        feature.setStart(Integer.parseInt(tokens[1]));
+
+        if(tokens[2].length() != 1)
+            throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
+        feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
+
+        String observedString = tokens[3].toUpperCase(); // field 3
+        feature.setFWDAlleles(new ArrayList<String>(2));
+
+        feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+        feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+
+        if ( feature.getRef() == '*' ) {
+            parseIndels(observedString,feature) ;
+            if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
+            else feature.setEnd(feature.getStart()-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
+        } else {
+            parseBasesAndQuals(feature,tokens[8],tokens[9]);
+            // if the variant is a SNP or a reference base (i.e. no variant at all)
+            if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
+            feature.setRefBases(tokens[2].toUpperCase());
+            feature.setEnd(feature.getStart());
+
+            char ch = observedString.charAt(0);
+
+            switch ( ch ) {
+                case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
+                case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
+                case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
+                case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
+                case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
+                case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
+                case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
+                case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
+                case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
+                case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
+            }
+            if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
+            else {
+                // 	we know that at least one allele is non-ref;
+                // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+                // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+                feature.setVariantType(VariantType.SNP);
+                if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
+                        feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
+                        feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
+                        ) feature.setNumNonRef(1);
+                else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+            }
+        }
+
+        return feature;
+    }
+
+    private void parseIndels(String genotype,SAMPileupFeature feature) {
+        String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
+
+        // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
+        // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
+        // from what we have recorded for the deletion allele (e.g. "-CAC")
+        boolean hasRefAllele = false;
+
+        for ( int i = 0 ; i < obs.length ; i++ ) {
+            if ( obs[i].length() == 1 && obs[i].charAt(0) == '*'  ) {
+                hasRefAllele = true;
+                feature.getFWDAlleles().add(emptyStr);
+                continue;
+            }
+
+            String varBases = obs[i].toUpperCase();
+
+            switch ( obs[i].charAt(0) )  {
+                case '+':
+                    if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL);
+                    else feature.setVariantType(VariantType.INSERTION);
+                    feature.setRefBases(emptyStr);
+                    break;
+                case '-' :
+                    if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL);
+                    else feature.setVariantType(VariantType.DELETION);
+                    feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
+                    break;
+                default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+            }
+            feature.getFWDAlleles().add(varBases);
+            feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
+        }
+        if ( hasRefAllele ) {
+            // we got at least one ref. allele (out of two recorded)
+            if (feature.isReference()) { // both top theories are actually ref allele;
+                feature.setNumNonRef(0); // no observations of non-reference allele at all
+                feature.setRefBases(emptyStr);
+            } else {
+                feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
+            }
+        } else {
+            // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
+            // fail to set data in a consistent way.
+            if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1);
+            else feature.setNumNonRef(2);
+        }
+        // DONE with indels
+
+    }
+
+    private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals)
+    {
+        //System.out.printf("%s%n%s%n", bases, quals);
+
+        // needs to convert the base string with it's . and , to the ref base
+        StringBuilder baseBuilder = new StringBuilder();
+        StringBuilder qualBuilder = new StringBuilder();
+        boolean done = false;
+        for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
+            //System.out.printf("%d %d%n", i, j);
+            char c = (char)bases.charAt(i);
+
+            switch ( c ) {
+                case '.':   // matches reference
+                case ',':   // matches reference
+                    baseBuilder.append(feature.getRef());
+                    qualBuilder.append(quals.charAt(j++));
+                    break;
+                case '$':   // end of read
+                    break;
+                case '*':   // end of indel?
+                    j++;
+                    break;
+                case '^':   // mapping quality
+                    i++;
+                    break;
+                case '+':   // start of indel
+                case '-':   // start of indel
+                    final Pattern regex = Pattern.compile("([0-9]+).*");             // matches case 1
+                    final String rest = bases.substring(i+1);
+                    //System.out.printf("sub is %s%n", rest);
+                    Matcher match = regex.matcher(rest);
+                    if ( ! match.matches() ) {
+                        if ( feature.getRef() != '*' )
+                            throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+                        done = true;
+                    }
+                    else {
+                        String g = match.group(1);
+                        //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
+                        int l = Integer.parseInt(g);
+                        i += l + g.length();    // length of number + that many bases + +/- at the start (included in the next i++)
+                        //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
+                    }
+                    break;
+                default:   // non reference base
+                    baseBuilder.append(c);
+                    qualBuilder.append(quals.charAt(j++));
+            }
+        }
+
+        feature.setPileupBases(baseBuilder.toString());
+        feature.setPileupQuals(qualBuilder.toString());
+    }
+
+}
--- a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import org.broad.tribble.Feature;
+
+import java.util.List;
+
+import net.sf.samtools.util.StringUtil;
+
+/**
+ * A tribble feature representing a SAM pileup.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class SAMPileupFeature implements Feature {
+    public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL }; 
+
+    private String contig;            // genomic location of this genotyped site
+    private int start;
+    private int stop;
+
+    private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
+    private String refBases;        // the reference base sequence according to NCBI; single base for point mutations, deleted bases for  deletions, empty string for insertions
+
+    private String pileupQuals;     // the read base qualities
+    private String pileupBases;     // the read bases themselves
+
+    private List<String> observedAlleles = null;    // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion
+    private VariantType varType = VariantType.NONE;
+    private int nNonref = 0; // number of non-reference alleles observed
+    private int eventLength = 0; // number of inserted or deleted bases    
+
+    private double consensusScore = 0;
+    private double variantScore = 0;
+
+    /**
+     * create the dbSNP feature, given the following information:
+     *
+     * @param contig the contig rsID
+     * @param start  the start position, one based
+     * @param stop   the stop position, one based
+     */
+    SAMPileupFeature(String contig,
+                     int start,
+                     int stop) {
+        this.contig = contig;
+        this.start = start;
+        this.stop = stop;
+    }
+
+    public String getChr() {
+        return contig;
+    }
+
+    protected void setChr(String chr) {
+        this.contig = chr;
+    }
+
+    public int getStart() {
+        return start;
+    }
+
+    protected void setStart(int start) {
+        this.start = start;
+    }
+
+    public int getEnd() {
+        return stop;
+    }
+
+    protected void setEnd(int end) {
+        this.stop = end;
+    }
+
+    public String getQualsAsString()        { return pileupQuals; }
+
+    protected void setPileupQuals(String pileupQuals) {
+        this.pileupQuals = pileupQuals;
+    }
+
+    /** Returns reference base for point genotypes or '*' for indel genotypes, as a char.
+     *
+     */
+    public char getRef()            { return refBaseChar; }
+
+    protected void setRef(char ref) {
+        this.refBaseChar = ref;
+    }
+
+    public int size()               { return pileupQuals.length(); }
+
+    /** Returns pile of observed bases over the current genomic location.
+     *
+     */
+    public String getBasesAsString()        { return pileupBases; }
+
+    protected void setPileupBases(String pileupBases) {
+        this.pileupBases = pileupBases;
+    }
+
+    /** Returns formatted pileup string for the current genomic location as
+     * "location: reference_base observed_base_pile observed_qual_pile"
+     */
+    public String getPileupString()
+    {
+        if(start == stop)
+            return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString());
+        else
+            return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString());
+    }
+
+    /**
+     * Gets the bases in byte array form.
+     * @return byte array of the available bases.
+     */
+    public byte[] getBases() {
+        return StringUtil.stringToBytes(getBasesAsString());
+    }
+
+    /**
+     * Gets the Phred base qualities without ASCII offset.
+     * @return Phred base qualities.
+     */
+    public byte[] getQuals() {
+        byte[] quals = StringUtil.stringToBytes(getQualsAsString());
+        for(int i = 0; i < quals.length; i++) quals[i] -= 33;
+        return quals;
+    }
+
+    /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single
+     * character (reference base). For indel genotypes, the string is empty for insertions into
+     * the reference, or consists of deleted bases for deletions.
+     *
+     * @return reference allele, forward strand
+     */
+    public String getFWDRefBases() {
+        return refBases;
+    }
+
+    protected void setRefBases(String refBases) {
+        this.refBases = refBases;
+    }
+
+    public List<String> getFWDAlleles()  {
+        return observedAlleles;
+    }
+
+    protected void setFWDAlleles(List<String> alleles) {
+        this.observedAlleles = alleles;
+    }
+
+    // ----------------------------------------------------------------------
+    //
+    // What kind of variant are we?
+    //
+    // ----------------------------------------------------------------------
+    public boolean isSNP() { return varType == VariantType.SNP; }
+    public boolean isInsertion() { return varType == VariantType.INSERTION; }
+    public boolean isDeletion() { return varType == VariantType.DELETION ; }
+    public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; }
+    public boolean isReference()  { return varType == VariantType.NONE; }
+
+    protected void setVariantType(VariantType variantType) {
+        this.varType = variantType;
+    }
+
+    public boolean isHom() {
+    	// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+    	if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) );
+    	return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
+    }
+
+    public boolean isHet() {
+    	// implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+    	if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) );
+    	return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
+    }
+
+    public double getVariantConfidence() {
+        return variantScore;
+    }
+
+    protected void setVariantConfidence(double variantScore) {
+        this.variantScore = variantScore;
+    }
+
+    public boolean isBiallelic() {
+        return nNonref  < 2;
+    }
+
+    protected void setNumNonRef(int nNonref) {
+        this.nNonref = nNonref;
+    }
+
+    public double getConsensusConfidence() {
+        return consensusScore;
+    }
+
+    protected void setConsensusConfidence(double consensusScore) {
+        this.consensusScore = consensusScore;
+    }
+
+    public int length() {
+        return eventLength;
+    }
+
+    protected void setLength(int eventLength) {
+        this.eventLength = eventLength;
+    }
+
+	public boolean isIndelGenotype() {
+		return refBaseChar == '*';
+	}
+
+
+	public boolean isPointGenotype() {
+		return ! isIndelGenotype();
+	}
+
+	/** Implements method required by GenotypeList interface. If this object represents
+	 * an indel genotype, then it returns itself through this method. If this object is a
+	 * point genotype, this method returns null.
+	 * @return
+	 */
+	public SAMPileupFeature getIndelGenotype() {
+		if ( isIndelGenotype() ) return this;
+		else return null;
+	}
+
+	/** Implements method required by GenotypeList interface. If this object represents
+	 * a point genotype, then it returns itself through this method. If this object is an
+	 * indel genotype, this method returns null.
+	 * @return
+	 */
+	public SAMPileupFeature getPointGenotype() {
+		if ( isPointGenotype() ) return this;
+		else return null;
+	}
+
+	/** Returns true if this object \em is an indel genotype (and thus
+	 * indel genotype is what it only has).
+	 * @return
+	 */
+	public boolean hasIndelGenotype() {
+		return isIndelGenotype();
+	}
+
+	/** Returns true if this object \em is a point genotype (and thus
+	 * point genotype is what it only has.
+	 * @return
+	 */
+	public boolean hasPointGenotype() {
+		return isPointGenotype();
+	}
+
+
+
+}
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
@ -1,225 +0,0 @@
-package org.broadinstitute.sting.gatk.refdata;
-
-
-import java.io.IOException;
-import java.util.*;
-
-import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-import net.sf.picard.reference.ReferenceSequenceFileWalker;
-
-/**
- *  This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
- *
- * Example format:<br>
- *   for SNP:<br>
- *      [chr]       [pos] [ref]  [consensus allele(s)]    [consensus confidence]  [snp confidence]   [max mapping qual]  [num reads in the pile] <br>
- *     chrX       466     T                     Y                                   170                                 170                          88                              32 ... (piles of read bases  and quals follow) <br>
- *     <br>
- *   for indel: <br>
- *    [chr]         [pos]     [always *]  [consensus alleles]    [consensus conf.?]        [indel conf.?]     [max mapping qual]     [num reads in the pile]  [indel]   [always *?] [reads haveindel]  [reads may have indel] [other reads?]<br>     
- *     chrX       141444        *           +CA/+CA                               32                                       468            255                                         25                                        +CA     *                         5                                   2                                   12             <br>
- * User: asivache
- * Date: Apr 03, 2009
- * Time: 2:58:33 PM
- * To change this template use File | Settings | File Templates.
- */
-
-
-
-public class rodSAMPileup extends BasicReferenceOrderedDatum {
-	// ----------------------------------------------------------------------
-	//
-	// Constructors
-	//
-	// ----------------------------------------------------------------------
-	
-	private SAMPileupRecord indelGenotype = null;
-	private SAMPileupRecord pointGenotype = null;
-	
-	public rodSAMPileup(final String name) {
-		super(name);
-	}
-	
-	@Override
-	public GenomeLoc getLocation() {
-		if ( pointGenotype != null ) return pointGenotype.getLocation();
-		if ( indelGenotype != null ) return indelGenotype.getLocation();
-		return null;
-	}
-
-	/** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator,
-	 * so this method does nothing at all (always returns false).
-	 *  
-	 */
-	@Override
-	public boolean parseLine(Object header, String[] parts) throws IOException {
-		return false;
-	}
-
-	@Override
-	public String toString() {
-		StringBuilder b = new StringBuilder();
-		if ( pointGenotype != null ) {
-			b.append(pointGenotype.toString());
-			if ( indelGenotype != null ) b.append('\n');
-		}
-		if ( indelGenotype != null ) b.append(indelGenotype.toString());
-		return b.toString();
-	}
-
-	public SAMPileupRecord getIndelGenotype() {
-		return indelGenotype;
-	}
-
-	public SAMPileupRecord getPointGenotype() {
-		return pointGenotype;
-	}
-
-	public boolean hasIndelGenotype() {
-		return indelGenotype != null;
-	}
-
-	public boolean hasPointGenotype() {
-		return pointGenotype != null;
-	}
-	
-	/** Iterates over SAM pileup multiline records: each new invocation of next() will 
-	 * move to next genomic position in the pileup file, and if the record for that position 
-	 * consists of multiple lines (as is the case for indels), they all will be read and processed. 
-	 * @author asivache
-	 *
-	 */
-	private static class rodSAMPileupIterator implements Iterator<rodSAMPileup> {
-		//private xReadLines parser = null;
-		private String rodName = null;
-		private PushbackIterator<SAMPileupRecord> parser = null;
-		private long line = 0;
-		
-		rodSAMPileupIterator(String name, java.io.File f) {
-			parser = new PushbackIterator<SAMPileupRecord>(SAMPileupRecord.createIterator(name,f));
-			rodName = name;
-		}
-		
-		public boolean hasNext() {
-			return parser.hasNext();
-		}
-
-        /**
-         * @return the next element in the iteration.
-         * @throws NoSuchElementException - iterator has no more elements.
-         */
-		public rodSAMPileup next() {
-            if (!this.hasNext()) throw new NoSuchElementException("rodSAMPileup next called on iterator with no more elements");
-			rodSAMPileup result = new rodSAMPileup(rodName);
-			
-			SAMPileupRecord r = parser.next();
-			
-	//		if ( (++line)%10000 == 0 ) System.out.printf("%s: record is %d (%s)%n", rodName, line,r.getLocation().toString());
-
-			if ( r.isPointGenotype() ) result.pointGenotype = r;
-			else result.indelGenotype = r;
-			
-			if ( parser.hasNext() ) {
-
-				SAMPileupRecord r2 = parser.next();
-				int cmp = r.getLocation().compareTo(r2.getLocation());
-				switch ( cmp ) {
-				case -1 : // next record is at greater position
-					parser.pushback(r2); // return next record back to the stream
-					break;
-				case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel
-					if ( result.indelGenotype != null ) { // oops, we've seen indel already
-						if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() );
-						else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() );
-					} else {
-						// ok, the first genotype we've seen was a point one
-						if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() );
-						// and the second is an indel, wheww...
-						result.indelGenotype = r2;
-					}
-					
-					// the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the
-					// the same genomic location (point and indel variants), there can not be any more:
-					if ( parser.hasNext() ) {
-						r2 = parser.next();
-						cmp = r.getLocation().compareTo(r2.getLocation() ) ;
-						if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation());
-						if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " 
-								+ r.getLocation() ); 
-						parser.pushback(r2);
-					}
-					
-					break;
-				case 1: // next record is at lower position?? come'on!
-					throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() );
-				default:
-					throw new StingException("INTERNAL ERROR: this point should never be reached");
-				}
-			}
-			return result ;
-		}
-
-
-		public void remove() {
-			throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
-		}
-		
-	}
-	
-	public static Iterator<rodSAMPileup> createIterator(String name, java.io.File file) {
-		return new rodSAMPileup.rodSAMPileupIterator(name,file);
-	}
-
-	public static void main(String argv[]) {
-		String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
-//		String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
-		
-		Iterator<rodSAMPileup> it = createIterator("test-normal", new java.io.File(testFile));
-		
-		ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
-              new java.io.File(  "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
-//              new java.io.File(  "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
-                     );
-
-		if ( reference.getSequenceDictionary() == null ) {
-			System.out.println("No reference sequence dictionary found. Abort.");
-			System.exit(1);
-		}
-
-		GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary());
-		
-		int counter = 0;
-		
-		while ( it.hasNext() && counter < 430 ) {
-			rodSAMPileup p = it.next();
-			System.out.println(p.toString());
-/*			
-			System.out.print(p.getLocation().toString());
-			System.out.print('\t');
-			
-			if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
-			else {
-				if ( p.isSNP() ) { System.out.print("SNP"); }
-				else { 
-					if ( p.isIndel() ) { System.out.print("Indel"); }
-					else { System.out.print("REF"); }
-				}
-			}
-			
-			System.out.print('\t');
-			System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
-			System.out.print('\t');
-			System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
-			*/
-			counter++;
-		}
-	}
-
-
-}
-
-
--- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java
@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder {

    static {
        // All known ROD types
-        Types.put("SAMPileup", rodSAMPileup.class);
        Types.put("GELI", rodGELI.class);
        Types.put("RefSeq", rodRefSeq.class);
        Types.put("Table", TabularROD.class);
--- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
@ -28,11 +28,12 @@ package org.broadinstitute.sting.gatk.walkers.qc;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.SAMPileupRecord;
-import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
+import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature;
 import org.broadinstitute.sting.gatk.walkers.*;
 import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;

@ -43,14 +44,14 @@ import java.util.Arrays;
 * each overlapping read, and quality score) to the reference pileup data generated by samtools.  Samtools' pileup data
 * should be specified using the command-line argument '-B pileup,SAMPileup,<your sam pileup file>'.
 */
-@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=rodSAMPileup.class))
+@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=SAMPileupFeature.class))
 public class ValidatingPileupWalker extends LocusWalker<Integer, ValidationStats>  implements TreeReducible<ValidationStats> {
    @Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
    public boolean CONTINUE_AFTER_AN_ERROR = false;

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        ReadBackedPileup pileup = context.getBasePileup();
-        SAMPileupRecord truePileup = getTruePileup( tracker );
+        SAMPileupFeature truePileup = getTruePileup( tracker );

        if ( truePileup == null ) {
            out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBase()));
@ -82,11 +83,12 @@ public class ValidatingPileupWalker extends LocusWalker<Integer, ValidationStats
            return x;
    }

-    public static String pileupDiff(final ReadBackedPileup a, final SAMPileupRecord b, boolean orderDependent)
+    public static String pileupDiff(final ReadBackedPileup a, final SAMPileupFeature b, boolean orderDependent)
    {
        if ( a.size() != b.size() )
            return "Sizes not equal";
-        if ( a.getLocation().compareTo(b.getLocation()) != 0 )
+        GenomeLoc featureLocation = GenomeLocParser.createGenomeLoc(b.getChr(),b.getStart(),b.getEnd());
+        if ( a.getLocation().compareTo(featureLocation) != 0 )
            return "Locations not equal";

        String aBases = maybeSorted(new String(a.getBases()), ! orderDependent );
@ -123,8 +125,8 @@ public class ValidatingPileupWalker extends LocusWalker<Integer, ValidationStats
     * @param tracker ROD tracker from which to extract pileup data.
     * @return True pileup data.
     */
-    private SAMPileupRecord getTruePileup( RefMetaDataTracker tracker ) {
-        rodSAMPileup pileup = tracker.lookup("pileup",rodSAMPileup.class);
+    private SAMPileupFeature getTruePileup( RefMetaDataTracker tracker ) {
+        SAMPileupFeature pileup = tracker.lookup("pileup",SAMPileupFeature.class);

        if( pileup == null)
            return null;