diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
new file mode 100644
index 000000000..737649c34
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
@@ -0,0 +1,526 @@
+package org.broadinstitute.sting.gatk.refdata;
+
+
+import java.io.FileNotFoundException;
+import java.util.*;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
+import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.Pileup;
+import org.broadinstitute.sting.utils.xReadLines;
+
+import edu.mit.broad.picard.reference.ReferenceSequenceFile;
+import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
+
+/**
+ * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
+ *
+ * Example format:
+ * for SNP:
+ * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
+ * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
+ *
+ * for indel:
+ * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
+ * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
+ * User: asivache
+ * Date: Apr 03, 2009
+ * Time: 2:58:33 PM
+ * To change this template use File | Settings | File Templates.
+ */
+
+
+class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
+ private static final int NO_VARIANT = -1;
+ private static final int SNP_VARIANT = 0;
+ private static final int INSERTION_VARIANT = 1;
+ private static final int DELETION_VARIANT = 2;
+ private static final int INDEL_VARIANT = 3;
+
+ // allocate once and don't ever bother creating them again:
+ private static final String baseA = new String("A");
+ private static final String baseC = new String("C");
+ private static final String baseG = new String("G");
+ private static final String baseT = new String("T");
+ private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
+
+ protected GenomeLoc loc; // genomic location of this genotyped site
+ // Reference sequence chromosome or scaffold
+ // Start and stop positions in chrom
+
+ protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
+ protected String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions
+ protected String observedString; // stores the actual string representation of observed alleles (for point mutations stores as A/C, not in extended alphabet!!)
+
+ protected String pileupQuals; // the read base qualities
+ protected String pileupBases; // the read bases themselves
+
+ protected List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion
+ protected int varType = NO_VARIANT;
+ protected int ploidy = 2; // how many allelic variants we have?
+ protected int nNonref = 0; // number of non-reference alleles observed
+ protected int eventLength = 0; // number of inserted or deleted bases
+
+ protected double consensusScore = 0;
+ protected double variantScore = 0;
+
+ private String name = emptyStr;
+ // ----------------------------------------------------------------------
+ //
+ // Constructors
+ //
+ // ----------------------------------------------------------------------
+ public SAMPileupRecord(String name) {
+ this.name = name; // keeps track name
+ }
+
+ /** Parses a line from SAM pileup file into this SAMPileupRecord object..
+ * @param parts line from SAM pileup file, split on delimiter character (tab)
+ */
+ public boolean parseLine(final String[] parts) {
+// 0 1 2 3 4 5 6 7
+// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
+// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
+ try {
+
+ String contig = parts[0];
+ long start = Long.parseLong(parts[1]) ;
+
+ refBaseChar = Character.toUpperCase(parts[2].charAt(0));
+
+ parts[3] = parts[3].toUpperCase();
+ observedString = parts[3];
+ observedAlleles = new ArrayList(2);
+
+ consensusScore = Double.parseDouble(parts[4]);
+ variantScore = Double.parseDouble(parts[5]);
+
+ if ( refBaseChar == '*' ) {
+
+ parseIndels(parts[3]) ;
+ if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
+ else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
+ }
+ else {
+ parseBasesAndQuals(parts[8], parts[9]);
+ // if the variant is a SNP or a reference base (i.e. no variant at all)
+ if ( parts[3].length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
+
+ refBases = parts[2].toUpperCase();
+ eventLength = 1;
+ //loc = new GenomeLoc(contig, start, start+1);
+ loc = new GenomeLoc(contig, start, start);
+
+ char ch = parts[3].charAt(0);
+
+ switch ( ch ) {
+ case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
+ case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
+ case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
+ case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
+ case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
+ case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
+ case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
+ case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
+ case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
+ case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
+ }
+ if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
+ else {
+ // we know that at least one allele is non-ref;
+ // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+ // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+ varType = SNP_VARIANT;
+ if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
+ observedAlleles.get(1).charAt(0) == refBaseChar ||
+ observedAlleles.get(0) == observedAlleles.get(1)
+ ) nNonref = 1;
+ else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+ }
+ }
+ } catch ( RuntimeException e ) {
+ System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
+ throw e;
+ }
+ return true;
+ // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
+ }
+
+
+ private void parseIndels(String genotype) {
+ String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
+
+ // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
+ // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
+ // from what we have recorded for the deletion allele (e.g. "-CAC")
+ boolean hasRefAllele = false;
+
+ for ( int i = 0 ; i < obs.length ; i++ ) {
+ if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
+ hasRefAllele = true;
+ observedAlleles.add(emptyStr);
+ continue;
+ }
+
+ String varBases = obs[i].toUpperCase();
+
+ switch ( obs[i].charAt(0) ) {
+ case '+':
+ if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
+ else varType = INSERTION_VARIANT;
+ refBases = emptyStr;
+ break;
+ case '-' :
+ if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
+ else varType = DELETION_VARIANT;
+ refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
+ break;
+ default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+ }
+ observedAlleles.add(varBases);
+ eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
+ }
+ if ( hasRefAllele ) {
+ // we got at least one ref. allele (out of two recorded)
+ if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
+ nNonref = 0; // no observations of non-reference allele at all
+ refBases = emptyStr;
+ } else {
+ nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
+ }
+ } else {
+ // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
+ // fail to set data in a consistent way.
+ if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1;
+ else nNonref = 2;
+ }
+ // DONE with indels
+
+ }
+
+ private void parseBasesAndQuals(final String bases, final String quals)
+ {
+ //System.out.printf("%s%n%s%n", bases, quals);
+
+ // needs to convert the base string with it's . and , to the ref base
+ StringBuilder baseBuilder = new StringBuilder();
+ StringBuilder qualBuilder = new StringBuilder();
+ boolean done = false;
+ for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
+ //System.out.printf("%d %d%n", i, j);
+ char c = (char)bases.charAt(i);
+
+ switch ( c ) {
+ case '.': // matches reference
+ case ',': // matches reference
+ baseBuilder.append(refBaseChar);
+ qualBuilder.append((char)quals.charAt(j++));
+ break;
+ case '$': // end of read
+ break;
+ case '*': // end of indel?
+ j++;
+ break;
+ case '^': // mapping quality
+ i++;
+ break;
+ case '+': // start of indel
+ case '-': // start of indel
+ final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
+ final String rest = bases.substring(i+1);
+ //System.out.printf("sub is %s%n", rest);
+ Matcher match = regex.matcher(rest);
+ if ( ! match.matches() ) {
+ if ( refBaseChar != '*' )
+ throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+ done = true;
+ }
+ else {
+ String g = match.group(1);
+ //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
+ int l = Integer.parseInt(g);
+ i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
+ //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
+ }
+ break;
+ default: // non reference base
+ baseBuilder.append(c);
+ qualBuilder.append((char)quals.charAt(j++));
+ }
+ }
+
+ pileupBases = baseBuilder.toString();
+ pileupQuals = qualBuilder.toString();
+ }
+
+ public GenomeLoc getLocation() { return loc; }
+ public String getQuals() { return pileupQuals; }
+
+ /** Returns reference base for point genotypes or '*' for indel genotypes, as a char.
+ *
+ */
+ public char getRef() { return refBaseChar; }
+ public int size() { return pileupQuals.length(); }
+
+ /** Returns pile of observed bases over the current genomic location.
+ *
+ */
+ public String getBases() { return pileupBases; }
+
+ /** Returns formatted pileup string for the current genomic location as
+ * "location: reference_base observed_base_pile observed_qual_pile"
+ */
+ public String getPileupString()
+ {
+ return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
+ }
+
+
+ /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single
+ * character (reference base). For indel genotypes, the string is empty for insertions into
+ * the reference, or consists of deleted bases for deletions.
+ *
+ * @return reference allele, forward strand
+ */
+ @Override
+ public String getFWDRefBases() {
+ return refBases;
+ }
+
+
+ @Override
+ public List getFWDAlleles() {
+ return observedAlleles;
+ }
+
+ // ----------------------------------------------------------------------
+ //
+ // What kind of variant are we?
+ //
+ // ----------------------------------------------------------------------
+ public boolean isSNP() { return varType == SNP_VARIANT ; }
+ public boolean isInsertion() { return varType == INSERTION_VARIANT; }
+ public boolean isDeletion() { return varType == DELETION_VARIANT ; }
+ public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; }
+ public boolean isReference() { return varType == NO_VARIANT; }
+
+ public boolean isHom() {
+ // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+ if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) );
+ return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
+ }
+
+ public boolean isHet() {
+ // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+ if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) );
+ return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
+ }
+
+ // ----------------------------------------------------------------------
+ //
+ // formatting
+ //
+ // ----------------------------------------------------------------------
+ public String toString() {
+ return String.format("%s\t%d\t%d\t%s\t%s\t%s",
+ getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
+ }
+
+ public String toSimpleString() {
+ return String.format("%s:%s", name, observedString);
+ }
+
+ public String toMediumString() {
+
+ String s = null;
+ if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
+ else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
+ if ( isSNP() ) s += ": SNP";
+ else {
+ if ( isInsertion() ) s += ": Insertion";
+ else {
+ if ( isDeletion() ) s+= ": Deletion";
+ else {
+ if ( isIndel() ) s+=": Indel";
+ else s+=": Reference";
+ }
+ }
+ }
+ return s;
+ }
+
+ public String repl() {
+ return String.format("REPL not implemented yet");
+ }
+
+
+ @Override
+ public double getVariantConfidence() {
+ return variantScore;
+ }
+
+
+ @Override
+ public boolean isBiallelic() {
+ return nNonref < 2;
+ }
+
+ @Override
+ public double getConsensusConfidence() {
+ return consensusScore;
+ }
+
+ @Override
+ public boolean isIndelGenotype() {
+ return refBaseChar == '*';
+ }
+
+
+ @Override
+ public boolean isPointGenotype() {
+ return ! isIndelGenotype();
+ }
+
+ /** Implements method required by GenotypeList interface. If this object represents
+ * an indel genotype, then it returns itself through this method. If this object is a
+ * point genotype, this method returns null.
+ * @return
+ */
+ @Override
+ public Genotype getIndelGenotype() {
+ if ( isIndelGenotype() ) return this;
+ else return null;
+ }
+
+ /** Implements method required by GenotypeList interface. If this object represents
+ * a point genotype, then it returns itself through this method. If this object is an
+ * indel genotype, this method returns null.
+ * @return
+ */
+ @Override
+ public Genotype getPointGenotype() {
+ if ( isPointGenotype() ) return this;
+ else return null;
+ }
+
+ /** Returns true if this object \em is an indel genotype (and thus
+ * indel genotype is what it only has).
+ * @return
+ */
+ @Override
+ public boolean hasIndelGenotype() {
+ return isIndelGenotype();
+ }
+
+ /** Returns true if this object \em is a point genotype (and thus
+ * point genotype is what it only has.
+ * @return
+ */
+ @Override
+ public boolean hasPointGenotype() {
+ return isPointGenotype();
+ }
+
+
+ @Override
+ public int compareTo(ReferenceOrderedDatum o) {
+ return getLocation().compareTo(o.getLocation());
+ }
+
+ protected static class pileupFileIterator implements Iterator {
+ private String fieldDelimiter = new String("\t");
+ private String rodName = null;
+ private xReadLines parser = null;
+ private String lastProcessedLine = null;
+
+ pileupFileIterator(String name, java.io.File f) {
+ try {
+ parser = new xReadLines(f);
+ } catch ( FileNotFoundException e ) {
+ Utils.scareUser("Couldn't open file: " + f);
+ }
+ rodName = name;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return parser.hasNext();
+ }
+
+ @Override
+ public SAMPileupRecord next() {
+ lastProcessedLine = parser.next();
+ //System.out.printf("Line is %s%n", line);
+ String parts[] = lastProcessedLine.split(fieldDelimiter);
+ SAMPileupRecord n = new SAMPileupRecord(rodName);
+ n.parseLine(parts);
+
+ return n ;
+ }
+
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
+ }
+
+ }
+
+
+ public static Iterator createIterator(String name, java.io.File f) {
+ return new pileupFileIterator(name,f);
+ }
+
+
+
+ public static void main(String argv[]) {
+// String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
+ String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
+
+ Iterator it = createIterator("test-normal", new java.io.File(testFile));
+
+ ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
+ //new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
+ new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
+ );
+
+ if ( reference.getSequenceDictionary() == null ) {
+ System.out.println("No reference sequence dictionary found. Abort.");
+ System.exit(1);
+ }
+
+ GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary());
+
+ int counter = 0;
+
+ while ( it.hasNext() && counter < 430 ) {
+ SAMPileupRecord p = it.next();
+
+ System.out.print(p.getLocation().toString());
+ System.out.print('\t');
+
+ if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
+ else {
+ if ( p.isSNP() ) { System.out.print("SNP"); }
+ else {
+ if ( p.isIndel() ) { System.out.print("Indel"); }
+ else { System.out.print("REF"); }
+ }
+ }
+
+ System.out.print('\t');
+ System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
+ System.out.print('\t');
+ System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
+ counter++;
+ }
+ }
+
+
+
+
+}