diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
deleted file mode 100644
index 1b90b55bf..000000000
--- a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.refdata;
-
-
-import java.io.FileNotFoundException;
-import java.util.*;
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-
-import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.text.XReadLines;
-
-import net.sf.picard.reference.ReferenceSequenceFileWalker;
-import net.sf.samtools.util.StringUtil;
-
-/**
- * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
- *
- * Example format:
- * for SNP:
- * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
- * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
- *
- * for indel:
- * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
- * User: asivache
- * Date: Apr 03, 2009
- * Time: 2:58:33 PM
- * To change this template use File | Settings | File Templates.
- */
-
-
-public class SAMPileupRecord {
- private static final int NO_VARIANT = -1;
- private static final int SNP_VARIANT = 0;
- private static final int INSERTION_VARIANT = 1;
- private static final int DELETION_VARIANT = 2;
- private static final int INDEL_VARIANT = 3;
-
- // allocate once and don't ever bother creating them again:
- private static final String baseA = new String("A");
- private static final String baseC = new String("C");
- private static final String baseG = new String("G");
- private static final String baseT = new String("T");
- private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
-
- protected GenomeLoc loc; // genomic location of this genotyped site
- // Reference sequence chromosome or scaffold
- // Start and stop positions in chrom
-
- protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
- protected String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions
- protected String observedString; // stores the actual string representation of observed alleles (for point mutations stores as A/C, not in extended alphabet!!)
-
- protected String pileupQuals; // the read base qualities
- protected String pileupBases; // the read bases themselves
-
- protected List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion
- protected int varType = NO_VARIANT;
- protected int ploidy = 2; // how many allelic variants we have?
- protected int nNonref = 0; // number of non-reference alleles observed
- protected int eventLength = 0; // number of inserted or deleted bases
-
- protected double consensusScore = 0;
- protected double variantScore = 0;
-
- private char fldDelim = '\t';
- private String name = emptyStr;
- // ----------------------------------------------------------------------
- //
- // Constructors
- //
- // ----------------------------------------------------------------------
- public SAMPileupRecord(String name) {
- this.name = name; // keeps track name
- }
-
- public boolean parseLine(final String line) {
-// 0 1 2 3 4 5 6 7
-//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
-//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
- try {
-
- int[] pos = Utils.indexOfAll(line, fldDelim );
- String contig = line.substring(0,pos[0] ); // field 0
- long start = Long.parseLong(line.substring(pos[0]+1, pos[1])) ; // field 1
-
- refBaseChar = Character.toUpperCase(line.charAt(pos[1]+1)); // field 2 ( single char)
-
- observedString = line.substring(pos[2]+1, pos[3]).toUpperCase(); // field 3
- observedAlleles = new ArrayList(2);
-
- consensusScore = Double.parseDouble(line.substring(pos[3]+1,pos[4]));
- variantScore = Double.parseDouble(line.substring(pos[4]+1, pos[5]));
-
- if ( refBaseChar == '*' ) {
-
- parseIndels(observedString) ;
- if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.createGenomeLoc(contig, start, start+eventLength-1);
- else loc = GenomeLocParser.createGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
- } else {
- parseBasesAndQuals(line,pos[7]+1,pos[8], pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) );
-// parseBasesAndQuals(line.substring(pos[7]+1,pos[8]), line.substring(pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) ) );
- // if the variant is a SNP or a reference base (i.e. no variant at all)
- if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
-
- refBases = line.substring(pos[1]+1, pos[2]).toUpperCase();
- eventLength = 1;
- //loc = new GenomeLoc(contig, start, start+1);
- loc = GenomeLocParser.createGenomeLoc(contig, start, start);
-
- char ch = observedString.charAt(0);
-
- switch ( ch ) {
- case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
- case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
- case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
- case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
- case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
- case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
- case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
- case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
- case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
- case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
- }
- if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
- else {
- // we know that at least one allele is non-ref;
- // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
- // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
- varType = SNP_VARIANT;
- if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
- observedAlleles.get(1).charAt(0) == refBaseChar ||
- observedAlleles.get(0) == observedAlleles.get(1)
- ) nNonref = 1;
- else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
- }
- }
- } catch ( RuntimeException e ) {
- System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", line);
- throw e;
- }
- return true;
- // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
- }
-
-
- /** Parses a line from SAM pileup file into this SAMPileupRecord object..
- * @param parts line from SAM pileup file, split on delimiter character (tab)
- */
- public boolean parseLine(final String[] parts) {
-// 0 1 2 3 4 5 6 7
-// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
-// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
- try {
-
- String contig = parts[0];
- long start = Long.parseLong(parts[1]) ;
-
- refBaseChar = Character.toUpperCase(parts[2].charAt(0));
-
- parts[3] = parts[3].toUpperCase();
- observedString = parts[3];
- observedAlleles = new ArrayList(2);
-
- consensusScore = Double.parseDouble(parts[4]);
- variantScore = Double.parseDouble(parts[5]);
-
- if ( refBaseChar == '*' ) {
-
- parseIndels(parts[3]) ;
- if ( varType == DELETION_VARIANT ) loc = GenomeLocParser.parseGenomeLoc(contig, start, start+eventLength-1);
- else loc = GenomeLocParser.parseGenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
- }
- else {
- parseBasesAndQuals(parts[8], parts[9]);
- // if the variant is a SNP or a reference base (i.e. no variant at all)
- if ( parts[3].length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
-
- refBases = parts[2].toUpperCase();
- eventLength = 1;
- //loc = GenomeLoc.parseGenomeLoc(contig, start, start+1);
- loc = GenomeLocParser.parseGenomeLoc(contig, start, start);
-
- char ch = parts[3].charAt(0);
-
- switch ( ch ) {
- case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
- case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
- case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
- case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
- case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
- case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
- case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
- case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
- case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
- case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
- }
- if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
- else {
- // we know that at least one allele is non-ref;
- // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
- // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
- varType = SNP_VARIANT;
- if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
- observedAlleles.get(1).charAt(0) == refBaseChar ||
- observedAlleles.get(0) == observedAlleles.get(1)
- ) nNonref = 1;
- else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
- }
- }
- } catch ( RuntimeException e ) {
- System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
- throw e;
- }
- return true;
- // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
- }
-
-
- private void parseIndels(String genotype) {
- String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
-
- // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
- // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
- // from what we have recorded for the deletion allele (e.g. "-CAC")
- boolean hasRefAllele = false;
-
- for ( int i = 0 ; i < obs.length ; i++ ) {
- if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
- hasRefAllele = true;
- observedAlleles.add(emptyStr);
- continue;
- }
-
- String varBases = obs[i].toUpperCase();
-
- switch ( obs[i].charAt(0) ) {
- case '+':
- if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
- else varType = INSERTION_VARIANT;
- refBases = emptyStr;
- break;
- case '-' :
- if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
- else varType = DELETION_VARIANT;
- refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
- break;
- default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
- }
- observedAlleles.add(varBases);
- eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
- }
- if ( hasRefAllele ) {
- // we got at least one ref. allele (out of two recorded)
- if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
- nNonref = 0; // no observations of non-reference allele at all
- refBases = emptyStr;
- } else {
- nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
- }
- } else {
- // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
- // fail to set data in a consistent way.
- if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1;
- else nNonref = 2;
- }
- // DONE with indels
-
- }
-
- private void parseBasesAndQuals(final String bases, final String quals)
- {
- //System.out.printf("%s%n%s%n", bases, quals);
-
- // needs to convert the base string with it's . and , to the ref base
- StringBuilder baseBuilder = new StringBuilder();
- StringBuilder qualBuilder = new StringBuilder();
- boolean done = false;
- for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
- //System.out.printf("%d %d%n", i, j);
- char c = (char)bases.charAt(i);
-
- switch ( c ) {
- case '.': // matches reference
- case ',': // matches reference
- baseBuilder.append(refBaseChar);
- qualBuilder.append((char)quals.charAt(j++));
- break;
- case '$': // end of read
- break;
- case '*': // end of indel?
- j++;
- break;
- case '^': // mapping quality
- i++;
- break;
- case '+': // start of indel
- case '-': // start of indel
- final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
- final String rest = bases.substring(i+1);
- //System.out.printf("sub is %s%n", rest);
- Matcher match = regex.matcher(rest);
- if ( ! match.matches() ) {
- if ( refBaseChar != '*' )
- throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
- done = true;
- }
- else {
- String g = match.group(1);
- //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
- int l = Integer.parseInt(g);
- i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
- //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
- }
- break;
- default: // non reference base
- baseBuilder.append(c);
- qualBuilder.append((char)quals.charAt(j++));
- }
- }
-
- pileupBases = baseBuilder.toString();
- pileupQuals = qualBuilder.toString();
- }
-
- private void parseBasesAndQuals(final String line, int basesStart, int basesStop, int qualsStart, int qualsStop)
- {
- //System.out.printf("%s%n%s%n", bases, quals);
-
- // needs to convert the base string with it's . and , to the ref base
- StringBuilder baseBuilder = new StringBuilder();
- StringBuilder qualBuilder = new StringBuilder();
- boolean done = false;
- for ( int i = basesStart, j = qualsStart; i < basesStop && ! done; i++ ) {
- //System.out.printf("%d %d%n", i, j);
- char c = (char)line.charAt(i);
-
- switch ( c ) {
- case '.': // matches reference
- case ',': // matches reference
- baseBuilder.append(refBaseChar);
- qualBuilder.append((char)line.charAt(j++));
- break;
- case '$': // end of read
- break;
- case '*': // end of indel?
- j++;
- break;
- case '^': // mapping quality
- i++;
- break;
- case '+': // start of indel
- case '-': // start of indel
- final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
- final String rest = line.substring(i+1,basesStop);
- //System.out.printf("sub is %s%n", rest);
- Matcher match = regex.matcher(rest);
- if ( ! match.matches() ) {
- if ( refBaseChar != '*' )
- throw new RuntimeException("Bad pileup format: " + line.substring(basesStart, basesStop) + " at position " + (i-basesStart));
- done = true;
- }
- else {
- String g = match.group(1);
- //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
- int l = Integer.parseInt(g);
- i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
- //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
- }
- break;
- default: // non reference base
- baseBuilder.append(c);
- qualBuilder.append((char)line.charAt(j++));
- }
- }
-
- pileupBases = baseBuilder.toString();
- pileupQuals = qualBuilder.toString();
- }
-
-
- public GenomeLoc getLocation() { return loc; }
- public String getQualsAsString() { return pileupQuals; }
-
- /** Returns reference base for point genotypes or '*' for indel genotypes, as a char.
- *
- */
- public char getRef() { return refBaseChar; }
- public int size() { return pileupQuals.length(); }
-
- /** Returns pile of observed bases over the current genomic location.
- *
- */
- public String getBasesAsString() { return pileupBases; }
-
- /** Returns formatted pileup string for the current genomic location as
- * "location: reference_base observed_base_pile observed_qual_pile"
- */
- public String getPileupString()
- {
- return String.format("%s: %s %s %s", getLocation(), getRef(), getBasesAsString(), getQualsAsString());
- }
-
- public ArrayList getBasesAsArrayList() { throw new StingException("Not implemented"); }
- public ArrayList getQualsAsArrayList() { throw new StingException("Not implemented"); }
-
- /**
- * Gets the bases in byte array form.
- * @return byte array of the available bases.
- */
- public byte[] getBases() {
- return StringUtil.stringToBytes(getBasesAsString());
- }
-
- /**
- * Gets the Phred base qualities without ASCII offset.
- * @return Phred base qualities.
- */
- public byte[] getQuals() {
- byte[] quals = StringUtil.stringToBytes(getQualsAsString());
- for(int i = 0; i < quals.length; i++) quals[i] -= 33;
- return quals;
- }
-
- /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single
- * character (reference base). For indel genotypes, the string is empty for insertions into
- * the reference, or consists of deleted bases for deletions.
- *
- * @return reference allele, forward strand
- */
- public String getFWDRefBases() {
- return refBases;
- }
-
-
- public List getFWDAlleles() {
- return observedAlleles;
- }
-
- // ----------------------------------------------------------------------
- //
- // What kind of variant are we?
- //
- // ----------------------------------------------------------------------
- public boolean isSNP() { return varType == SNP_VARIANT ; }
- public boolean isInsertion() { return varType == INSERTION_VARIANT; }
- public boolean isDeletion() { return varType == DELETION_VARIANT ; }
- public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; }
- public boolean isReference() { return varType == NO_VARIANT; }
-
- public boolean isHom() {
- // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
- if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) );
- return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
- }
-
- public boolean isHet() {
- // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
- if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) );
- return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
- }
-
- // ----------------------------------------------------------------------
- //
- // formatting
- //
- // ----------------------------------------------------------------------
- public String toString() {
- return String.format("%s\t%d\t%d\t%s\t%s\t%s",
- getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
- }
-
- public String toSimpleString() {
- return String.format("%s:%s", name, observedString);
- }
-
- public String toMediumString() {
-
- String s = null;
- if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
- else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
- if ( isSNP() ) s += ": SNP";
- else {
- if ( isInsertion() ) s += ": Insertion";
- else {
- if ( isDeletion() ) s+= ": Deletion";
- else {
- if ( isIndel() ) s+=": Indel";
- else s+=": Reference";
- }
- }
- }
- return s;
- }
-
- public String repl() {
- return String.format("REPL not implemented yet");
- }
-
-
- public double getVariantConfidence() {
- return variantScore;
- }
-
-
- public boolean isBiallelic() {
- return nNonref < 2;
- }
-
- public double getConsensusConfidence() {
- return consensusScore;
- }
-
- public int length() {
- return eventLength;
- }
-
- public boolean isIndelGenotype() {
- return refBaseChar == '*';
- }
-
-
- public boolean isPointGenotype() {
- return ! isIndelGenotype();
- }
-
- /** Implements method required by GenotypeList interface. If this object represents
- * an indel genotype, then it returns itself through this method. If this object is a
- * point genotype, this method returns null.
- * @return
- */
- public SAMPileupRecord getIndelGenotype() {
- if ( isIndelGenotype() ) return this;
- else return null;
- }
-
- /** Implements method required by GenotypeList interface. If this object represents
- * a point genotype, then it returns itself through this method. If this object is an
- * indel genotype, this method returns null.
- * @return
- */
- public SAMPileupRecord getPointGenotype() {
- if ( isPointGenotype() ) return this;
- else return null;
- }
-
- /** Returns true if this object \em is an indel genotype (and thus
- * indel genotype is what it only has).
- * @return
- */
- public boolean hasIndelGenotype() {
- return isIndelGenotype();
- }
-
- /** Returns true if this object \em is a point genotype (and thus
- * point genotype is what it only has.
- * @return
- */
- public boolean hasPointGenotype() {
- return isPointGenotype();
- }
-
- public int compareTo(ReferenceOrderedDatum o) {
- return getLocation().compareTo(o.getLocation());
- }
-
- protected static class pileupFileIterator implements Iterator {
-// private String fieldDelimiter = new String("\t");
- private String rodName = null;
- private XReadLines parser = null;
- private String lastProcessedLine = null;
-
-// private long z = 0;
-// private long t = 0;
-
- pileupFileIterator(String name, java.io.File f) {
- try {
- parser = new XReadLines(f);
- } catch ( FileNotFoundException e ) {
- Utils.scareUser("Couldn't open file: " + f);
- }
- rodName = name;
- }
-
- public boolean hasNext() {
- return parser.hasNext();
- }
-
- /**
- * @return the next element in the iteration.
- * @throws NoSuchElementException - iterator has no more elements.
- */
- public SAMPileupRecord next() {
- if (!this.hasNext()) throw new NoSuchElementException("SAMPileupRecord next called on iterator with no more elements");
-// if ( z == 0 ) t = System.currentTimeMillis();
- lastProcessedLine = parser.next();
- SAMPileupRecord n = new SAMPileupRecord(rodName);
- //System.out.printf("Line is %s%n", line);
-// String parts[] = lastProcessedLine.split(fieldDelimiter);
-// n.parseLine(parts);
-
- n.parseLine(lastProcessedLine);
-// if ( ++z == 1000000 ) { System.out.println(rodName + ": 1,000,000 records read in "+((double)(System.currentTimeMillis()-t)/1000.0) +" s"); z = 0; }
- return n ;
- }
-
-
- public void remove() {
- throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
- }
-
- }
-
-
- public static Iterator createIterator(String name, java.io.File f) {
- return new pileupFileIterator(name,f);
- }
-
-
-
- public static void main(String argv[]) {
-// String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
- String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
-
- Iterator it = createIterator("test-normal", new java.io.File(testFile));
-
- ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
- //new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
- new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
- );
-
- if ( reference.getSequenceDictionary() == null ) {
- System.out.println("No reference sequence dictionary found. Abort.");
- System.exit(1);
- }
-
- GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary());
-
- int counter = 0;
-
- while ( it.hasNext() && counter < 430 ) {
- SAMPileupRecord p = it.next();
-
- System.out.print(p.getLocation().toString());
- System.out.print('\t');
-
- if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
- else {
- if ( p.isSNP() ) { System.out.print("SNP"); }
- else {
- if ( p.isIndel() ) { System.out.print("Indel"); }
- else { System.out.print("REF"); }
- }
- }
-
- System.out.print('\t');
- System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
- System.out.print('\t');
- System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
- counter++;
- }
- }
-
-
-
-
-}
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
new file mode 100644
index 000000000..b9278ad30
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupCodec.java
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import org.broad.tribble.FeatureCodec;
+import org.broad.tribble.Feature;
+import org.broad.tribble.exception.CodecLineParsingException;
+import org.broad.tribble.util.ParsingUtils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import static org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature.VariantType;
+
+/**
+ * A Tribble encoder / decoder for SAM pileup data.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class SAMPileupCodec implements FeatureCodec {
+ // the number of tokens we expect to parse from a dbSNP line
+ private static final int expectedTokenCount = 10;
+ private static final char fldDelim = '\t';
+
+ // allocate once and don't ever bother creating them again:
+ private static final String baseA = "A";
+ private static final String baseC = "C";
+ private static final String baseG = "G";
+ private static final String baseT = "T";
+ private static final String emptyStr = ""; // we will use this for "reference" allele in insertions
+
+ /**
+ * Return the # of header lines for this file.
+ *
+ * @param f the file
+ * @return 0 in this case, we assume no header lines. The DbSNP file may have a
+ * header line beginning with '#', but we can ignore that in the decode function.
+ */
+ public int headerLineCount(File f) {
+ // we don't require a header line, but it may exist. We'll deal with that above.
+ return 0;
+ }
+
+ public Feature decode(String line) {
+// 0 1 2 3 4 5 6 7
+//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
+//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
+ String[] tokens = new String[expectedTokenCount];
+
+ // split the line
+ int count = ParsingUtils.split(line,tokens,fldDelim);
+
+ // check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
+ if (count != expectedTokenCount)
+ throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
+ "(expected = " + expectedTokenCount + ", saw = " + count + " on " +
+ "line = " + line + ")");
+
+ SAMPileupFeature feature = new SAMPileupFeature(tokens[0],
+ 0,
+ 0);
+
+ feature.setStart(Integer.parseInt(tokens[1]));
+
+ if(tokens[2].length() != 1)
+ throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
+ feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
+
+ String observedString = tokens[3].toUpperCase(); // field 3
+ feature.setFWDAlleles(new ArrayList(2));
+
+ feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
+ feature.setVariantConfidence(Double.parseDouble(tokens[5]));
+
+ if ( feature.getRef() == '*' ) {
+ parseIndels(observedString,feature) ;
+ if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
+ else feature.setEnd(feature.getStart()-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
+ } else {
+ parseBasesAndQuals(feature,tokens[8],tokens[9]);
+ // if the variant is a SNP or a reference base (i.e. no variant at all)
+ if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
+ feature.setRefBases(tokens[2].toUpperCase());
+ feature.setEnd(feature.getStart());
+
+ char ch = observedString.charAt(0);
+
+ switch ( ch ) {
+ case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
+ case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
+ case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
+ case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
+ case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
+ case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
+ case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
+ case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
+ case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
+ case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
+ }
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
+ else {
+ // we know that at least one allele is non-ref;
+ // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+ // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+ feature.setVariantType(VariantType.SNP);
+ if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
+ feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
+ ) feature.setNumNonRef(1);
+ else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+ }
+ }
+
+ return feature;
+ }
+
+ private void parseIndels(String genotype,SAMPileupFeature feature) {
+ String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
+
+ // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
+ // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
+ // from what we have recorded for the deletion allele (e.g. "-CAC")
+ boolean hasRefAllele = false;
+
+ for ( int i = 0 ; i < obs.length ; i++ ) {
+ if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
+ hasRefAllele = true;
+ feature.getFWDAlleles().add(emptyStr);
+ continue;
+ }
+
+ String varBases = obs[i].toUpperCase();
+
+ switch ( obs[i].charAt(0) ) {
+ case '+':
+ if (!feature.isReference() && !feature.isInsertion()) feature.setVariantType(VariantType.INDEL);
+ else feature.setVariantType(VariantType.INSERTION);
+ feature.setRefBases(emptyStr);
+ break;
+ case '-' :
+ if (!feature.isReference() && !feature.isDeletion()) feature.setVariantType(VariantType.INDEL);
+ else feature.setVariantType(VariantType.DELETION);
+ feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
+ break;
+ default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
+ }
+ feature.getFWDAlleles().add(varBases);
+ feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
+ }
+ if ( hasRefAllele ) {
+ // we got at least one ref. allele (out of two recorded)
+ if (feature.isReference()) { // both top theories are actually ref allele;
+ feature.setNumNonRef(0); // no observations of non-reference allele at all
+ feature.setRefBases(emptyStr);
+ } else {
+ feature.setNumNonRef(1); // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
+ }
+ } else {
+ // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
+ // fail to set data in a consistent way.
+ if ( feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))) feature.setNumNonRef(1);
+ else feature.setNumNonRef(2);
+ }
+ // DONE with indels
+
+ }
+
+ private void parseBasesAndQuals(SAMPileupFeature feature, final String bases, final String quals)
+ {
+ //System.out.printf("%s%n%s%n", bases, quals);
+
+ // needs to convert the base string with it's . and , to the ref base
+ StringBuilder baseBuilder = new StringBuilder();
+ StringBuilder qualBuilder = new StringBuilder();
+ boolean done = false;
+ for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
+ //System.out.printf("%d %d%n", i, j);
+ char c = (char)bases.charAt(i);
+
+ switch ( c ) {
+ case '.': // matches reference
+ case ',': // matches reference
+ baseBuilder.append(feature.getRef());
+ qualBuilder.append(quals.charAt(j++));
+ break;
+ case '$': // end of read
+ break;
+ case '*': // end of indel?
+ j++;
+ break;
+ case '^': // mapping quality
+ i++;
+ break;
+ case '+': // start of indel
+ case '-': // start of indel
+ final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
+ final String rest = bases.substring(i+1);
+ //System.out.printf("sub is %s%n", rest);
+ Matcher match = regex.matcher(rest);
+ if ( ! match.matches() ) {
+ if ( feature.getRef() != '*' )
+ throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
+ done = true;
+ }
+ else {
+ String g = match.group(1);
+ //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
+ int l = Integer.parseInt(g);
+ i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
+ //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
+ }
+ break;
+ default: // non reference base
+ baseBuilder.append(c);
+ qualBuilder.append(quals.charAt(j++));
+ }
+ }
+
+ feature.setPileupBases(baseBuilder.toString());
+ feature.setPileupQuals(qualBuilder.toString());
+ }
+
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
new file mode 100644
index 000000000..6b7257913
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/sampileup/SAMPileupFeature.java
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.refdata.features.sampileup;
+
+import org.broad.tribble.Feature;
+
+import java.util.List;
+
+import net.sf.samtools.util.StringUtil;
+
+/**
+ * A tribble feature representing a SAM pileup.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class SAMPileupFeature implements Feature {
+ public enum VariantType { NONE, SNP, INSERTION, DELETION, INDEL };
+
+ private String contig; // genomic location of this genotyped site
+ private int start;
+ private int stop;
+
+ private char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
+ private String refBases; // the reference base sequence according to NCBI; single base for point mutations, deleted bases for deletions, empty string for insertions
+
+ private String pileupQuals; // the read base qualities
+ private String pileupBases; // the read bases themselves
+
+ private List observedAlleles = null; // The sequences of the observed alleles (e.g. {"A","C"} for point mutation or {"","+CC"} for het. insertion
+ private VariantType varType = VariantType.NONE;
+ private int nNonref = 0; // number of non-reference alleles observed
+ private int eventLength = 0; // number of inserted or deleted bases
+
+ private double consensusScore = 0;
+ private double variantScore = 0;
+
+ /**
+ * create the dbSNP feature, given the following information:
+ *
+ * @param contig the contig rsID
+ * @param start the start position, one based
+ * @param stop the stop position, one based
+ */
+ SAMPileupFeature(String contig,
+ int start,
+ int stop) {
+ this.contig = contig;
+ this.start = start;
+ this.stop = stop;
+ }
+
+ public String getChr() {
+ return contig;
+ }
+
+ protected void setChr(String chr) {
+ this.contig = chr;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ protected void setStart(int start) {
+ this.start = start;
+ }
+
+ public int getEnd() {
+ return stop;
+ }
+
+ protected void setEnd(int end) {
+ this.stop = end;
+ }
+
+ public String getQualsAsString() { return pileupQuals; }
+
+ protected void setPileupQuals(String pileupQuals) {
+ this.pileupQuals = pileupQuals;
+ }
+
+ /** Returns reference base for point genotypes or '*' for indel genotypes, as a char.
+ *
+ */
+ public char getRef() { return refBaseChar; }
+
+ protected void setRef(char ref) {
+ this.refBaseChar = ref;
+ }
+
+ public int size() { return pileupQuals.length(); }
+
+ /** Returns pile of observed bases over the current genomic location.
+ *
+ */
+ public String getBasesAsString() { return pileupBases; }
+
+ protected void setPileupBases(String pileupBases) {
+ this.pileupBases = pileupBases;
+ }
+
+ /** Returns formatted pileup string for the current genomic location as
+ * "location: reference_base observed_base_pile observed_qual_pile"
+ */
+ public String getPileupString()
+ {
+ if(start == stop)
+ return String.format("%s:%d: %s %s %s", getChr(), getStart(), getRef(), getBasesAsString(), getQualsAsString());
+ else
+ return String.format("%s:%d-%d: %s %s %s", getChr(), getStart(), getEnd(), getRef(), getBasesAsString(), getQualsAsString());
+ }
+
+ /**
+ * Gets the bases in byte array form.
+ * @return byte array of the available bases.
+ */
+ public byte[] getBases() {
+ return StringUtil.stringToBytes(getBasesAsString());
+ }
+
+ /**
+ * Gets the Phred base qualities without ASCII offset.
+ * @return Phred base qualities.
+ */
+ public byte[] getQuals() {
+ byte[] quals = StringUtil.stringToBytes(getQualsAsString());
+ for(int i = 0; i < quals.length; i++) quals[i] -= 33;
+ return quals;
+ }
+
+ /** Returns bases in the reference allele as a String. For point genotypes, the string consists of a single
+ * character (reference base). For indel genotypes, the string is empty for insertions into
+ * the reference, or consists of deleted bases for deletions.
+ *
+ * @return reference allele, forward strand
+ */
+ public String getFWDRefBases() {
+ return refBases;
+ }
+
+ protected void setRefBases(String refBases) {
+ this.refBases = refBases;
+ }
+
+ public List getFWDAlleles() {
+ return observedAlleles;
+ }
+
+ protected void setFWDAlleles(List alleles) {
+ this.observedAlleles = alleles;
+ }
+
+ // ----------------------------------------------------------------------
+ //
+ // What kind of variant are we?
+ //
+ // ----------------------------------------------------------------------
+ public boolean isSNP() { return varType == VariantType.SNP; }
+ public boolean isInsertion() { return varType == VariantType.INSERTION; }
+ public boolean isDeletion() { return varType == VariantType.DELETION ; }
+ public boolean isIndel() { return isInsertion() || isDeletion() || varType == VariantType.INDEL; }
+ public boolean isReference() { return varType == VariantType.NONE; }
+
+ protected void setVariantType(VariantType variantType) {
+ this.varType = variantType;
+ }
+
+ public boolean isHom() {
+ // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+ if ( ! isIndel() ) return ( observedAlleles.get(0).equals(observedAlleles.get(1)) );
+ return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
+ }
+
+ public boolean isHet() {
+ // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
+ if ( ! isIndel() ) return ( !(observedAlleles.get(0).equals(observedAlleles.get(1))) );
+ return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
+ }
+
+ public double getVariantConfidence() {
+ return variantScore;
+ }
+
+ protected void setVariantConfidence(double variantScore) {
+ this.variantScore = variantScore;
+ }
+
+ public boolean isBiallelic() {
+ return nNonref < 2;
+ }
+
+ protected void setNumNonRef(int nNonref) {
+ this.nNonref = nNonref;
+ }
+
+ public double getConsensusConfidence() {
+ return consensusScore;
+ }
+
+ protected void setConsensusConfidence(double consensusScore) {
+ this.consensusScore = consensusScore;
+ }
+
+ public int length() {
+ return eventLength;
+ }
+
+ protected void setLength(int eventLength) {
+ this.eventLength = eventLength;
+ }
+
+ public boolean isIndelGenotype() {
+ return refBaseChar == '*';
+ }
+
+
+ public boolean isPointGenotype() {
+ return ! isIndelGenotype();
+ }
+
+ /** Implements method required by GenotypeList interface. If this object represents
+ * an indel genotype, then it returns itself through this method. If this object is a
+ * point genotype, this method returns null.
+ * @return
+ */
+ public SAMPileupFeature getIndelGenotype() {
+ if ( isIndelGenotype() ) return this;
+ else return null;
+ }
+
+ /** Implements method required by GenotypeList interface. If this object represents
+ * a point genotype, then it returns itself through this method. If this object is an
+ * indel genotype, this method returns null.
+ * @return
+ */
+ public SAMPileupFeature getPointGenotype() {
+ if ( isPointGenotype() ) return this;
+ else return null;
+ }
+
+ /** Returns true if this object \em is an indel genotype (and thus
+ * indel genotype is what it only has).
+ * @return
+ */
+ public boolean hasIndelGenotype() {
+ return isIndelGenotype();
+ }
+
+ /** Returns true if this object \em is a point genotype (and thus
+ * point genotype is what it only has.
+ * @return
+ */
+ public boolean hasPointGenotype() {
+ return isPointGenotype();
+ }
+
+
+
+}
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
deleted file mode 100644
index 8ab9c7f88..000000000
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
+++ /dev/null
@@ -1,225 +0,0 @@
-package org.broadinstitute.sting.gatk.refdata;
-
-
-import java.io.IOException;
-import java.util.*;
-
-import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-import net.sf.picard.reference.ReferenceSequenceFileWalker;
-
-/**
- * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
- *
- * Example format:
- * for SNP:
- * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
- * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
- *
- * for indel:
- * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
- * User: asivache
- * Date: Apr 03, 2009
- * Time: 2:58:33 PM
- * To change this template use File | Settings | File Templates.
- */
-
-
-
-public class rodSAMPileup extends BasicReferenceOrderedDatum {
- // ----------------------------------------------------------------------
- //
- // Constructors
- //
- // ----------------------------------------------------------------------
-
- private SAMPileupRecord indelGenotype = null;
- private SAMPileupRecord pointGenotype = null;
-
- public rodSAMPileup(final String name) {
- super(name);
- }
-
- @Override
- public GenomeLoc getLocation() {
- if ( pointGenotype != null ) return pointGenotype.getLocation();
- if ( indelGenotype != null ) return indelGenotype.getLocation();
- return null;
- }
-
- /** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator,
- * so this method does nothing at all (always returns false).
- *
- */
- @Override
- public boolean parseLine(Object header, String[] parts) throws IOException {
- return false;
- }
-
- @Override
- public String toString() {
- StringBuilder b = new StringBuilder();
- if ( pointGenotype != null ) {
- b.append(pointGenotype.toString());
- if ( indelGenotype != null ) b.append('\n');
- }
- if ( indelGenotype != null ) b.append(indelGenotype.toString());
- return b.toString();
- }
-
- public SAMPileupRecord getIndelGenotype() {
- return indelGenotype;
- }
-
- public SAMPileupRecord getPointGenotype() {
- return pointGenotype;
- }
-
- public boolean hasIndelGenotype() {
- return indelGenotype != null;
- }
-
- public boolean hasPointGenotype() {
- return pointGenotype != null;
- }
-
- /** Iterates over SAM pileup multiline records: each new invocation of next() will
- * move to next genomic position in the pileup file, and if the record for that position
- * consists of multiple lines (as is the case for indels), they all will be read and processed.
- * @author asivache
- *
- */
- private static class rodSAMPileupIterator implements Iterator {
- //private xReadLines parser = null;
- private String rodName = null;
- private PushbackIterator parser = null;
- private long line = 0;
-
- rodSAMPileupIterator(String name, java.io.File f) {
- parser = new PushbackIterator(SAMPileupRecord.createIterator(name,f));
- rodName = name;
- }
-
- public boolean hasNext() {
- return parser.hasNext();
- }
-
- /**
- * @return the next element in the iteration.
- * @throws NoSuchElementException - iterator has no more elements.
- */
- public rodSAMPileup next() {
- if (!this.hasNext()) throw new NoSuchElementException("rodSAMPileup next called on iterator with no more elements");
- rodSAMPileup result = new rodSAMPileup(rodName);
-
- SAMPileupRecord r = parser.next();
-
- // if ( (++line)%10000 == 0 ) System.out.printf("%s: record is %d (%s)%n", rodName, line,r.getLocation().toString());
-
- if ( r.isPointGenotype() ) result.pointGenotype = r;
- else result.indelGenotype = r;
-
- if ( parser.hasNext() ) {
-
- SAMPileupRecord r2 = parser.next();
- int cmp = r.getLocation().compareTo(r2.getLocation());
- switch ( cmp ) {
- case -1 : // next record is at greater position
- parser.pushback(r2); // return next record back to the stream
- break;
- case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel
- if ( result.indelGenotype != null ) { // oops, we've seen indel already
- if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() );
- else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() );
- } else {
- // ok, the first genotype we've seen was a point one
- if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() );
- // and the second is an indel, wheww...
- result.indelGenotype = r2;
- }
-
- // the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the
- // the same genomic location (point and indel variants), there can not be any more:
- if ( parser.hasNext() ) {
- r2 = parser.next();
- cmp = r.getLocation().compareTo(r2.getLocation() ) ;
- if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation());
- if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position "
- + r.getLocation() );
- parser.pushback(r2);
- }
-
- break;
- case 1: // next record is at lower position?? come'on!
- throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() );
- default:
- throw new StingException("INTERNAL ERROR: this point should never be reached");
- }
- }
- return result ;
- }
-
-
- public void remove() {
- throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
- }
-
- }
-
- public static Iterator createIterator(String name, java.io.File file) {
- return new rodSAMPileup.rodSAMPileupIterator(name,file);
- }
-
- public static void main(String argv[]) {
- String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
-// String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
-
- Iterator it = createIterator("test-normal", new java.io.File(testFile));
-
- ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
- new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
-// new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
- );
-
- if ( reference.getSequenceDictionary() == null ) {
- System.out.println("No reference sequence dictionary found. Abort.");
- System.exit(1);
- }
-
- GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary());
-
- int counter = 0;
-
- while ( it.hasNext() && counter < 430 ) {
- rodSAMPileup p = it.next();
- System.out.println(p.toString());
-/*
- System.out.print(p.getLocation().toString());
- System.out.print('\t');
-
- if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
- else {
- if ( p.isSNP() ) { System.out.print("SNP"); }
- else {
- if ( p.isIndel() ) { System.out.print("Indel"); }
- else { System.out.print("REF"); }
- }
- }
-
- System.out.print('\t');
- System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
- System.out.print('\t');
- System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
- */
- counter++;
- }
- }
-
-
-}
-
-
diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java
index 9e9c898aa..0a6f35d16 100644
--- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java
@@ -54,7 +54,6 @@ public class RODTrackBuilder implements RMDTrackBuilder {
static {
// All known ROD types
- Types.put("SAMPileup", rodSAMPileup.class);
Types.put("GELI", rodGELI.class);
Types.put("RefSeq", rodRefSeq.class);
Types.put("Table", TabularROD.class);
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
index 2a65a7d54..49d5ae586 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
@@ -28,11 +28,12 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.SAMPileupRecord;
-import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
+import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@@ -43,14 +44,14 @@ import java.util.Arrays;
* each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data
* should be specified using the command-line argument '-B pileup,SAMPileup,'.
*/
-@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=rodSAMPileup.class))
+@Requires(value={DataSource.READS,DataSource.REFERENCE},referenceMetaData=@RMD(name="pileup",type=SAMPileupFeature.class))
public class ValidatingPileupWalker extends LocusWalker implements TreeReducible {
@Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
public boolean CONTINUE_AFTER_AN_ERROR = false;
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ReadBackedPileup pileup = context.getBasePileup();
- SAMPileupRecord truePileup = getTruePileup( tracker );
+ SAMPileupFeature truePileup = getTruePileup( tracker );
if ( truePileup == null ) {
out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBase()));
@@ -82,11 +83,12 @@ public class ValidatingPileupWalker extends LocusWalker