diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
index 1249f6879..52ac3b2e8 100644
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
@@ -1,13 +1,14 @@
package org.broadinstitute.sting.gatk.refdata;
+import java.io.IOException;
import java.util.*;
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
+import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.Pileup;
+import org.broadinstitute.sting.utils.StingException;
+
+import edu.mit.broad.picard.reference.ReferenceSequenceFileWalker;
/**
* This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
@@ -18,404 +19,207 @@ import org.broadinstitute.sting.utils.Pileup;
* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
*
* for indel:
- * [chr] [pos] [always *] [alleles] [?] [?] [?] [num reads in the pile] ...
- * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
+ * [chr] [pos] [always *] [consensus alleles] [consensus conf.?] [indel conf.?] [max mapping qual] [num reads in the pile] [indel] [always *?] [reads haveindel] [reads may have indel] [other reads?]
+ * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12
* User: asivache
* Date: Apr 03, 2009
* Time: 2:58:33 PM
* To change this template use File | Settings | File Templates.
*/
-public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup {
- private static final int NO_VARIANT = -1;
- private static final int SNP_VARIANT = 0;
- private static final int INSERTION_VARIANT = 1;
- private static final int DELETION_VARIANT = 2;
- private static final int INDEL_VARIANT = 3;
-
- // allocate once and don't ever bother creating them again:
- private static final String baseA = new String("A");
- private static final String baseC = new String("C");
- private static final String baseG = new String("G");
- private static final String baseT = new String("T");
- private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
-
- protected GenomeLoc loc; // genomic location of this genotyped site
- // Reference sequence chromosome or scaffold
- // Start and stop positions in chrom
- protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
- protected String refBases; // the reference base sequence according to NCBI
- protected String observedString; // store the actual string representation of observed alleles
-
- protected String pileupQuals; // the read base qualities
- protected String pileupBases; // the read bases themselves
-
- protected List observedAlleles = null; // The sequences of the observed alleles from rs-fasta files
- protected int varType = NO_VARIANT;
- protected int ploidy = 2; // how many allelic variants we observe?
- protected int nNonref = 0; // number of non-reference alleles
- protected int eventLength = 0;
-
- protected double consensusScore;
- protected double variantScore;
- // ----------------------------------------------------------------------
- //
- // Constructors
- //
- // ----------------------------------------------------------------------
- public rodSAMPileup(final String name) {
- super(name);
- }
-
- @Override
- public boolean parseLine(final Object header, final String[] parts) {
-// 0 1 2 3 4 5 6 7
-// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
-// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
- try {
-
- String contig = parts[0];
- long start = Long.parseLong(parts[1]) ;
-
- consensusScore = Double.parseDouble(parts[4]);
- variantScore = Double.parseDouble(parts[5]);
-
- refBaseChar = Character.toUpperCase(parts[2].charAt(0));
-
- parts[3] = parts[3].toUpperCase();
- observedString = parts[3];
-
- parseBasesAndQuals(parts[8], parts[9]);
-
- observedAlleles = new ArrayList(2);
-
- if ( refBaseChar == '*' ) {
- parseIndels(parts[3]) ;
- if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
- else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
- }
- else {
- // if the variant is a SNP or a reference base (i.e. no variant at all)
- assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter";
- refBases = parts[2].toUpperCase();
- eventLength = 1;
- //loc = new GenomeLoc(contig, start, start+1);
- loc = new GenomeLoc(contig, start, start);
-
- char ch = parts[3].charAt(0);
-
- switch ( ch ) {
- case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
- case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
- case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
- case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
- case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
- case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
- case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
- case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
- case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
- case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
- }
- if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
- else {
- // we know that at least one allele is non-ref;
- // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
- // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
- varType = SNP_VARIANT;
- if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
- observedAlleles.get(1).charAt(0) == refBaseChar ||
- observedAlleles.get(0) == observedAlleles.get(1)
- ) nNonref = 1;
- else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
- }
- }
- } catch ( RuntimeException e ) {
- System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
- throw e;
- }
- return true;
- // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
- }
-
-
- private void parseIndels(String genotype) {
- String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
-
- // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
- // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
- // from what we have recorded for the deletion allele (e.g. "-CAC")
- boolean hasRefAllele = false;
-
- for ( int i = 0 ; i < obs.length ; i++ ) {
- if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
- hasRefAllele = true;
- continue; // we will fill reference allele later
- }
-
- String varBases = obs[i].toUpperCase();
-
- switch ( obs[i].charAt(0) ) {
- case '+':
- if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
- else varType = INSERTION_VARIANT;
- refBases = emptyStr;
- break;
- case '-' :
- if ( varType != NO_VARIANT && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
- else varType = DELETION_VARIANT;
- refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
- break;
- default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
- }
- observedAlleles.add(varBases);
- eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
- }
- if ( hasRefAllele ) {
- // we got at least one ref. allele (out of two recorded)
- if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
- observedAlleles.add(emptyStr);
- observedAlleles.add(emptyStr); // no inserted/deleted bases at the site!
- nNonref = 0; // no observations of non-reference allele at all
- } else {
- nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
-
- // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty;
- // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above:
- observedAlleles.add(emptyStr);
- }
- } else {
- // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
- // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions
- // or both deletions as compared to +ACC/-ACC which would still have the same bases
- if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) nNonref = 1;
- else nNonref = 2;
- }
- // DONE with indels
-
- }
-
- private void parseBasesAndQuals(final String bases, final String quals)
- {
- //System.out.printf("%s%n%s%n", bases, quals);
-
- // needs to convert the base string with it's . and , to the ref base
- StringBuilder baseBuilder = new StringBuilder();
- StringBuilder qualBuilder = new StringBuilder();
- boolean done = false;
- for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
- //System.out.printf("%d %d%n", i, j);
- char c = (char)bases.charAt(i);
-
- switch ( c ) {
- case '.': // matches reference
- case ',': // matches reference
- baseBuilder.append(refBaseChar);
- qualBuilder.append((char)quals.charAt(j++));
- break;
- case '$': // end of read
- break;
- case '*': // end of indel?
- j++;
- break;
- case '^': // mapping quality
- i++;
- break;
- case '+': // start of indel
- case '-': // start of indel
- final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
- final String rest = bases.substring(i+1);
- //System.out.printf("sub is %s%n", rest);
- Matcher match = regex.matcher(rest);
- if ( ! match.matches() ) {
- if ( refBaseChar != '*' )
- throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
- done = true;
- }
- else {
- String g = match.group(1);
- //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
- int l = Integer.parseInt(g);
- i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
- //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
- }
- break;
- default: // non reference base
- baseBuilder.append(c);
- qualBuilder.append((char)quals.charAt(j++));
- }
- }
-
- pileupBases = baseBuilder.toString();
- pileupQuals = qualBuilder.toString();
- }
-
- public GenomeLoc getLocation() { return loc; }
- public String getQuals() { return pileupQuals; }
- public char getRef() { return refBaseChar; }
- public int size() { return pileupQuals.length(); }
- public String getBases() { return pileupBases; }
-
- public String getPileupString()
- {
- return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
- }
-
-
- /** Returns bases in the reference allele as a String. String can be empty (as in insertion into
- * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
- * (for longer indels).
- *
- * @return reference allele, forward strand
- */
- public String getRefBasesFWD() {
- return refBases;
- }
-
-
- /**
- * Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException
- * if variant is not a SNP.
- *
- * @return reference base on the forward strand
- */
- /* public char getRefSnpFWD() throws IllegalStateException {
- if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP");
- return refBaseChar;
- }
-*/
-
- @Override
- public List getFWDAlleles() {
- return observedAlleles;
- }
-
- // ----------------------------------------------------------------------
- //
- // What kind of variant are we?
- //
- // ----------------------------------------------------------------------
- public boolean isSNP() { return varType == SNP_VARIANT ; }
- public boolean isInsertion() { return varType == INSERTION_VARIANT; }
- public boolean isDeletion() { return varType == DELETION_VARIANT ; }
- public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; }
- public boolean isReference() { return varType == NO_VARIANT; }
-
- public boolean isHom() {
- // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
- if ( ! isIndel() ) return ( observedAlleles.get(0) == observedAlleles.get(1) );
- return ( isInsertion() || isDeletion() ) && observedAlleles.get(0).equals(observedAlleles.get(1) );
- }
-
- public boolean isHet() {
- // implementation-dependent: here we use the fact that for ref and snps we actually use fixed static strings to remember the genotype
- if ( ! isIndel() ) return ( observedAlleles.get(0) != observedAlleles.get(1) );
- return isIndel() || ( ! observedAlleles.get(0).equals(observedAlleles.get(1) ) );
- }
-
- // ----------------------------------------------------------------------
- //
- // formatting
- //
- // ----------------------------------------------------------------------
- public String toString() {
- return String.format("%s\t%d\t%d\t%s\t%s\t%s",
- getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
- }
-
- public String toSimpleString() {
- return String.format("%s:%s", name, observedString);
- }
-
- public String toMediumString() {
-
- String s = null;
- if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
- else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
- if ( isSNP() ) s += ": SNP";
- else {
- if ( isInsertion() ) s += ": Insertion";
- else {
- if ( isDeletion() ) s+= ": Deletion";
- else {
- if ( isIndel() ) s+=": Indel";
- else s+=": Reference";
- }
- }
- }
- return s;
- }
-
- public String repl() {
- return String.format("REPL not implemented yet");
- }
-
-/*
- @Override
- public String getAltBasesFWD() {
- if ( ! isSNP() && ! isIndel() ) return emptyStr;
-
- if ( isSNP() ) {
- if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1);
- else return observedAlleles.get(0);
- }
-
- if ( isInsertion() ) {
- if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1);
- else return observedAlleles.get(0);
- }
-
- if ( isDeletion() ) {
- if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0);
- else return observedAlleles.get(1);
-
- }
-
- System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString());
- return null;
- }
-*/
-
- /*
- @Override
- public char getAltSnpFWD() throws IllegalStateException {
- if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP");
- if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0);
- else return observedAlleles.get(0).charAt(0);
-
- }
-
- @Override
- public double getConsensusConfidence() {
- return consensusScore;
- }
-*/
-
- /*
- @Override
- public int getPloidy() throws IllegalStateException {
- return 2; // ???
- }
-*/
-
- @Override
- public double getVariantConfidence() {
- return variantScore;
- }
-
-
- @Override
- public boolean isBiallelic() {
- return nNonref < 2;
- }
+public class rodSAMPileup extends BasicReferenceOrderedDatum implements GenotypeList {
+ // ----------------------------------------------------------------------
+ //
+ // Constructors
+ //
+ // ----------------------------------------------------------------------
+
+ private SAMPileupRecord indelGenotype = null;
+ private SAMPileupRecord pointGenotype = null;
+
+ public rodSAMPileup(final String name) {
+ super(name);
+ }
+
@Override
- public double getConsensusConfidence() {
- return consensusScore;
+ public GenomeLoc getLocation() {
+ if ( pointGenotype != null ) return pointGenotype.getLocation();
+ if ( indelGenotype != null ) return indelGenotype.getLocation();
+ return null;
+ }
+
+ /** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator,
+ * so this method does nothing at all (always returns false).
+ *
+ */
+ @Override
+ public boolean parseLine(Object header, String[] parts) throws IOException {
+ return false;
}
@Override
- public String getFWDRefBases() {
- return refBases;
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ if ( pointGenotype != null ) {
+ b.append(pointGenotype.toString());
+ if ( indelGenotype != null ) b.append('\n');
+ }
+ if ( indelGenotype != null ) b.append(indelGenotype.toString());
+ return b.toString();
}
+
+ @Override
+ public Genotype getIndelGenotype() {
+ return indelGenotype;
+ }
+
+ @Override
+ public Genotype getPointGenotype() {
+ return pointGenotype;
+ }
+
+ @Override
+ public boolean hasIndelGenotype() {
+ return indelGenotype != null;
+ }
+
+ @Override
+ public boolean hasPointGenotype() {
+ return pointGenotype != null;
+ }
+
+ /** Iterates over SAM pileup multiline records: each new invocation of next() will
+ * move to next genomic position in the pileup file, and if the record for that position
+ * consists of multiple lines (as is the case for indels), they all will be read and processed.
+ * @author asivache
+ *
+ */
+ private static class rodSAMPileupIterator implements Iterator {
+ //private xReadLines parser = null;
+ private String rodName = null;
+ private PushbackIterator parser = null;
+
+ rodSAMPileupIterator(String name, java.io.File f) {
+ parser = new PushbackIterator(SAMPileupRecord.createIterator(name,f));
+ rodName = name;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return parser.hasNext();
+ }
+
+ @Override
+ public rodSAMPileup next() {
+
+ rodSAMPileup result = new rodSAMPileup(rodName);
+
+ SAMPileupRecord r = parser.next();
+ //System.out.printf("Line is %s%n", line);
+
+ if ( r.isPointGenotype() ) result.pointGenotype = r;
+ else result.indelGenotype = r;
+
+ if ( parser.hasNext() ) {
+
+ SAMPileupRecord r2 = parser.next();
+ int cmp = r.getLocation().compareTo(r2.getLocation());
+ switch ( cmp ) {
+ case -1 : // next record is at greater position
+ parser.pushback(r2); // return next record back to the stream
+ break;
+ case 0: // next record is at the same position; in this case r MUST be point and r2 MUST be indel
+ if ( result.indelGenotype != null ) { // oops, we've seen indel already
+ if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: point genotype follows indel genotype line at "+r.getLocation() );
+ else throw new StingException("SAM pileup file might be corrupted: two indel genotype lines found at same position "+r.getLocation() );
+ } else {
+ // ok, the first genotype we've seen was a point one
+ if ( r2.isPointGenotype() ) throw new StingException("SAM pileup file might be corrupted: two point genotype lines found at same position "+r.getLocation() );
+ // and the second is an indel, wheww...
+ result.indelGenotype = r2;
+ }
+
+ // the following is solely for the purposes of strict validation of pileup file: since we've already read two lines at the
+ // the same genomic location (point and indel variants), there can not be any more:
+ if ( parser.hasNext() ) {
+ r2 = parser.next();
+ cmp = r.getLocation().compareTo(r2.getLocation() ) ;
+ if ( cmp == 0 ) throw new StingException("SAM pileup file is corrupted: more than two lines found at position "+r.getLocation());
+ if ( cmp > 0 ) throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position "
+ + r.getLocation() );
+ parser.pushback(r2);
+ }
+
+ break;
+ case 1: // next record is at lower position?? come'on!
+ throw new StingException("SAM pileup file or reference dictionary is corrupted: lesser position "+r2.getLocation()+" is encountered right after position " + r.getLocation() );
+ default:
+ throw new StingException("INTERNAL ERROR: this point should never be reached");
+ }
+ }
+ return result ;
+ }
+
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("'remove' operation is not supported for file-backed SAM pileups");
+ }
+
+ }
+
+ public static Iterator createIterator(String name, java.io.File file) {
+ return new rodSAMPileup.rodSAMPileupIterator(name,file);
+ }
+
+ public static void main(String argv[]) {
+ String testFile = "/humgen/gsa-scr1/asivache/TCGA/Ovarian/C2K/0805/normal.pileup";
+// String testFile = "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/mother.chr1.pileup.indel";
+
+ Iterator it = createIterator("test-normal", new java.io.File(testFile));
+
+ ReferenceSequenceFileWalker reference = new ReferenceSequenceFileWalker(
+ new java.io.File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
+// new java.io.File( "/humgen/gsa-scr1/asivache/trios/CEU/NA12891.12892.12878/human_b36_both.fasta")
+ );
+
+ if ( reference.getSequenceDictionary() == null ) {
+ System.out.println("No reference sequence dictionary found. Abort.");
+ System.exit(1);
+ }
+
+ GenomeLoc.setupRefContigOrdering(reference.getSequenceDictionary());
+
+ int counter = 0;
+
+ while ( it.hasNext() && counter < 430 ) {
+ rodSAMPileup p = it.next();
+ System.out.println(p.toString());
+/*
+ System.out.print(p.getLocation().toString());
+ System.out.print('\t');
+
+ if ( p.isIndel() && p.isSNP() ) { System.out.print("Indel+SNP"); }
+ else {
+ if ( p.isSNP() ) { System.out.print("SNP"); }
+ else {
+ if ( p.isIndel() ) { System.out.print("Indel"); }
+ else { System.out.print("REF"); }
+ }
+ }
+
+ System.out.print('\t');
+ System.out.print(p.getFWDAlleles().get(0)+"/"+p.getFWDAlleles().get(1));
+ System.out.print('\t');
+ System.out.println(p.getConsensusConfidence()+"\t"+p.getVariantConfidence());
+ */
+ counter++;
+ }
+ }
+
+
}
+
+