diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java new file mode 100644 index 000000000..b81919a70 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java @@ -0,0 +1,327 @@ +package org.broadinstitute.sting.gatk.refdata; + + +import java.util.*; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.gatk.refdata.AllelicVariant; + +/** + * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
+ * + * Example format:
+ * for SNP:
+ * [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
+ * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
+ *
+ * for indel:
+ * [chr] [pos] [always *] [alleles] [?] [?] [?] [num reads in the pile] ...
+ * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
+ * User: asivache + * Date: Apr 03, 2009 + * Time: 2:58:33 PM + * To change this template use File | Settings | File Templates. + */ +public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant { + private static final int NO_VARIANT = -1; + private static final int SNP_VARIANT = 0; + private static final int INSERTION_VARIANT = 1; + private static final int DELETION_VARIANT = 2; + private static final int INDEL_VARIANT = 3; + + // allocate once and don't ever bother creating them again: + private static final String baseA = new String("A"); + private static final String baseC = new String("C"); + private static final String baseG = new String("G"); + private static final String baseT = new String("T"); + private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions + + protected GenomeLoc loc; // genome location of SNP + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom + + + protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) + protected String refBases; // the reference base according to NCBI, in the dbSNP file + protected String observedString; // store the actual string representation of observed alleles + + protected List observedAlleles = null; // The sequences of the observed alleles from rs-fasta files + protected int varType = NO_VARIANT; + protected int ploidy = 2; // how many allelic variants we get? + protected int nNonref = 0; // number of non-reference alleles + protected int eventLength = 0; + + protected double consensusScore; + protected double variantScore; + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + public rodSAMPileup(final String name) { + super(name); + } + + @Override + public void parseLine(final String[] parts) { +// 0 1 2 3 4 5 6 7 +// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) +// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 + try { + + String contig = parts[0]; + long start = Long.parseLong(parts[1]) ; + + consensusScore = Double.parseDouble(parts[4]); + variantScore = Double.parseDouble(parts[5]); + + refBaseChar = Character.toUpperCase(parts[2].charAt(0)); + + parts[3] = parts[3].toUpperCase(); + observedString = parts[3]; + + observedAlleles = new ArrayList(2); + + if ( refBaseChar == '*' ) { + parseIndels(parts[3]) ; + if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength); + else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! + } + else { + // if the variant is a SNP or a reference base (i.e. no variant at all) + assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; + refBases = parts[2].toUpperCase(); + eventLength = 1; + loc = new GenomeLoc(contig, start, start+1); + + char ch = parts[3].charAt(0); + + switch ( ch ) { + case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; + case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; + case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; + case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; + case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; + case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; + case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; + case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; + case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; + case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; + } + if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; + else { + varType = SNP_VARIANT; + if ( observedAlleles.get(0).charAt(0) == refBaseChar || observedAlleles.get(1).charAt(0) == refBaseChar ) nNonref = 1; + else nNonref = 2; + } + } + } catch ( RuntimeException e ) { + System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts)); + throw e; + } + if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant at "+ loc.toString()); + } + + + private void parseIndels(String genotype) { + String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit + + // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; + // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases + // from what we have recorded for the deletion allele (e.g. "-CAC") + boolean hasRefAllele = false; + + for ( int i = 0 ; i < obs.length ; i++ ) { + if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { + hasRefAllele = true; + continue; // we will fill reference allele later + } + + String varBases = obs[i].substring(1).toUpperCase(); + + switch ( obs[i].charAt(0) ) { + case '+': + if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; + else varType = INSERTION_VARIANT; + refBases = emptyStr; + break; + case '-' : + if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; + else varType = DELETION_VARIANT; + refBases = varBases; // remember what was deleted, this will be saved as "reference allele" + break; + default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); + } + observedAlleles.add(varBases); + eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! + } + if ( hasRefAllele ) { + // we got at least one ref. allele (out of two recorded) + if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; + observedAlleles.add(emptyStr); + observedAlleles.add(emptyStr); // no inserted/deleted bases at the site! + nNonref = 0; // no observations of non-reference allele at all + } else { + nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left + + // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty; + // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above: + observedAlleles.add(emptyStr); + } + } else { + // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we + // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions + // or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable + // such event would be) + if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1; + else nNonref = 2; + } + // DONE with indels + + } + + + public GenomeLoc getLocation() { return loc; } + + /** Returns bases in the reference allele as a String. String can be empty (as in insertion into + * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters + * (for longer indels). + * + * @return reference allele, forward strand + */ + public String getRefBasesFWD() { + return refBases; + } + + /** + * Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException + * if variant is not a SNP. + * + * @return reference base on the forward strand + */ + public char getRefSnpFWD() throws IllegalStateException { + if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP"); + return refBaseChar; + } + + @Override + public List getGenotype() { + return observedAlleles; + } + + // ---------------------------------------------------------------------- + // + // What kind of variant are we? + // + // ---------------------------------------------------------------------- + public boolean isSNP() { return varType == SNP_VARIANT ; } + public boolean isInsertion() { return varType == INSERTION_VARIANT; } + public boolean isDeletion() { return varType == DELETION_VARIANT ; } + public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; } + public boolean isReference() { return varType == NO_VARIANT; } + + // ---------------------------------------------------------------------- + // + // formatting + // + // ---------------------------------------------------------------------- + public String toString() { + return String.format("%s\t%d\t%d\t%s\t%s\t%s", + getContig(), getStart(), getStop(), name, refBases, observedString); + } + + public String toSimpleString() { + return String.format("%s:%s", name, observedString); + } + + public String toMediumString() { + String s = String.format("%s:%s:%s", getLocation().toString(), name, observedString); + if ( isSNP() ) s += ": SNP"; + else { + if ( isInsertion() ) s += ": Insertoin"; + else { + if ( isDeletion() ) s+= ": Deletion"; + else { + if ( isIndel() ) s+=": Indel"; + else s+=": Reference"; + } + } + } + return s; + } + + public String repl() { + return String.format("REPL not implemented yet"); + } + + + @Override + public String getAltBasesFWD() { + if ( ! isSNP() && ! isIndel() ) return emptyStr; + + if ( isSNP() ) { + if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1); + else return observedAlleles.get(0); + } + + if ( isInsertion() ) { + if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1); + else return observedAlleles.get(0); + } + + if ( isDeletion() ) { + if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0); + else return observedAlleles.get(1); + + } + + System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString()); + return null; + } + + @Override + public char getAltSnpFWD() throws IllegalStateException { + if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP"); + if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0); + else return observedAlleles.get(0).charAt(0); + + } + + @Override + public double getConsensusConfidence() { + return consensusScore; + } + + + @Override + public double getMAF() { + if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site"); + if ( isSNP() || isIndel() ) { + if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0; + else return 0.5; + } + return 0; + } + + @Override + public int getPloidy() throws IllegalStateException { + return 2; // ??? + } + + @Override + public double getVariationConfidence() { + return variantScore; + } + + @Override + public boolean isGenotype() { + return true; + } + + @Override + public boolean isBiallelic() { + return nNonref < 2; + } +}