package org.broadinstitute.sting.gatk.refdata; import java.util.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.gatk.refdata.AllelicVariant; /** * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
* * Example format:
* for SNP:
* [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
*
* for indel:
* [chr] [pos] [always *] [alleles] [?] [?] [?] [num reads in the pile] ...
* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
* User: asivache * Date: Apr 03, 2009 * Time: 2:58:33 PM * To change this template use File | Settings | File Templates. */ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant { private static final int NO_VARIANT = -1; private static final int SNP_VARIANT = 0; private static final int INSERTION_VARIANT = 1; private static final int DELETION_VARIANT = 2; private static final int INDEL_VARIANT = 3; // allocate once and don't ever bother creating them again: private static final String baseA = new String("A"); private static final String baseC = new String("C"); private static final String baseG = new String("G"); private static final String baseT = new String("T"); private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions protected GenomeLoc loc; // genome location of SNP // Reference sequence chromosome or scaffold // Start and stop positions in chrom protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) protected String refBases; // the reference base according to NCBI, in the dbSNP file protected String observedString; // store the actual string representation of observed alleles protected List observedAlleles = null; // The sequences of the observed alleles from rs-fasta files protected int varType = NO_VARIANT; protected int ploidy = 2; // how many allelic variants we get? protected int nNonref = 0; // number of non-reference alleles protected int eventLength = 0; protected double consensusScore; protected double variantScore; // ---------------------------------------------------------------------- // // Constructors // // ---------------------------------------------------------------------- public rodSAMPileup(final String name) { super(name); } @Override public void parseLine(final String[] parts) { // 0 1 2 3 4 5 6 7 // * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) // * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 try { String contig = parts[0]; long start = Long.parseLong(parts[1]) ; consensusScore = Double.parseDouble(parts[4]); variantScore = Double.parseDouble(parts[5]); refBaseChar = Character.toUpperCase(parts[2].charAt(0)); parts[3] = parts[3].toUpperCase(); observedString = parts[3]; observedAlleles = new ArrayList(2); if ( refBaseChar == '*' ) { parseIndels(parts[3]) ; if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength); else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! } else { // if the variant is a SNP or a reference base (i.e. no variant at all) assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; refBases = parts[2].toUpperCase(); eventLength = 1; loc = new GenomeLoc(contig, start, start+1); char ch = parts[3].charAt(0); switch ( ch ) { case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; } if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; else { varType = SNP_VARIANT; if ( observedAlleles.get(0).charAt(0) == refBaseChar || observedAlleles.get(1).charAt(0) == refBaseChar ) nNonref = 1; else nNonref = 2; } } } catch ( RuntimeException e ) { System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts)); throw e; } if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant at "+ loc.toString()); } private void parseIndels(String genotype) { String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases // from what we have recorded for the deletion allele (e.g. "-CAC") boolean hasRefAllele = false; for ( int i = 0 ; i < obs.length ; i++ ) { if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { hasRefAllele = true; continue; // we will fill reference allele later } String varBases = obs[i].substring(1).toUpperCase(); switch ( obs[i].charAt(0) ) { case '+': if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; else varType = INSERTION_VARIANT; refBases = emptyStr; break; case '-' : if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; else varType = DELETION_VARIANT; refBases = varBases; // remember what was deleted, this will be saved as "reference allele" break; default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); } observedAlleles.add(varBases); eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! } if ( hasRefAllele ) { // we got at least one ref. allele (out of two recorded) if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; observedAlleles.add(emptyStr); observedAlleles.add(emptyStr); // no inserted/deleted bases at the site! nNonref = 0; // no observations of non-reference allele at all } else { nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty; // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above: observedAlleles.add(emptyStr); } } else { // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions // or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable // such event would be) if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1; else nNonref = 2; } // DONE with indels } public GenomeLoc getLocation() { return loc; } /** Returns bases in the reference allele as a String. String can be empty (as in insertion into * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters * (for longer indels). * * @return reference allele, forward strand */ public String getRefBasesFWD() { return refBases; } /** * Returns reference (major) allele base for a SNP variant as a character; should throw IllegalStateException * if variant is not a SNP. * * @return reference base on the forward strand */ public char getRefSnpFWD() throws IllegalStateException { if ( isIndel() ) throw new IllegalStateException("Variant is not a SNP"); return refBaseChar; } @Override public List getGenotype() { return observedAlleles; } // ---------------------------------------------------------------------- // // What kind of variant are we? // // ---------------------------------------------------------------------- public boolean isSNP() { return varType == SNP_VARIANT ; } public boolean isInsertion() { return varType == INSERTION_VARIANT; } public boolean isDeletion() { return varType == DELETION_VARIANT ; } public boolean isIndel() { return isInsertion() || isDeletion() || varType == INDEL_VARIANT; } public boolean isReference() { return varType == NO_VARIANT; } // ---------------------------------------------------------------------- // // formatting // // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s", getContig(), getStart(), getStop(), name, refBases, observedString); } public String toSimpleString() { return String.format("%s:%s", name, observedString); } public String toMediumString() { String s = String.format("%s:%s:%s", getLocation().toString(), name, observedString); if ( isSNP() ) s += ": SNP"; else { if ( isInsertion() ) s += ": Insertoin"; else { if ( isDeletion() ) s+= ": Deletion"; else { if ( isIndel() ) s+=": Indel"; else s+=": Reference"; } } } return s; } public String repl() { return String.format("REPL not implemented yet"); } @Override public String getAltBasesFWD() { if ( ! isSNP() && ! isIndel() ) return emptyStr; if ( isSNP() ) { if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1); else return observedAlleles.get(0); } if ( isInsertion() ) { if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1); else return observedAlleles.get(0); } if ( isDeletion() ) { if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0); else return observedAlleles.get(1); } System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString()); return null; } @Override public char getAltSnpFWD() throws IllegalStateException { if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP"); if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0); else return observedAlleles.get(0).charAt(0); } @Override public double getConsensusConfidence() { return consensusScore; } @Override public double getMAF() { if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site"); if ( isSNP() || isIndel() ) { if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0; else return 0.5; } return 0; } @Override public int getPloidy() throws IllegalStateException { return 2; // ??? } @Override public double getVariationConfidence() { return variantScore; } @Override public boolean isGenotype() { return true; } @Override public boolean isBiallelic() { return nNonref < 2; } }