package org.broadinstitute.sting.oneoffprojects.variantcontext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.gatk.refdata.*; import java.util.*; import org.apache.commons.jexl.*; /** * @author ebanks *

* Class VariantContext *

* This class represents a context that unifies one or more variants */ public class VariantContext { private GenomeLoc loc; private Set alleles = new HashSet(); private Set genotypes = new HashSet(); private HashMap attributes = new HashMap(); Type type = null; private double negLog10PError = 0.0; // todo - fixme /** Have we checked this VariantContext already? */ private boolean validatedP = false; // public VariantContext(VariationRod rod) { // // // TODO -- VariationRod should eventually require classes to implement toVariationContext() // // TODO -- (instead of using a temporary adapter class) // // loc = rod.getLocation(); // reference = new Allele(Allele.AlleleType.REFERENCE, rod.getReference()); // // // TODO -- populate the alleles and genotypes through an adapter // alleles = new HashSet(); // genotypes = new HashSet(); // // attributes = new HashMap(); // } // --------------------------------------------------------------------------------------------------------- // // constructors // // --------------------------------------------------------------------------------------------------------- public VariantContext(GenomeLoc loc) { if ( loc == null ) { throw new StingException("GenomeLoc cannot be null"); } this.loc = loc; } protected VariantContext(VariantContext parent, Set genotypes, HashMap attributes) { this(parent.getLocation(), parent.getAlleles(), genotypes, attributes); } public VariantContext(GenomeLoc loc, Set alleles, Set genotypes, HashMap attributes) { this(loc); // todo -- add extensive testing here // todo -- check that exactly one allele is tagged as reference this.alleles = new HashSet(alleles); this.genotypes = new HashSet(genotypes); this.attributes = new HashMap(attributes); } // --------------------------------------------------------------------------------------------------------- // // type operations // // --------------------------------------------------------------------------------------------------------- /** * see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3 * * Format: * dbSNP variation class * Rules for assigning allele classes * Sample allele definition * * Single Nucleotide Polymorphisms (SNPs)a * Strictly defined as single base substitutions involving A, T, C, or G. * A/T * * Deletion/Insertion Polymorphisms (DIPs) * Designated using the full sequence of the insertion as one allele, and either a fully * defined string for the variant allele or a '-' character to specify the deleted allele. * This class will be assigned to a variation if the variation alleles are of different lengths or * if one of the alleles is deleted ('-'). * T/-/CCTA/G * * No-variation * Reports may be submitted for segments of sequence that are assayed and determined to be invariant * in the sample. * (NoVariation) * * Mixed * Mix of other classes * * * Not currently supported: * * Heterozygous sequencea * The term heterozygous is used to specify a region detected by certain methods that do not * resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking * sequence must be provided to define a sequence context for the variation. * (heterozygous) * * Microsatellite or short tandem repeat (STR) * Alleles are designated by providing the repeat motif and the copy number for each allele. * Expansion of the allele repeat motif designated in dbSNP into full-length sequence will * be only an approximation of the true genomic sequence because many microsatellite markers are * not fully sequenced and are resolved as size variants only. * (CAC)8/9/10/11 * * Named variant * Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon * dimorphism for Alu or line elements. These variations frequently include a deletion '-' indicator * for the absent allele. * (alu) / - * * Multi-Nucleotide Polymorphism (MNP) * Assigned to variations that are multi-base variations of a single, common length * GGA/AGT */ public enum Type { NO_VARIATION, SNP, INDEL, MIXED } /** * convenience method for switching over the allele type * * @return the AlleleType of this allele **/ public Type getType() { if ( type == null ) determineType(); return type; } /** * convenience method for SNPs * * @return true if this is a SNP, false otherwise */ public boolean isSNP() { return getType() == Type.SNP; } /** * convenience method for variants * * @return true if this is a variant allele, false if it's reference */ public boolean isVariant() { return getType() != Type.NO_VARIATION; } /** * convenience method for indels * * @return true if this is an indel, false otherwise */ public boolean isIndel() { return getType() == Type.INDEL; } /** * convenience method for indels * * @return true if this is an mixed variation, false otherwise */ public boolean isMixed() { return getType() == Type.MIXED; } // --------------------------------------------------------------------------------------------------------- // // Generic accessors // // --------------------------------------------------------------------------------------------------------- /** * @return the location of this context */ public GenomeLoc getLocation() { return loc; } // --------------------------------------------------------------------------------------------------------- // // Working with alleles // // --------------------------------------------------------------------------------------------------------- /** * @return the reference allele for this context */ public Allele getReference() { for ( Allele allele : getAlleles() ) if ( allele.isReference() ) return allele; throw new StingException("BUG: no reference allele found at " + this); } /** * @return true if the context is strictly bi-allelic */ public boolean isBiallelic() { //return getAlternateAlleles().size() == 1; return getAlleles().size() == 2; } /** * Gets the alleles. This method should return all of the alleles present at the location, * including the reference allele. There are no constraints imposed on the ordering of alleles * in the set. If the reference is not an allele in this context it will not be included. * * @return the set of alleles */ public Set getAlleles() { return alleles; } /** * Gets the alternate alleles. This method should return all the alleles present at the location, * NOT including the reference allele. There are no constraints imposed on the ordering of alleles * in the set. * * @return the set of alternate alleles */ public Set getAlternateAlleles() { HashSet altAlleles = new HashSet(); for ( Allele allele : alleles ) { if ( allele.isNonReference() ) altAlleles.add(allele); } return altAlleles; } // --------------------------------------------------------------------------------------------------------- // // Working with genotypes // // --------------------------------------------------------------------------------------------------------- /** * @return true if the context represents variants with associated genotypes */ public boolean hasGenotypes() { return genotypes.size() > 0; } /** * @return set of all Genotypes associated with this context */ // todo -- genotypes should really be stored as map, not set public Set getGenotypes() { return genotypes; } public Map getGenotypeMap() { HashMap map = new HashMap(); for ( Genotype g : genotypes ) map.put(g.getSample(), g); return map; } /** * @return the set of all sample names in this context */ public Set getSampleNames() { return getGenotypeMap().keySet(); } /** * @param sample the sample name * * @return the Genotype associated with the given sample in this context or null if the sample is not in this context */ public Genotype getGenotype(String sample) { return getGenotypeMap().get(sample); } // --------------------------------------------------------------------------------------------------------- // // Working with attributes // // --------------------------------------------------------------------------------------------------------- // todo -- refactor into AttributedObject and have VariantContext and Genotype inherit from them // todo -- define common attributes as enum /** * Sets the given attribute * * @param key the attribute key * @param value the attribute value */ public void putAttribute(Object key, Object value) { attributes.put(key, value); } public void putAttributes(Map map) { attributes.putAll(map); } public boolean hasAttribute(Object key) { return attributes.containsKey(key); } public int getNumAttributes() { return attributes.size(); } /** * @param key the attribute key * * @return the attribute value for the given key (or null if not set) */ public Object getAttribute(Object key) { return attributes.get(key); } public Object getAttribute(Object key, Object defaultValue) { if ( hasAttribute(key) ) return attributes.get(key); else return defaultValue; } public String getAttributeAsString(Object key) { return (String)getAttribute(key); } public int getAttributeAsInt(Object key) { return (Integer)getAttribute(key); } public double getAttributeAsDouble(Object key) { return (Double)getAttribute(key); } public String getAttributeAsString(Object key, String defaultValue) { return (String)getAttribute(key, defaultValue); } public int getAttributeAsInt(Object key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } public double getAttributeAsDouble(Object key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } /** * @return the attribute map */ public Map getAttributes() { return attributes; } // --------------------------------------------------------------------------------------------------------- // // validation // // --------------------------------------------------------------------------------------------------------- /** * To be called by any modifying routines */ private void invalidate() { validatedP = false; } public boolean validate() { return validate(true); } public boolean validate(boolean throwException) { if ( ! validatedP ) { boolean valid = false; // todo -- add extensive validation checking here if ( valid ) { validatedP = valid; } else if ( throwException ) { throw new StingException(this + " failed validation"); } return valid; } else { return validatedP; } } // --------------------------------------------------------------------------------------------------------- // // utility routines // // --------------------------------------------------------------------------------------------------------- private void determineType() { if ( type == null ) { // todo -- figure out the variation type } } // todo -- toString() method /** * @return true if the context represents point alleles only (i.e. no indels or structural variants) */ // public boolean isPointAllele() { // for ( Allele allele : alleles ) { // if ( allele.isVariant() && !allele.isSNP() ) // return false; // } // return true; // } // // /** // * @return set of all subclasses within this context // */ // public Set getSubclasses() { // Set subclasses = new HashSet(); // for ( Genotype g : genotypes ) // subclasses.addAll(g.getAttributes().keySet()); // return subclasses; // } /** * @param allele the allele to be queried * * @return the frequency of the given allele in this context */ public double getAlleleFrequency(Allele allele) { int alleleCount = 0; int totalCount = 0; for ( Genotype g : genotypes ) { for ( Allele a : g.getAlleles() ) { totalCount++; if ( allele.equals(a) ) alleleCount++; } } return totalCount == 0 ? 0.0 : (double)alleleCount / (double)totalCount; } }