From f6bca7873c28596518f42fef04077ebd1947075b Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 28 Jan 2010 04:10:16 +0000 Subject: [PATCH] V3 of VariantContext. Support for Genotypes and NO_CALL alleles. QUAL fields fully implemented. Can parse VCF records and dbSNP. More complete validation. Detailed testing routines for VariantContext and Allele. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2718 348d0f76-0448-11de-a6fe-93d51630548a --- .../oneoffprojects/variantcontext/Allele.java | 58 +++- .../variantcontext/AttributedObject.java | 59 +++- .../variantcontext/Genotype.java | 171 +++++++---- .../TestVariantContextWalker.java | 34 ++- .../variantcontext/VariantContext.java | 276 +++++++++++------- .../VariantContextAdaptors.java | 84 +++++- .../variantcontext/VariantContextUtils.java | 4 +- .../variantcontext/AlleleTest.java | 25 +- .../variantcontext/VariantContextTest.java | 170 +++++++++-- 9 files changed, 637 insertions(+), 244 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java index c508a845e..de72ca049 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Allele.java @@ -64,29 +64,66 @@ import java.util.Arrays; * If you know where allele is the reference, you can determine whether the variant is an insertion or deletion */ public class Allele { - private static final byte[] NULL_ALLELE_BASES = new byte[0]; + private static final byte[] EMPTY_ALLELE_BASES = new byte[0]; +// private static final byte[] NULL_ALLELE_BASES = new byte[0]; +// private static final byte[] NO_CALL_ALLELE_BASES = ".".getBytes(); private boolean isRef = false; + private boolean isNull = false; + private boolean isNoCall = false; + private byte[] bases = null; + public final static Allele NO_CALL = new Allele("."); + public Allele(byte[] bases, boolean isRef) { if ( bases == null ) throw new IllegalArgumentException("Constructor: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele"); // standardize our representation of null allele and bases - if ( (bases.length == 1 && bases[0] == '-') || bases.length == 0) - bases = NULL_ALLELE_BASES; + if ( wouldBeNullAllele(bases) ) { + bases = EMPTY_ALLELE_BASES; + isNull = true; + } + if ( wouldBeNoCallAllele(bases) ) { + bases = EMPTY_ALLELE_BASES; + isNoCall = true; + if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele"); + } else bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance this.isRef = isRef; - this.bases = bases; + + if ( ! acceptableAlleleBases(bases) ) + throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); + } + + public final static boolean wouldBeNullAllele(byte[] bases) { + return (bases.length == 1 && bases[0] == '-') || bases.length == 0; + } + + public final static boolean wouldBeNoCallAllele(byte[] bases) { + return bases.length == 1 && bases[0] == '.'; + } + + + public final static boolean acceptableAlleleBases(String bases) { + return acceptableAlleleBases(bases.getBytes()); + } + + public final static boolean acceptableAlleleBases(byte[] bases) { + if ( (bases.length == 1 && bases[0] == '-') || bases.length == 0) + return true; + for ( byte b : bases ) { if ( ! BaseUtils.isRegularBase(b) ) { - throw new IllegalArgumentException("Unexpected base in allele bases " + new String(bases)); + return false; } } + + return true; } /** null allele creation method */ @@ -107,14 +144,17 @@ public class Allele { // accessor routines // // - public boolean isNullAllele() { return length() == 0; } - public boolean isNonNullAllele() { return ! isNullAllele(); } + public boolean isNull() { return isNull; } + public boolean isNonNull() { return ! isNull(); } + + public boolean isNoCall() { return isNoCall; } + public boolean isCalled() { return ! isNoCall(); } public boolean isReference() { return isRef; } public boolean isNonReference() { return ! isReference(); } public String toString() { - return isNullAllele() ? "-" : new String(getBases()) + ( isReference() ? "*" : ""); + return (isNull() ? "-" : ( isNoCall() ? "." : new String(getBases()))) + (isReference() ? "*" : ""); } /** @@ -131,7 +171,7 @@ public class Allele { * @return true if these alleles are equal */ public boolean equals(Allele other) { - return isRef == other.isRef && this.basesMatch(other.getBases()); + return isRef == other.isRef && isNull == other.isNull && isNoCall == other.isNoCall && this.basesMatch(other.getBases()); } // todo -- notice case insensitivity diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/AttributedObject.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/AttributedObject.java index 8aa3cbfd2..a433fc667 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/AttributedObject.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/AttributedObject.java @@ -7,23 +7,58 @@ import java.util.*; /** - * @author ebanks + * @author depristo *

- * Class VariantContext + * Class AttributedObject *

- * This class represents a context that unifies one or more variants + * Common functions in VariantContext */ public class AttributedObject { + public static final double NO_NEG_LOG_10PERROR = 0.0; + private double negLog10PError = NO_NEG_LOG_10PERROR; + private Map attributes = new HashMap(); - public AttributedObject() { - ; + public AttributedObject() { } + + public AttributedObject(double negLog10PError) { + setNegLog10PError(negLog10PError); } - public AttributedObject(Map attributes) { + public AttributedObject(Map attributes) { setAttributes(attributes); } + public AttributedObject(Map attributes, double negLog10PError) { + this(attributes); + + setNegLog10PError(negLog10PError); + } + + + // --------------------------------------------------------------------------------------------------------- + // + // Working with log error rates + // + // --------------------------------------------------------------------------------------------------------- + + public boolean hasNegLog10PError() { + return getNegLog10PError() == NO_NEG_LOG_10PERROR; + } + + /** + * @return the -1 * log10-based error estimate + */ + public double getNegLog10PError() { return negLog10PError; } + + public void setNegLog10PError(double negLog10PError) { + if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); + if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity"); + if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN"); + + this.negLog10PError = negLog10PError; + } + // --------------------------------------------------------------------------------------------------------- // // Working with attributes @@ -42,9 +77,9 @@ public class AttributedObject { // todo -- define common attributes as enum - public void setAttributes(Map map) { + public void setAttributes(Map map) { this.attributes.clear(); - putAttributes(attributes); + putAttributes(map); } public void putAttribute(Object key, Object value) { @@ -62,9 +97,11 @@ public class AttributedObject { this.attributes.remove(key); } - public void putAttributes(Map map) { - for ( Map.Entry elt : attributes.entrySet() ) { - putAttribute(elt.getKey(), elt.getValue()); + public void putAttributes(Map map) { + if ( map != null ) { + for ( Map.Entry elt : map.entrySet() ) { + putAttribute(elt.getKey(), elt.getValue()); + } } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java index f508ebfcc..26c5ea3cc 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/Genotype.java @@ -10,106 +10,153 @@ import java.util.*; * This class emcompasses all the basic information about a genotype */ public class Genotype extends AttributedObject { - private List alleles; + private List alleles = new ArrayList(); + private String sampleName = null; - private double negLog10PError; + public Genotype(VariantContext vc, List alleles, String sampleName, double negLog10PError) { + this(resolveAlleles(vc, alleles), sampleName, negLog10PError); + } - private String sample; - - public Genotype(List alleles, String sample, double negLog10PError) { - this.alleles = new ArrayList(alleles); - this.sample = sample; - this.negLog10PError = negLog10PError; + public Genotype(List alleles, String sampleName, double negLog10PError) { + setAlleles(alleles); + setSampleName(sampleName); + setNegLog10PError(negLog10PError); } /** * @return the alleles for this genotype */ - public List getAlleles() { return alleles; } + public List getAlleles() { + return alleles; + } + + public List getAlleles(Allele allele) { + List al = new ArrayList(); + for ( Allele a : alleles ) + if ( a.equals(allele) ) + al.add(a); + + return al; + } + /** * @return the ploidy of this genotype */ public int getPloidy() { return alleles.size(); } + public enum Type { + NO_CALL, + HOM_REF, + HET, + HOM_VAR + } + + public Type getType() { + Allele firstAllele = alleles.get(0); + + if ( firstAllele.isNoCall() ) { + return Type.NO_CALL; + } + + for (Allele a : alleles) { + if ( ! firstAllele.equals(a) ) + return Type.HET; + } + return firstAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR; + } + /** * @return true if all observed alleles are the same (regardless of whether they are ref or alt) */ - public boolean isHom() { - - // do not assume ploidy == 2 - - if ( alleles.size() == 0 ) - return true; - - Allele first = alleles.get(0); - for ( int i = 1; i < alleles.size(); i++ ) { - if ( !first.equals(alleles.get(i)) ) - return false; - } - - return true; - } + public boolean isHom() { return getType() == Type.HOM_REF || getType() == Type.HOM_VAR; } /** * @return true if we're het (observed alleles differ) */ - public boolean isHet() { return !isHom(); } + public boolean isHet() { return getType() == Type.HET; } /** * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) */ - public boolean isNoCall() { - // TODO -- implement me - return false; - } + public boolean isNoCall() { return getType() == Type.NO_CALL; } - /** - * @return true if all alleles for this genotype are SNPs or reference - */ -// public boolean isPointGenotype() { -//// for ( Allele allele : alleles ) { -//// if ( allele.isVariant() && !allele.isSNP() ) -//// return false; -//// } -// return true; +// /** +// * @return true if this is a variant genotype, false if it's reference +// */ +// public boolean isVariant() { +// for ( Allele allele : alleles ) { +// if ( allele.isNonReference() ) +// return true; +// } +// return false; // } - /** - * @return true if this is a variant genotype, false if it's reference - */ - public boolean isVariant() { - for ( Allele allele : alleles ) { - if ( allele.isNonReference() ) - return true; - } - return false; - } - - /** - * @return the -1 * log10-based error estimate - */ - public double getNegLog10PError() { return negLog10PError; } - /** * @return the sample name */ - public String getSample() { return sample; } + public String getSampleName() { + return sampleName; + } /** * Sets the sample name * - * @param sample the sample name + * @param sampleName the sample name */ - public void setSample(String sample) { - this.sample = sample; + public void setSampleName(String sampleName) { + if ( sampleName == null ) throw new IllegalArgumentException("Sample name cannot be null " + this); + this.sampleName = sampleName; } /** - * @param ref the reference base * - * @return this genotype as a variation + * @param alleles */ - // TODO -- implement me - // public Variation toVariation(char ref); + public void setAlleles(List alleles) { + this.alleles.clear(); + + // todo -- add validation checking here + + if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles"); + if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); + + int nNoCalls = 0; + for ( Allele allele : alleles ) { nNoCalls += allele.isNoCall() ? 1 : 0; } + if ( nNoCalls > 0 && nNoCalls != alleles.size() ) + throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); + + for ( Allele allele : alleles ) { + addAllele(allele); + } + } + + public void addAllele(Allele allele) { + if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); + this.alleles.add(allele); + } + + + private static List resolveAlleles(VariantContext vc, List alleleStrings) { + List alleles = new ArrayList(); + for ( String alleleString : alleleStrings ) { + Allele allele = vc.getAllele(alleleString); + + if ( allele == null ) { + if ( Allele.wouldBeNoCallAllele(alleleString.getBytes()) ) { + allele = new Allele(alleleString); + } else { + throw new IllegalArgumentException("Allele " + alleleString + " not present in the list of alleles in VariantContext " + vc); + } + } + + alleles.add(allele); + } + + return alleles; + } + + public String toString() { + return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), getAlleles(), getType(), getNegLog10PError(), getAttributes()); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/TestVariantContextWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/TestVariantContextWalker.java index 0a47ccd33..74c790efb 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/TestVariantContextWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/TestVariantContextWalker.java @@ -2,10 +2,7 @@ package org.broadinstitute.sting.oneoffprojects.variantcontext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RODRecordList; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.*; @@ -15,20 +12,35 @@ import org.broadinstitute.sting.utils.*; public class TestVariantContextWalker extends RodWalker { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - GenomeLoc cur = context.getLocation(); - if ( ref == null ) return 0; else { RODRecordList dbsnpList = tracker.getTrackData("dbsnp", null); - if (dbsnpList == null) - return 0; - else { + if (dbsnpList != null) { + // do dbSNP conversion int n = 0; for (ReferenceOrderedDatum d : dbsnpList) { rodDbSNP dbsnpRecord = (rodDbSNP)d; - VariantContext vc = VariantContextAdaptors.dbsnp2VariantContext(dbsnpRecord); + if ( dbsnpRecord.getLocation().getStart() == context.getLocation().getStart() ) { + VariantContext vc = VariantContextAdaptors.dbsnp2VariantContext(dbsnpRecord); + if ( vc != null ) { + n++; + System.out.printf("%s%n", vc); + } + } + } + + return n; + } + + RODRecordList vcfList = tracker.getTrackData("vcf", null); + if (vcfList != null) { + // do vcf conversion + int n = 0; + for (ReferenceOrderedDatum d : vcfList) { + RodVCF vcfRecord = (RodVCF)d; + VariantContext vc = VariantContextAdaptors.vcf2VariantContext(vcfRecord); if ( vc != null ) { n++; System.out.printf("%s%n", vc); @@ -37,6 +49,8 @@ public class TestVariantContextWalker extends RodWalker { return n; } + + return 0; } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java index f4de1b8f8..2cbd76239 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContext.java @@ -2,10 +2,8 @@ package org.broadinstitute.sting.oneoffprojects.variantcontext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.gatk.refdata.*; import java.util.*; -import org.apache.commons.jexl.*; /** @@ -22,26 +20,9 @@ public class VariantContext extends AttributedObject { private Map genotypes = new HashMap(); - Type type = null; + Type type = Type.UNDETERMINED; - // todo -- add QUAL and FILTER - - //private double negLog10PError = 0.0; // todo - fixme - -// public VariantContext(VariationRod rod) { -// -// // TODO -- VariationRod should eventually require classes to implement toVariationContext() -// // TODO -- (instead of using a temporary adapter class) -// -// loc = rod.getLocation(); -// reference = new Allele(Allele.AlleleType.REFERENCE, rod.getReference()); -// -// // TODO -- populate the alleles and genotypes through an adapter -// alleles = new HashSet(); -// genotypes = new HashSet(); -// -// attributes = new HashMap(); -// } + // todo -- add FILTER // --------------------------------------------------------------------------------------------------------- // @@ -166,7 +147,10 @@ public class VariantContext extends AttributedObject { NO_VARIATION, SNP, INDEL, - MIXED + MIXED, + + // special types + UNDETERMINED // do not use this value, it is reserved for the VariantContext itself } /** @@ -175,7 +159,7 @@ public class VariantContext extends AttributedObject { * @return the AlleleType of this allele **/ public Type getType() { - if ( type == null ) + if ( type == Type.UNDETERMINED ) determineType(); return type; @@ -202,9 +186,19 @@ public class VariantContext extends AttributedObject { */ public boolean isIndel() { return getType() == Type.INDEL; } - // todo -- implement, looking at reference allele - //public boolean isInsertion() { return getType() == Type.INDEL; } - //public boolean isDeletion() { return getType() == Type.INDEL; } + /** + * @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null) + */ + public boolean isInsertion() { + return getType() == Type.INDEL && getReference().isNull(); + } + + /** + * @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null) + */ + public boolean isDeletion() { + return getType() == Type.INDEL && ! isInsertion(); + } /** * convenience method for indels @@ -248,7 +242,6 @@ public class VariantContext extends AttributedObject { return null; } - /** * @return true if the context is strictly bi-allelic */ @@ -260,6 +253,48 @@ public class VariantContext extends AttributedObject { return alleles.size(); } + public Allele getAllele(String allele) { + return getAllele(allele.getBytes()); + } + + public Allele getAllele(byte[] allele) { + for ( Allele a : getAlleles() ) { + if ( a.basesMatch(allele) ) { + return a; + } + } + + return null; // couldn't find anything + } + + public boolean hasAllele(Allele allele) { + for ( Allele a : getAlleles() ) { + if ( a.equals(allele) ) + return true; + } + + return false; + } + +// public int getMaxAlleleLength() { +// int max = 0; +// +// for ( Allele a : getAlleles() ) { +// max = a.length() > max ? a.length() : max; +// } +// +// return max; +// } +// +// public int getMinAlleleLength() { +// int min = Integer.MAX_VALUE; +// +// for ( Allele a : getAlleles() ) { +// min = a.length() < min ? a.length() : min; +// } +// +// return min; +// } /** * Gets the alleles. This method should return all of the alleles present at the location, @@ -287,15 +322,15 @@ public class VariantContext extends AttributedObject { return altAlleles; } - public Allele getAlternateAllele(int count) { + public Allele getAlternateAllele(int i) { int n = 0; for ( Allele allele : alleles ) { - if ( allele.isNonReference() && n++ == count ) + if ( allele.isNonReference() && n++ == i ) return allele; } - throw new IllegalArgumentException("Requested " + count + " alternative allele but there are only " + n + " alternative alleles " + this); + throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); } @@ -310,6 +345,8 @@ public class VariantContext extends AttributedObject { } public void addAllele(Allele allele, boolean allowDuplicates) { + type = Type.UNDETERMINED; + for ( Allele a : alleles ) { if ( a.basesMatch(allele) && ! allowDuplicates ) throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this); @@ -349,8 +386,13 @@ public class VariantContext extends AttributedObject { * @return */ public int getChromosomeCount() { - // todo -- return the number of ! no_call alleles - return 0; + int n = 0; + + for ( Genotype g : getGenotypes().values() ) { + n += g.isNoCall() ? 0 : g.getPloidy(); + } + + return n; } /** @@ -360,8 +402,13 @@ public class VariantContext extends AttributedObject { * @return */ public int getChromosomeCount(Allele a) { - // todo -- walk through genotypes and count genotypes with allele - return 0; + int n = 0; + + for ( Genotype g : getGenotypes().values() ) { + n += g.getAlleles(a).size(); + } + + return n; } /** @@ -400,7 +447,7 @@ public class VariantContext extends AttributedObject { this.genotypes.clear(); for ( Genotype g : genotypes ) { - addGenotype(g.getSample(), g); + addGenotype(g.getSampleName(), g); } } @@ -412,11 +459,21 @@ public class VariantContext extends AttributedObject { } } - public void addGenotype(Genotype genotype) { - addGenotype(genotype.getSample(), genotype, false); + public void addGenotypes(Map genotypes) { + for ( Map.Entry g : genotypes.entrySet() ) + addGenotype(g.getKey(), g.getValue()); } + public void addGenotypes(Collection genotypes) { + for ( Genotype g : genotypes ) + addGenotype(g); + } + + public void addGenotype(Genotype genotype) { + addGenotype(genotype.getSampleName(), genotype, false); + } + public void addGenotype(String sampleName, Genotype genotype) { addGenotype(sampleName, genotype, false); } @@ -425,29 +482,23 @@ public class VariantContext extends AttributedObject { if ( hasGenotype(sampleName) && ! allowOverwrites ) throw new StingException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this); - if ( ! sampleName.equals(genotype.getSample()) ) + if ( ! sampleName.equals(genotype.getSampleName()) ) throw new StingException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype); this.genotypes.put(sampleName, genotype); } public void removeGenotype(String sampleName) { + if ( ! this.genotypes.containsKey(sampleName) ) + throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes); + this.genotypes.remove(sampleName); } public void removeGenotype(Genotype genotype) { - removeGenotype(genotype.getSample()); + removeGenotype(genotype.getSampleName()); } - // --------------------------------------------------------------------------------------------------------- - // - // Working with attributes - // - // --------------------------------------------------------------------------------------------------------- - - // todo -- define common attributes as enum - - // --------------------------------------------------------------------------------------------------------- // // validation @@ -464,27 +515,9 @@ public class VariantContext extends AttributedObject { } public boolean validate(boolean throwException) { - // todo -- add extensive testing here - // todo -- check that exactly one allele is tagged as reference - // todo -- check that there's only one null allele - try { - // check alleles - boolean alreadySeenRef = false, alreadySeenNull = false; - for ( Allele allele : alleles ) { - if ( allele.isReference() ) { - if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); - alreadySeenRef = true; - } - - if ( allele.isNullAllele() ) { - if ( alreadySeenNull ) throw new IllegalArgumentException("BUG: Received two null alleles in VariantContext " + alleles + " this=" + this); - alreadySeenNull = true; - } - } - - if ( ! alreadySeenRef ) - throw new IllegalArgumentException("No reference allele found in VariantContext"); + validateAlleles(); + validateGenotypes(); } catch ( IllegalArgumentException e ) { if ( throwException ) throw e; @@ -495,6 +528,58 @@ public class VariantContext extends AttributedObject { return true; } + private void validateAlleles() { + // check alleles + boolean alreadySeenRef = false, alreadySeenNull = false; + for ( Allele allele : alleles ) { + // make sure there's only one reference allele + if ( allele.isReference() ) { + if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); + alreadySeenRef = true; + } + + if ( allele.isNoCall() ) { + throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this); + } + + // make sure there's only one null allele + if ( allele.isNull() ) { + if ( alreadySeenNull ) throw new IllegalArgumentException("BUG: Received two null alleles in VariantContext " + alleles + " this=" + this); + alreadySeenNull = true; + } + } + + // make sure there's one reference allele + if ( ! alreadySeenRef ) + throw new IllegalArgumentException("No reference allele found in VariantContext"); + + if ( getType() == Type.INDEL ) { +// if ( getReference().length() != (getLocation().size()-1) ) { + if ( (getReference().isNull() && getLocation().size() != 1 ) || + (getReference().isNonNull() && getReference().length() != getLocation().size()) ) { + throw new IllegalStateException("BUG: GenomeLoc " + getLocation() + " has a size == " + getLocation().size() + " but the variation reference allele has length " + getReference().length() + " this = " + this); + } + } + } + + private void validateGenotypes() { + if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); + + for ( Map.Entry elt : this.genotypes.entrySet() ) { + String name = elt.getKey(); + Genotype g = elt.getValue(); + + if ( ! name.equals(g.getSampleName()) ) throw new IllegalStateException("Bound sample name " + name + " does not equal the name of the genotype " + g.getSampleName()); + + for ( Allele gAllele : g.getAlleles() ) { + if ( ! hasAllele(gAllele) && gAllele.isCalled() ) + throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); + } + } + } + + + // --------------------------------------------------------------------------------------------------------- // // utility routines @@ -502,7 +587,7 @@ public class VariantContext extends AttributedObject { // --------------------------------------------------------------------------------------------------------- private void determineType() { - if ( type == null ) { + if ( type == Type.UNDETERMINED ) { if ( alleles.size() == 0 ) { throw new StingException("Unexpected requested type of VariantContext with no alleles!" + this); } else if ( alleles.size() == 1 ) { @@ -540,53 +625,30 @@ public class VariantContext extends AttributedObject { Allele a2 = it.next(); return a1.length() != a2.length(); } - public String toString() { return String.format("[VC @ %s of type=%s alleles=%s attr=%s GT=%s", - getLocation(), this.getType(), this.getAlleles(), this.getAttributes(), this.getGenotypes()); + getLocation(), this.getType(), this.getAlleles(), this.getAttributes(), this.getGenotypes().values()); } - /** - * @return true if the context represents point alleles only (i.e. no indels or structural variants) - */ -// public boolean isPointAllele() { -// for ( Allele allele : alleles ) { -// if ( allele.isVariant() && !allele.isSNP() ) -// return false; -// } -// return true; -// } -// - -// /** -// * @return set of all subclasses within this context -// */ -// public Set getSubclasses() { -// Set subclasses = new HashSet(); -// for ( Genotype g : genotypes ) -// subclasses.addAll(g.getAttributes().keySet()); -// return subclasses; -// } - // todo -- move to utils /** * @param allele the allele to be queried * * @return the frequency of the given allele in this context */ - public double getAlleleFrequency(Allele allele) { - int alleleCount = 0; - int totalCount = 0; - - for ( Genotype g : getGenotypes().values() ) { - for ( Allele a : g.getAlleles() ) { - totalCount++; - if ( allele.equals(a) ) - alleleCount++; - } - } - - return totalCount == 0 ? 0.0 : (double)alleleCount / (double)totalCount; - } +// public double getAlleleFrequency(Allele allele) { +// int alleleCount = 0; +// int totalCount = 0; +// +// for ( Genotype g : getGenotypes().values() ) { +// for ( Allele a : g.getAlleles() ) { +// totalCount++; +// if ( allele.equals(a) ) +// alleleCount++; +// } +// } +// +// return totalCount == 0 ? 0.0 : (double)alleleCount / (double)totalCount; +// } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextAdaptors.java index 66df6eeee..6906301b6 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextAdaptors.java @@ -1,24 +1,86 @@ package org.broadinstitute.sting.oneoffprojects.variantcontext; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; +import java.util.*; public class VariantContextAdaptors { public static VariantContext dbsnp2VariantContext(rodDbSNP dbsnp) { - VariantContext vc = new VariantContext(dbsnp.getLocation()); + if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) { + VariantContext vc = new VariantContext(dbsnp.getLocation()); - // add the reference allele - Allele refAllele = new Allele(dbsnp.getReference(), true); - vc.addAllele(refAllele); + // add the reference allele + if ( ! Allele.acceptableAlleleBases(dbsnp.getReference()) ) { + System.out.printf("Excluding dbsnp record %s%n", dbsnp); + return null; + } - // add all of the alt alleles - for ( String alt : dbsnp.getAlternateAlleleList() ) - vc.addAllele(new Allele(alt, false)); + Allele refAllele = new Allele(dbsnp.getReference(), true); + vc.addAllele(refAllele); - return vc; + // add all of the alt alleles + for ( String alt : dbsnp.getAlternateAlleleList() ) { + if ( ! Allele.acceptableAlleleBases(alt) ) { + System.out.printf("Excluding dbsnp record %s%n", dbsnp); + return null; + } + vc.addAllele(new Allele(alt, false)); + } + + vc.validate(); + return vc; + } else + return null; // can't handle anything else + } + + public static VariantContext vcf2VariantContext(RodVCF vcf) { + if ( vcf.isSNP() || vcf.isIndel() ) { + VariantContext vc = new VariantContext(vcf.getLocation()); + + // add the reference allele + if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) { + System.out.printf("Excluding vcf record %s%n", vcf); + return null; + } + + Allele refAllele = new Allele(vcf.getReference(), true); + vc.addAllele(refAllele); + vc.setNegLog10PError(vcf.getNegLog10PError()); + vc.setAttributes(vcf.getInfoValues()); + vc.putAttribute("ID", vcf.getID()); + vc.putAttribute("FILTER", vcf.getFilterString()); + + // add all of the alt alleles + for ( String alt : vcf.getAlternateAlleleList() ) { + if ( ! Allele.acceptableAlleleBases(alt) ) { + System.out.printf("Excluding vcf record %s%n", vcf); + return null; + } + vc.addAllele(new Allele(alt, false)); + } + + for ( VCFGenotypeRecord vcfG : vcf.getVCFGenotypeRecords() ) { + List alleleStrings = new ArrayList(); + for ( VCFGenotypeEncoding s : vcfG.getAlleles() ) + alleleStrings.add(s.getBases()); + + double pError = vcfG.getNegLog10PError() == VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY ? AttributedObject.NO_NEG_LOG_10PERROR : vcfG.getNegLog10PError(); + Genotype g = new Genotype(vc, alleleStrings, vcfG.getSampleName(), pError); + for ( Map.Entry e : vcfG.getFields().entrySet() ) { + if ( ! e.getKey().equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) + g.putAttribute(e.getKey(), e.getValue()); + } + + vc.addGenotype(g); + } + + vc.validate(); + return vc; + } else + return null; // can't handle anything else } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java index 1b8dcf4f5..c1fddd5cb 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextUtils.java @@ -35,9 +35,9 @@ public class VariantContextUtils { Set Gs = new HashSet(left.getGenotypes().values()); for ( Genotype g : other.getGenotypes().values() ) { - if ( samples.contains(g.getSample()) ) { + if ( samples.contains(g.getSampleName()) ) { if ( uniquifySamples ) - g.setSample(g.getSample() + UNIQUIFIED_SUFFIX); + g.setSampleName(g.getSampleName() + UNIQUIFIED_SUFFIX); else throw new IllegalStateException("The same sample name exists in both contexts when attempting to merge"); } diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/AlleleTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/AlleleTest.java index 11f8b95ef..3869c8900 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/AlleleTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/AlleleTest.java @@ -27,7 +27,7 @@ import org.junit.Test; * Basic unit test for RecalData */ public class AlleleTest extends BaseTest { - Allele ARef, del, delRef, A, T, ATIns, ATCIns; + Allele ARef, del, delRef, A, T, ATIns, ATCIns, NoCall; @Before public void before() { @@ -40,6 +40,8 @@ public class AlleleTest extends BaseTest { ATIns = new Allele("AT"); ATCIns = new Allele("ATC"); + + NoCall = new Allele("."); } @Test @@ -50,8 +52,8 @@ public class AlleleTest extends BaseTest { Assert.assertFalse(A.isReference()); Assert.assertTrue(A.basesMatch("A")); Assert.assertEquals(A.length(), 1); - Assert.assertTrue(A.isNonNullAllele()); - Assert.assertFalse(A.isNullAllele()); + Assert.assertTrue(A.isNonNull()); + Assert.assertFalse(A.isNull()); Assert.assertTrue(ARef.isReference()); Assert.assertFalse(ARef.isNonReference()); @@ -64,6 +66,19 @@ public class AlleleTest extends BaseTest { Assert.assertFalse(T.basesMatch("A")); } + @Test + public void testCreatingNoCallAlleles() { + logger.warn("testCreatingNoCallAlleles"); + + Assert.assertTrue(NoCall.isNonReference()); + Assert.assertFalse(NoCall.isReference()); + Assert.assertFalse(NoCall.basesMatch(".")); + Assert.assertEquals(NoCall.length(), 0); + Assert.assertTrue(NoCall.isNonNull()); + Assert.assertFalse(NoCall.isNull()); + } + + @Test public void testCreatingIndelAlleles() { logger.warn("testCreatingIndelAlleles"); @@ -80,8 +95,8 @@ public class AlleleTest extends BaseTest { Assert.assertFalse(del.basesMatch("-")); Assert.assertTrue(del.basesMatch("")); Assert.assertEquals(del.length(), 0); - Assert.assertFalse(del.isNonNullAllele()); - Assert.assertTrue(del.isNullAllele()); + Assert.assertFalse(del.isNonNull()); + Assert.assertTrue(del.isNull()); } diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextTest.java index 4b23eaa4a..7c60338d2 100755 --- a/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/variantcontext/VariantContextTest.java @@ -13,9 +13,7 @@ import org.junit.Before; import org.junit.Test; import org.junit.BeforeClass; -import java.util.Map; import java.util.Arrays; -import java.util.Set; import java.util.List; import java.io.FileNotFoundException; import java.io.File; @@ -42,13 +40,13 @@ public class VariantContextTest extends BaseTest { GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 11); // - / ATC [ref] from 20-23 - GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 23); + GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22); // - [ref] / ATC from 20-20 GenomeLoc insLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 20); // - / A / T / ATC [ref] from 20-23 - GenomeLoc mixedLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 23); + GenomeLoc mixedLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22); @Before public void before() { @@ -64,8 +62,6 @@ public class VariantContextTest extends BaseTest { ATCref = new Allele("ATC", true); } - // todo -- create reference context - @Test public void testCreatingSNPVariantContext() { logger.warn("testCreatingSNPVariantContext"); @@ -78,8 +74,8 @@ public class VariantContextTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.SNP); Assert.assertTrue(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - //Assert.assertFalse(vc.isInsertion()); - //Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isInsertion()); + Assert.assertFalse(vc.isDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -106,8 +102,8 @@ public class VariantContextTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.NO_VARIATION); Assert.assertFalse(vc.isSNP()); Assert.assertFalse(vc.isIndel()); - //Assert.assertFalse(vc.isInsertion()); - //Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isInsertion()); + Assert.assertFalse(vc.isDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertFalse(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 1); @@ -133,8 +129,8 @@ public class VariantContextTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); Assert.assertTrue(vc.isIndel()); - //Assert.assertFalse(vc.isInsertion()); - //Assert.assertFalse(vc.isDeletion()); + Assert.assertFalse(vc.isInsertion()); + Assert.assertTrue(vc.isDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -161,8 +157,8 @@ public class VariantContextTest extends BaseTest { Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); Assert.assertFalse(vc.isSNP()); Assert.assertTrue(vc.isIndel()); - //Assert.assertFalse(vc.isInsertion()); - //Assert.assertFalse(vc.isDeletion()); + Assert.assertTrue(vc.isInsertion()); + Assert.assertFalse(vc.isDeletion()); Assert.assertFalse(vc.isMixed()); Assert.assertTrue(vc.isBiallelic()); Assert.assertEquals(vc.getNAlleles(), 2); @@ -206,8 +202,140 @@ public class VariantContextTest extends BaseTest { logger.warn("testBadConstructorArgsDuplicateAlleles2"); new VariantContext(insLoc, Arrays.asList(Aref, A)); } + + + @Test + public void testAccessingSimpleSNPGenotypes() { + logger.warn("testAccessingSimpleSNPGenotypes"); + + List alleles = Arrays.asList(Aref, T); + VariantContext vc = new VariantContext(snpLoc, alleles); + logger.warn("vc = " + vc); + + Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "AA", 10); + Genotype g2 = new Genotype(Arrays.asList(Aref, T), "AT", 10); + Genotype g3 = new Genotype(Arrays.asList(T, T), "TT", 10); + + vc.addGenotypes(Arrays.asList(g1, g2, g3)); + + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertFalse(vc.isMonomorphic()); + Assert.assertTrue(vc.isPolymorphic()); + Assert.assertEquals(vc.getSampleNames().size(), 3); + + Assert.assertEquals(vc.getGenotypes().size(), 3); + Assert.assertEquals(vc.getGenotypes().get("AA"), g1); + Assert.assertEquals(vc.getGenotype("AA"), g1); + Assert.assertEquals(vc.getGenotypes().get("AT"), g2); + Assert.assertEquals(vc.getGenotype("AT"), g2); + Assert.assertEquals(vc.getGenotypes().get("TT"), g3); + Assert.assertEquals(vc.getGenotype("TT"), g3); + + Assert.assertTrue(vc.hasGenotype("AA")); + Assert.assertTrue(vc.hasGenotype("AT")); + Assert.assertTrue(vc.hasGenotype("TT")); + Assert.assertFalse(vc.hasGenotype("foo")); + Assert.assertFalse(vc.hasGenotype("TTT")); + Assert.assertFalse(vc.hasGenotype("at")); + Assert.assertFalse(vc.hasGenotype("tt")); + + Assert.assertEquals(vc.getChromosomeCount(), 6); + Assert.assertEquals(vc.getChromosomeCount(Aref), 3); + Assert.assertEquals(vc.getChromosomeCount(T), 3); + } + + @Test + public void testAccessingCompleteGenotypes() { + logger.warn("testAccessingCompleteGenotypes"); + + List alleles = Arrays.asList(Aref, T, del); + VariantContext vc = new VariantContext(delLoc, alleles); + logger.warn("vc = " + vc); + + Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "AA", 10); + Genotype g2 = new Genotype(Arrays.asList(Aref, T), "AT", 10); + Genotype g3 = new Genotype(Arrays.asList(T, T), "TT", 10); + Genotype g4 = new Genotype(Arrays.asList(T, del), "Td", 10); + Genotype g5 = new Genotype(Arrays.asList(del, del), "dd", 10); + Genotype g6 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "..", 10); + + vc.addGenotypes(Arrays.asList(g1, g2, g3, g4, g5, g6)); + + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertFalse(vc.isMonomorphic()); + Assert.assertTrue(vc.isPolymorphic()); + Assert.assertEquals(vc.getGenotypes().size(), 6); + + Assert.assertEquals(10, vc.getChromosomeCount()); + Assert.assertEquals(3, vc.getChromosomeCount(Aref)); + Assert.assertEquals(4, vc.getChromosomeCount(T)); + Assert.assertEquals(3, vc.getChromosomeCount(del)); + Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + } + + @Test + public void testAccessingRefGenotypes() { + logger.warn("testAccessingRefGenotypes"); + + List alleles1 = Arrays.asList(Aref, T); + List alleles2 = Arrays.asList(Aref); + List alleles3 = Arrays.asList(Aref, T, del); + for ( List alleles : Arrays.asList(alleles1, alleles2, alleles3)) { + VariantContext vc = new VariantContext(snpLoc, alleles); + logger.warn("vc = " + vc); + + Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "AA1", 10); + Genotype g2 = new Genotype(Arrays.asList(Aref, Aref), "AA2", 10); + Genotype g3 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "..", 10); + + vc.addGenotypes(Arrays.asList(g1, g2, g3)); + + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertTrue(vc.isMonomorphic()); + Assert.assertFalse(vc.isPolymorphic()); + Assert.assertEquals(vc.getGenotypes().size(), 3); + + Assert.assertEquals(4, vc.getChromosomeCount()); + Assert.assertEquals(4, vc.getChromosomeCount(Aref)); + Assert.assertEquals(0, vc.getChromosomeCount(T)); + Assert.assertEquals(2, vc.getChromosomeCount(Allele.NO_CALL)); + } + } + + + @Test + public void testManipulatingAlleles() { + logger.warn("testManipulatingAlleles"); + // todo -- add tests that call add/set/remove + } + + @Test + public void testManipulatingGenotypes() { + logger.warn("testManipulatingGenotypes"); + // todo -- add tests that call add/set/remove + } } +// genotype functions +// public boolean hasGenotypes() { return genotypes.size() > 0; } +// public Map getGenotypes() { return genotypes; } +// public Set getSampleNames() { +// public int getChromosomeCount() { +// public int getChromosomeCount(Allele a) { +// public boolean isMonomorphic() { +// public boolean isPolymorphic() { +// public Genotype getGenotype(String sample) { +// public boolean hasGenotype(String sample) { +// public void setGenotypes(Genotype genotype) { +// public void setGenotypes(Collection genotypes) { +// public void setGenotypes(Map genotypes) { +// public void addGenotype(Genotype genotype) { +// public void addGenotype(String sampleName, Genotype genotype) { +// public void addGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { +// public void removeGenotype(String sampleName) { +// public void removeGenotype(Genotype genotype) { + +// all functions // public Type getType() { // public boolean isSNP() { return getType() == Type.SNP; } // public boolean isVariant() { return getType() != Type.NO_VARIATION; } @@ -225,17 +353,5 @@ public class VariantContextTest extends BaseTest { // public void setAlleles(Set alleles) { // public void addAllele(Allele allele) { // public void addAllele(Allele allele, boolean allowDuplicates) { -// public boolean hasGenotypes() { return genotypes.size() > 0; } -// public Map getGenotypes() { return genotypes; } -// public Set getSampleNames() { -// public Genotype getGenotype(String sample) { -// public boolean hasGenotype(String sample) { -// public void setGenotypes(Genotype genotype) { -// public void setGenotypes(Collection genotypes) { -// public void setGenotypes(Map genotypes) { -// public void addGenotype(Genotype genotype) { -// public void addGenotype(String sampleName, Genotype genotype) { -// public void addGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) { -// public void removeGenotype(String sampleName) { -// public void removeGenotype(Genotype genotype) { + // public boolean validate() {