improvements to variant eval 2. Now has titv calculations and mendelian violation detect support. we only make ~80 mendelian violations in 380K calls for the YRI trio, in case you are interested

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2768 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-02-03 16:03:19 +00:00
parent af7cd9cf58
commit c89ba7b1a4
7 changed files with 371 additions and 50 deletions

View File

@ -24,12 +24,22 @@ public class Genotype extends AttributedObject {
this(resolveAlleles(vc, alleles), sampleName, negLog10PError);
}
public Genotype(VariantContext vc, List<String> alleles, String sampleName) {
this(resolveAlleles(vc, alleles), sampleName);
}
public Genotype(List<Allele> alleles, String sampleName, double negLog10PError) {
setAlleles(alleles);
setSampleName(sampleName);
setNegLog10PError(negLog10PError);
}
public Genotype(List<Allele> alleles, String sampleName) {
setAlleles(alleles);
setSampleName(sampleName);
}
/**
* @return the alleles for this genotype
*/
@ -46,6 +56,10 @@ public class Genotype extends AttributedObject {
return al;
}
public Allele getAllele(int i) {
return alleles.get(i);
}
private final static String ALLELE_SEPARATOR = "/";
public String getGenotypeString() {
return Utils.join(ALLELE_SEPARATOR, getAllelesString());
@ -89,8 +103,10 @@ public class Genotype extends AttributedObject {
/**
* @return true if all observed alleles are the same (regardless of whether they are ref or alt)
*/
public boolean isHom() { return getType() == Type.HOM_REF || getType() == Type.HOM_VAR; }
public boolean isHom() { return isHomRef() || isHomVar(); }
public boolean isHomRef() { return getType() == Type.HOM_REF; }
public boolean isHomVar() { return getType() == Type.HOM_VAR; }
/**
* @return true if we're het (observed alleles differ)
*/
@ -177,6 +193,47 @@ public class Genotype extends AttributedObject {
}
public String toString() {
return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), getAlleles(), getType(), getNegLog10PError(), getAttributes());
return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), getAlleles(), getType(), 10 * getNegLog10PError(), getAttributes());
}
public String toBriefString() {
return String.format("%s:Q%.2f", getAlleles(), 10 * getNegLog10PError());
}
public boolean sameGenotype(Genotype other) {
return sameGenotype(other, true);
}
public boolean sameGenotype(Genotype other, boolean ignorePhase) {
if ( getPloidy() != other.getPloidy() )
return false; // gotta have the same number of allele to be equal for gods sake
// algorithms are wildly different if phase is kept of ignored
if ( ignorePhase ) {
for ( int i = 0; i < getPloidy(); i++) {
Allele myAllele = getAllele(i);
Allele otherAllele = other.getAllele(i);
if ( ! myAllele.basesMatch(otherAllele) )
return false;
}
} else {
List<Allele> otherAlleles = other.getAlleles();
for ( Allele myAllele : getAlleles() ) {
Allele alleleToRemove = null;
for ( Allele otherAllele : otherAlleles ) {
if ( myAllele.basesMatch(otherAllele) ) {
alleleToRemove = otherAllele;
break;
}
}
if ( alleleToRemove != null )
otherAlleles.remove(alleleToRemove);
else
return false; // we couldn't find our allele
}
}
return true;
}
}

View File

@ -26,17 +26,12 @@ public class CountVariants extends VariantEvaluator {
long nHomRef = 0;
long nHomVar = 0;
private double rate(long n) {
return n / (1.0 * Math.max(nProcessedLoci, 1));
public CountVariants(VariantEval2Walker parent) {
// don't do anything
}
private long inverseRate(long n) {
return n == 0 ? 0 : nProcessedLoci / Math.max(n, 1);
}
private double ratio(long num, long denom) {
return ((double)num) / (Math.max(denom, 1));
}
private double perLocusRate(long n) { return rate(n, nProcessedLoci); }
private long perLocusRInverseRate(long n) { return inverseRate(n, nProcessedLoci); }
public String getName() {
return "counter";
@ -87,11 +82,11 @@ public class CountVariants extends VariantEvaluator {
"%.2e %d %.2f " +
"%.2f %d %.2f",
nProcessedLoci, nCalledLoci, nRefLoci, nVariantLoci,
rate(nVariantLoci), inverseRate(nVariantLoci),
perLocusRate(nVariantLoci), perLocusRInverseRate(nVariantLoci),
nSNPs, nDeletions, nInsertions, nComplex,
nHomRef, nHets, nHomVar,
rate(nHets), inverseRate(nHets), ratio(nHets, nHomVar),
rate(nDeletions + nInsertions), inverseRate(nDeletions + nInsertions), ratio(nDeletions, nInsertions));
perLocusRate(nHets), perLocusRInverseRate(nHets), ratio(nHets, nHomVar),
perLocusRate(nDeletions + nInsertions), perLocusRInverseRate(nDeletions + nInsertions), ratio(nDeletions, nInsertions));
}
private static List<String> HEADER =

View File

@ -29,6 +29,10 @@ public class DbSNPPercentage extends VariantEvaluator {
private long nSNPsAtdbSNPs = 0;
private long nConcordant = 0;
public DbSNPPercentage(VariantEval2Walker parent) {
// don't do anything
}
public String getName() {
return "dbsnp_percentage";
}
@ -86,28 +90,8 @@ public class DbSNPPercentage extends VariantEvaluator {
*
* @return
*/
public double dbSNPRate() {
return nSNPsAtdbSNPs() / (1.0 * Math.max(nEvalSNPs(), 1));
}
public double concordanceRate() {
return nConcordant() / (1.0 * Math.max(nSNPsAtdbSNPs(), 1));
}
// public static Variation getFirstRealSNP(RODRecordList<ReferenceOrderedDatum> dbsnpList) {
// if (dbsnpList == null)
// return null;
//
// Variation dbsnp = null;
// for (ReferenceOrderedDatum d : dbsnpList) {
// if (((Variation) d).isSNP() && (! (d instanceof RodVCF) || ! ((RodVCF)d).isFiltered())) {
// dbsnp = (Variation)d;
// break;
// }
// }
//
// return dbsnp;
// }
public double dbSNPRate() { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); }
public double concordanceRate() { return rate(nConcordant(), nSNPsAtdbSNPs()); }
public void update2(VariantContext eval, VariantContext dbsnp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
boolean dbSNPIsGood = dbsnp != null && dbsnp.isSNP() && dbsnp.isNotFiltered();

View File

@ -0,0 +1,191 @@
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
import org.broadinstitute.sting.utils.StingException;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* Mendelian violation detection and counting
*
* a violation looks like:
* Suppose dad = A/B and mom = C/D
* The child can be [A or B] / [C or D].
* If the child doesn't match this, the site is a violation
*
* Some examples:
*
* mom = A/A, dad = C/C
* child can be A/C only
*
* mom = A/C, dad = C/C
* child can be A/C or C/C
*
* mom = A/C, dad = A/C
* child can be A/A, A/C, C/C
*
* The easiest way to do this calculation is to:
*
* Get alleles for mom => A/B
* Get alleles for dad => C/D
* Make allowed genotypes for child: A/C, A/D, B/C, B/D
* Check that the child is one of these.
*/
public class MendelianViolationEvaluator extends VariantEvaluator {
long nVariants, nViolations, nOverCalls, nUnderCalls;
String mom, dad, child;
VariantEval2Walker parent;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
public MendelianViolationEvaluator(VariantEval2Walker parent) {
this.parent = parent;
if ( enabled() ) {
Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE);
if ( m.matches() ) {
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
mom = m.group(1);
dad = m.group(2);
child = m.group(3);
parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child));
} else {
throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child");
}
}
}
private boolean enabled() {
return parent.FAMILY_STRUCTURE != null;
}
private double getQThreshold() {
return parent.MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred
}
public String getName() {
return "mendelian_violations";
}
public int getComparisonOrder() {
return 1; // we only need to see each eval track
}
public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci
nVariants++;
Genotype momG = vc.getGenotype(mom);
Genotype dadG = vc.getGenotype(dad);
Genotype childG = vc.getGenotype(child);
if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
// all genotypes are good, so let's see if child is a violation
if ( isViolation(vc, momG, dadG, childG) ) {
nViolations++;
String label = null;
switch ( getViolationType( vc, momG, dadG, childG ) ) {
case UNDER_CALL:
nUnderCalls++;
label = "under_called";
break;
case OVER_CALL:
nOverCalls++;
label = "over_called";
break;
default:
throw new StingException("BUG: unexpected violation type at " + vc);
}
String why = String.format("Mendelian violation %s: at %s m=%s d=%s c=%s", label, vc.getLocation(), momG.toBriefString(), dadG.toBriefString(), childG.toBriefString());
addInterestingSite(why , vc);
}
}
}
}
/**
* Are we under or over calling?
*
* Assuming this is a bialleic locus, we then have 2 alleles A and B. There are really two types of violations:
*
* Undercall: where the child is A/A but parent genotypes imply that child must carry at least one B allele
* Overall: where the child carries a B allele but this B allele couldn't have been inherited from either parent
*
* The way to determine this is to look at mom and dad separately. If the child doesn't carry at least one
* allele from each parent, it's an under calls. Otherwise it's an overcall.
*/
public ViolationType getViolationType(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
switch ( childG.getType() ) {
case HOM_REF:
return ViolationType.UNDER_CALL; // if you have to undercalled as a hom ref child
case HET:
// the only two violations of a het is where both parents are hom. If they are hom ref, you overcalled,
// otherwise you undercalled
return momG.isHomRef() ? ViolationType.OVER_CALL : ViolationType.UNDER_CALL;
case HOM_VAR:
return ViolationType.OVER_CALL; // if you have to overcalled as a hom var child
default:
throw new StingException("BUG: unexpected child genotype class " + childG);
}
}
private enum ViolationType {
UNDER_CALL, OVER_CALL
}
public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
VariantContext momVC = vc.subContextFromGenotypes(momG);
VariantContext dadVC = vc.subContextFromGenotypes(dadG);
int i = 0;
for ( Allele momAllele : momVC.getAlleles() ) {
for ( Allele dadAllele : dadVC.getAlleles() ) {
Genotype possibleChild = new Genotype(Arrays.asList(momAllele, dadAllele), "possibleGenotype" + i);
if ( childG.sameGenotype(possibleChild, false) ) {
return false;
}
}
}
return true;
}
public String toString() {
return getName() + ": " + summaryLine();
}
private String summaryLine() {
return String.format("%d %d %.2e %d %.2e %.2f %d %.2e %.2f",
nVariants, nViolations, rate(nViolations, nVariants),
nOverCalls, rate(nOverCalls, nVariants), ratio(nOverCalls, nViolations),
nUnderCalls, rate(nUnderCalls, nVariants), ratio(nUnderCalls, nViolations));
}
private static List<String> HEADER =
Arrays.asList("nVariants",
"nViolations", "violationRate",
"nOverCalls", "overCallRate", "overCallFraction",
"nUnderCalls", "underCallRate", "underCallFraction");
// making it a table
public List<String> getTableHeader() {
return HEADER;
}
public List<List<String>> getTableRows() {
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
}
}

View File

@ -14,8 +14,8 @@ public class TiTvVariantEvaluator extends VariantEvaluator {
long nTi = 0, nTv = 0;
long nTiInStd = 0, nTvInStd = 0;
private double ratio(long num, long denom) {
return ((double)num) / (Math.max(denom, 1));
public TiTvVariantEvaluator(VariantEval2Walker parent) {
// don't do anything
}
public String getName() {

View File

@ -10,13 +10,46 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextAdaptors;
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextUtils;
import org.apache.log4j.Logger;
import java.util.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
//
// todo -- write a simple column table system and have the evaluators return this instead of the list<list<string>> objects
// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers)
// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth)
// todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval). Needs to process subset of samples in true (pools)
// todo -- clustered SNP counter
// todo -- HWEs
// todo -- Validation data analysis from VE1? What is it and should we transition it over?
// todo -- indel metrics [count of sizes in/del should be in CountVariants]
// todo -- create JEXL context implementing object that simply looks up values for JEXL evaluations. Throws error for unknown fields
//
//
// Todo -- should really include argument parsing @annotations from subclass in this walker. Very
// todo -- useful general capability. Right now you need to add arguments to VariantEval2 to handle new
// todo -- evaluation arguments (which is better than passing a string!)
//
//
// todo -- the whole organization only supports a single eval x comp evaluation. We need to instantiate
// todo -- new contexts for each comparison object too!
//
/**
* Test routine for new VariantContext object
*/
public class VariantEval2Walker extends RodWalker<Integer, Integer> {
// --------------------------------------------------------------------------------------------------------------
//
// walker arguments
//
// --------------------------------------------------------------------------------------------------------------
// todo -- add doc string
@Argument(shortName="select", doc="", required=false)
protected String[] SELECT_EXPS = {"QUAL > 500.0", "HARD_TO_VALIDATE==1", "GATK_STANDARD==1"};
@ -28,6 +61,27 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
@Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected String[] KNOWN_NAMES = {"dbsnp"};
//
// Arguments for Mendelian Violation calculations
//
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="MVQ", fullName="MendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
@Argument(shortName="PI", fullName="PrintInterestingSites", doc="If provided, interesting sites in the unselected, called set will be printed", required=false)
protected boolean PRINT_INTERESTING_SITES = false;
// @Argument(shortName="PIA", fullName="PrintAllInterestingSites", doc="If provided, interesting sites will be printed for all sets. Verbose", required=false)
// protected boolean PRINT_ALL_INTERESTING_SITES = false;
// --------------------------------------------------------------------------------------------------------------
//
// private walker data
//
// --------------------------------------------------------------------------------------------------------------
/** private class holding all of the information about a single evaluation group (e.g., for eval ROD) */
private class EvaluationContext extends HashMap<String, Set<VariantEvaluator>> {
// useful for typing
@ -60,6 +114,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
// initialize
//
// --------------------------------------------------------------------------------------------------------------
public void initialize() {
determineAllEvalations();
List<VariantContextUtils.MatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);
@ -82,7 +137,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
private void determineAllEvalations() {
evaluationClasses = PackageUtils.getClassesImplementingInterface(VariantEvaluator.class);
for ( VariantEvaluator e : instantiateEvalationsSet() ) {
for ( VariantEvaluator e : instantiateEvalationsSet(false, null) ) {
// for collecting purposes
variantEvaluationNames.add(e.getName());
logger.debug("Including VariantEvaluator " + e.getName() + " of class " + e.getClass());
@ -91,15 +146,25 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
Collections.sort(variantEvaluationNames);
}
private Set<VariantEvaluator> instantiateEvalationsSet() {
private Set<VariantEvaluator> instantiateEvalationsSet(boolean baseline, String name) {
Set<VariantEvaluator> evals = new HashSet<VariantEvaluator>();
Object[] args = new Object[]{this};
Class[] argTypes = new Class[]{this.getClass()};
for ( Class c : evaluationClasses ) {
try {
evals.add((VariantEvaluator) c.newInstance());
Constructor constructor = c.getConstructor(argTypes);
VariantEvaluator eval = (VariantEvaluator)constructor.newInstance(args);
if ( baseline ) eval.printInterestingSites(name);
evals.add(eval);
} catch (InstantiationException e) {
throw new StingException(String.format("Cannot instantiate annotation class '%s': must be concrete class", c.getSimpleName()));
throw new StingException(String.format("Cannot instantiate class '%s': must be concrete class", c.getSimpleName()));
} catch (NoSuchMethodException e) {
throw new StingException(String.format("Cannot find expected constructor for class '%s': must have constructor accepting a single VariantEval2Walker object", c.getSimpleName()));
} catch (IllegalAccessException e) {
throw new StingException(String.format("Cannot instantiate annotation class '%s': must have no-arg constructor", c.getSimpleName()));
throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
} catch (InvocationTargetException e) {
throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
}
}
@ -111,7 +176,8 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
for ( String filteredName : Arrays.asList(RAW_SET_NAME, RETAINED_SET_NAME, FILTERED_SET_NAME) ) {
for ( String subname : Arrays.asList(ALL_SET_NAME, KNOWN_SET_NAME, NOVEL_SET_NAME) ) {
group.put(subname + "." + filteredName, instantiateEvalationsSet());
String name = subname + "." + filteredName;
group.put(name, instantiateEvalationsSet(subname == ALL_SET_NAME && filteredName == RETAINED_SET_NAME, trackName + "." + (selectExp == null ? "all" : selectExp.name) + "." + name));
}
}
@ -146,7 +212,8 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
// now call the single or paired update function
switch ( evaluation.getComparisonOrder() ) {
case 1:
if ( evalWantsVC && vc != null ) evaluation.update1(vc, tracker, ref, context);
if ( evalWantsVC && vc != null )
evaluation.update1(vc, tracker, ref, context);
break;
case 2:
for ( VariantContext comp : comps.values() ) {
@ -193,6 +260,8 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
comps.put(compName, getVariantContext(compName, tracker, context));
}
// todo -- remove me when the loop works correctly for comparisons of eval x comp for each comp
if ( comps.size() > 1 ) throw new StingException("VariantEval2 currently only supports comparisons of N eval tracks vs. a single comparison track. Yes, I know...");
return comps;
}
@ -314,14 +383,16 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
String keyWord = contextName + "." + evalSubgroupName;
if ( first ) {
out.printf("%20s %s %s%n", evalName, formatKeyword(CONTEXT_HEADER), Utils.join("\t\t", eval.getTableHeader()));
out.printf("%20s %s %s%n", evalName, formatKeyword(CONTEXT_HEADER), Utils.join("\t", eval.getTableHeader()));
first = false;
}
for ( List<String> row : eval.getTableRows() )
out.printf("%20s %s %s%n", evalName, formatKeyword(keyWord), Utils.join("\t\t", row));
out.printf("%20s %s %s%n", evalName, formatKeyword(keyWord), Utils.join("\t", row));
}
}
}
}
protected Logger getLogger() { return logger; }
}

View File

@ -16,7 +16,8 @@ import java.util.ArrayList;
* To change this template use File | Settings | File Templates.
*/
public abstract class VariantEvaluator {
protected boolean accumulateInterestingSites = false;
protected boolean accumulateInterestingSites = false, printInterestingSites = false;
protected String interestingSitePrefix = null;
protected boolean processedASite = false;
protected List<VariantContext> interestingSites = new ArrayList<VariantContext>();
@ -24,9 +25,15 @@ public abstract class VariantEvaluator {
// do we want to keep track of things that are interesting
public void accumulateInterestingSites(boolean enable) { accumulateInterestingSites = enable; }
public void printInterestingSites(String prefix) { printInterestingSites = true; interestingSitePrefix = prefix; }
public boolean isAccumulatingInterestingSites() { return accumulateInterestingSites; }
public List<VariantContext> getInterestingSites() { return interestingSites; }
protected void addInterestingSite(String why, VariantContext vc) { interestingSites.add(vc); }
protected void addInterestingSite(String why, VariantContext vc) {
if ( accumulateInterestingSites )
interestingSites.add(vc);
if ( printInterestingSites )
System.out.printf("%40s %s%n", interestingSitePrefix, why);
}
public boolean processedAnySites() { return processedASite; }
protected void markSiteAsProcessed() { processedASite = true; }
@ -46,4 +53,20 @@ public abstract class VariantEvaluator {
// making it a table
public abstract List<String> getTableHeader();
public abstract List<List<String>> getTableRows();
//
// useful common utility routines
//
protected double rate(long n, long d) {
return n / (1.0 * Math.max(d, 1));
}
protected long inverseRate(long n, long d) {
return n == 0 ? 0 : d / Math.max(n, 1);
}
protected double ratio(long num, long denom) {
return ((double)num) / (Math.max(denom, 1));
}
}