managed to actually move the files too! Damn you svn
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2785 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8938a4146d
commit
777617b6c7
|
|
@ -0,0 +1,108 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class CountVariants extends VariantEvaluator {
|
||||
long nProcessedLoci = 0;
|
||||
long nCalledLoci = 0;
|
||||
long nVariantLoci = 0;
|
||||
long nRefLoci = 0;
|
||||
|
||||
long nSNPs = 0;
|
||||
long nInsertions = 0;
|
||||
long nDeletions = 0;
|
||||
long nComplex = 0;
|
||||
|
||||
long nNoCalls = 0;
|
||||
long nHets = 0;
|
||||
long nHomRef = 0;
|
||||
long nHomVar = 0;
|
||||
|
||||
public CountVariants(VariantEval2Walker parent) {
|
||||
// don't do anything
|
||||
}
|
||||
|
||||
private double perLocusRate(long n) { return rate(n, nProcessedLoci); }
|
||||
private long perLocusRInverseRate(long n) { return inverseRate(n, nProcessedLoci); }
|
||||
|
||||
public String getName() {
|
||||
return "counter";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 1; // we only need to see each eval track
|
||||
}
|
||||
|
||||
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
nProcessedLoci += context.getSkippedBases() + 1;
|
||||
}
|
||||
|
||||
public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
//nProcessedLoci++;
|
||||
nCalledLoci++;
|
||||
|
||||
if ( vc1.isVariant() ) nVariantLoci++;
|
||||
switch ( vc1.getType() ) {
|
||||
case NO_VARIATION: nRefLoci++; break;
|
||||
case SNP: nSNPs++; break;
|
||||
case INDEL:
|
||||
if ( vc1.isInsertion() ) nInsertions++; else nDeletions++;
|
||||
break;
|
||||
case MIXED: nComplex++; break;
|
||||
}
|
||||
|
||||
for ( Genotype g : vc1.getGenotypes().values() ) {
|
||||
switch ( g.getType() ) {
|
||||
case NO_CALL: nNoCalls++; break;
|
||||
case HOM_REF: nHomRef++; break;
|
||||
case HET: nHets++; break;
|
||||
case HOM_VAR: nHomVar++; break;
|
||||
default: throw new StingException("BUG: Unexpected genotype type: " + g);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName() + ": " + summaryLine();
|
||||
}
|
||||
|
||||
private String summaryLine() {
|
||||
return String.format("%d %d %d %d " +
|
||||
"%.2e %d " +
|
||||
"%d %d %d %d " +
|
||||
"%d %d %d " +
|
||||
"%.2e %d %.2f " +
|
||||
"%.2f %d %.2f",
|
||||
nProcessedLoci, nCalledLoci, nRefLoci, nVariantLoci,
|
||||
perLocusRate(nVariantLoci), perLocusRInverseRate(nVariantLoci),
|
||||
nSNPs, nDeletions, nInsertions, nComplex,
|
||||
nHomRef, nHets, nHomVar,
|
||||
perLocusRate(nHets), perLocusRInverseRate(nHets), ratio(nHets, nHomVar),
|
||||
perLocusRate(nDeletions + nInsertions), perLocusRInverseRate(nDeletions + nInsertions), ratio(nDeletions, nInsertions));
|
||||
}
|
||||
|
||||
private static List<String> HEADER =
|
||||
Arrays.asList("nProcessedLoci", "nCalledLoci", "nRefLoci", "nVariantLoci",
|
||||
"variantRate", "variantRatePerBp",
|
||||
"nSNPs", "nDeletions", "nInsertions", "nComplex",
|
||||
"nHomRefGenotypes", "nHetGenotypes", "nHomVarGenotypes",
|
||||
"heterozygosity", "heterozygosityPerBp", "hetHomRatio",
|
||||
"indelRate", "indelRatePerBp", "deletionInsertionRatio");
|
||||
|
||||
// making it a table
|
||||
public List<String> getTableHeader() {
|
||||
return HEADER;
|
||||
}
|
||||
|
||||
public List<List<String>> getTableRows() {
|
||||
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class DbSNPPercentage extends VariantEvaluator {
|
||||
private long nDBSNPs = 0;
|
||||
private long nEvalSNPs = 0;
|
||||
private long nSNPsAtdbSNPs = 0;
|
||||
private long nConcordant = 0;
|
||||
|
||||
public DbSNPPercentage(VariantEval2Walker parent) {
|
||||
// don't do anything
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "dbsnp_percentage";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 2; // we need to see each eval track and each comp track
|
||||
}
|
||||
|
||||
public long nDBSNPs() { return nDBSNPs; }
|
||||
public long nEvalSNPs() { return nEvalSNPs; }
|
||||
public long nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; }
|
||||
public long nConcordant() { return nConcordant; }
|
||||
public long nNovelSites() { return Math.abs(nEvalSNPs() - nSNPsAtdbSNPs()); }
|
||||
|
||||
public String toString() {
|
||||
return getName() + ": " + summaryLine();
|
||||
}
|
||||
|
||||
private String summaryLine() {
|
||||
return String.format("%d %d %d %d %d %.2f %.2f",
|
||||
nDBSNPs(), nEvalSNPs(), nSNPsAtdbSNPs(), nConcordant(), nNovelSites(), 100 * dbSNPRate(), 100 * concordanceRate());
|
||||
}
|
||||
|
||||
private static List<String> HEADER =
|
||||
Arrays.asList("n_dbsnps", "n_eval_snps", "n_overlapping_snps", "n_concordant", "n_novel_snps", "dbsnp_rate", "concordance_rate");
|
||||
|
||||
// making it a table
|
||||
public List<String> getTableHeader() {
|
||||
return HEADER;
|
||||
}
|
||||
|
||||
public List<List<String>> getTableRows() {
|
||||
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if every allele in eval is also in dbsnp
|
||||
*
|
||||
* @param eval
|
||||
* @param dbsnp
|
||||
* @return
|
||||
*/
|
||||
public boolean discordantP(VariantContext eval, VariantContext dbsnp ) {
|
||||
for ( Allele a : eval.getAlleles() ) {
|
||||
if ( ! dbsnp.hasAllele(a, true) )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* What fraction of the evaluated site variants were also found in the db?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double dbSNPRate() { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); }
|
||||
public double concordanceRate() { return rate(nConcordant(), nSNPsAtdbSNPs()); }
|
||||
|
||||
public void update2(VariantContext eval, VariantContext dbsnp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
boolean dbSNPIsGood = dbsnp != null && dbsnp.isSNP() && dbsnp.isNotFiltered();
|
||||
boolean evalIsGood = eval != null && eval.isSNP();
|
||||
|
||||
if ( dbSNPIsGood ) nDBSNPs++; // count the number of dbSNP events
|
||||
if ( evalIsGood ) nEvalSNPs++; // count the number of dbSNP events
|
||||
|
||||
if ( dbSNPIsGood && evalIsGood ) {
|
||||
nSNPsAtdbSNPs++;
|
||||
|
||||
if ( ! discordantP(eval, dbsnp) ) // count whether we're concordant or not with the dbSNP value
|
||||
nConcordant++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,191 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
/**
|
||||
* Mendelian violation detection and counting
|
||||
*
|
||||
* a violation looks like:
|
||||
* Suppose dad = A/B and mom = C/D
|
||||
* The child can be [A or B] / [C or D].
|
||||
* If the child doesn't match this, the site is a violation
|
||||
*
|
||||
* Some examples:
|
||||
*
|
||||
* mom = A/A, dad = C/C
|
||||
* child can be A/C only
|
||||
*
|
||||
* mom = A/C, dad = C/C
|
||||
* child can be A/C or C/C
|
||||
*
|
||||
* mom = A/C, dad = A/C
|
||||
* child can be A/A, A/C, C/C
|
||||
*
|
||||
* The easiest way to do this calculation is to:
|
||||
*
|
||||
* Get alleles for mom => A/B
|
||||
* Get alleles for dad => C/D
|
||||
* Make allowed genotypes for child: A/C, A/D, B/C, B/D
|
||||
* Check that the child is one of these.
|
||||
*/
|
||||
public class MendelianViolationEvaluator extends VariantEvaluator {
|
||||
long nVariants, nViolations, nOverCalls, nUnderCalls;
|
||||
String mom, dad, child;
|
||||
VariantEval2Walker parent;
|
||||
|
||||
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
|
||||
|
||||
public MendelianViolationEvaluator(VariantEval2Walker parent) {
|
||||
this.parent = parent;
|
||||
|
||||
if ( enabled() ) {
|
||||
Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE);
|
||||
if ( m.matches() ) {
|
||||
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
|
||||
mom = m.group(1);
|
||||
dad = m.group(2);
|
||||
child = m.group(3);
|
||||
parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child));
|
||||
|
||||
} else {
|
||||
throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean enabled() {
|
||||
return parent.FAMILY_STRUCTURE != null;
|
||||
}
|
||||
|
||||
private double getQThreshold() {
|
||||
return parent.MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "mendelian_violations";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 1; // we only need to see each eval track
|
||||
}
|
||||
|
||||
public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci
|
||||
nVariants++;
|
||||
|
||||
Genotype momG = vc.getGenotype(mom);
|
||||
Genotype dadG = vc.getGenotype(dad);
|
||||
Genotype childG = vc.getGenotype(child);
|
||||
|
||||
if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
|
||||
// all genotypes are good, so let's see if child is a violation
|
||||
|
||||
if ( isViolation(vc, momG, dadG, childG) ) {
|
||||
nViolations++;
|
||||
|
||||
String label = null;
|
||||
switch ( getViolationType( vc, momG, dadG, childG ) ) {
|
||||
case UNDER_CALL:
|
||||
nUnderCalls++;
|
||||
label = "under_called";
|
||||
break;
|
||||
case OVER_CALL:
|
||||
nOverCalls++;
|
||||
label = "over_called";
|
||||
break;
|
||||
default:
|
||||
throw new StingException("BUG: unexpected violation type at " + vc);
|
||||
|
||||
}
|
||||
|
||||
String why = String.format("Mendelian violation %s: at %s m=%s d=%s c=%s", label, vc.getLocation(), momG.toBriefString(), dadG.toBriefString(), childG.toBriefString());
|
||||
addInterestingSite(why , vc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Are we under or over calling?
|
||||
*
|
||||
* Assuming this is a bialleic locus, we then have 2 alleles A and B. There are really two types of violations:
|
||||
*
|
||||
* Undercall: where the child is A/A but parent genotypes imply that child must carry at least one B allele
|
||||
* Overall: where the child carries a B allele but this B allele couldn't have been inherited from either parent
|
||||
*
|
||||
* The way to determine this is to look at mom and dad separately. If the child doesn't carry at least one
|
||||
* allele from each parent, it's an under calls. Otherwise it's an overcall.
|
||||
*/
|
||||
public ViolationType getViolationType(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
|
||||
switch ( childG.getType() ) {
|
||||
case HOM_REF:
|
||||
return ViolationType.UNDER_CALL; // if you have to undercalled as a hom ref child
|
||||
case HET:
|
||||
// the only two violations of a het is where both parents are hom. If they are hom ref, you overcalled,
|
||||
// otherwise you undercalled
|
||||
return momG.isHomRef() ? ViolationType.OVER_CALL : ViolationType.UNDER_CALL;
|
||||
case HOM_VAR:
|
||||
return ViolationType.OVER_CALL; // if you have to overcalled as a hom var child
|
||||
default:
|
||||
throw new StingException("BUG: unexpected child genotype class " + childG);
|
||||
}
|
||||
}
|
||||
|
||||
private enum ViolationType {
|
||||
UNDER_CALL, OVER_CALL
|
||||
}
|
||||
|
||||
public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
|
||||
VariantContext momVC = vc.subContextFromGenotypes(momG);
|
||||
VariantContext dadVC = vc.subContextFromGenotypes(dadG);
|
||||
int i = 0;
|
||||
for ( Allele momAllele : momVC.getAlleles() ) {
|
||||
for ( Allele dadAllele : dadVC.getAlleles() ) {
|
||||
Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele));
|
||||
if ( childG.sameGenotype(possibleChild, false) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName() + ": " + summaryLine();
|
||||
}
|
||||
|
||||
private String summaryLine() {
|
||||
return String.format("%d %d %.2e %d %.2e %.2f %d %.2e %.2f",
|
||||
nVariants, nViolations, rate(nViolations, nVariants),
|
||||
nOverCalls, rate(nOverCalls, nVariants), ratio(nOverCalls, nViolations),
|
||||
nUnderCalls, rate(nUnderCalls, nVariants), ratio(nUnderCalls, nViolations));
|
||||
}
|
||||
|
||||
private static List<String> HEADER =
|
||||
Arrays.asList("nVariants",
|
||||
"nViolations", "violationRate",
|
||||
"nOverCalls", "overCallRate", "overCallFraction",
|
||||
"nUnderCalls", "underCallRate", "underCallFraction");
|
||||
|
||||
// making it a table
|
||||
public List<String> getTableHeader() {
|
||||
return HEADER;
|
||||
}
|
||||
|
||||
public List<List<String>> getTableRows() {
|
||||
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TiTvVariantEvaluator extends VariantEvaluator {
|
||||
long nTi = 0, nTv = 0;
|
||||
long nTiInStd = 0, nTvInStd = 0;
|
||||
|
||||
public TiTvVariantEvaluator(VariantEval2Walker parent) {
|
||||
// don't do anything
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "titv";
|
||||
}
|
||||
|
||||
public int getComparisonOrder() {
|
||||
return 2; // we only need to see each eval track
|
||||
}
|
||||
|
||||
public void updateTiTv(VariantContext vc, boolean updateStandard) {
|
||||
if ( vc != null && vc.isSNP() && vc.isBiallelic() ) {
|
||||
if ( vc.isTransition() ) {
|
||||
if ( updateStandard ) nTiInStd++; else nTi++;
|
||||
} else {
|
||||
if ( updateStandard ) nTvInStd++; else nTv++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( vc1 != null ) updateTiTv(vc1, false);
|
||||
if ( vc2 != null ) updateTiTv(vc2, true);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getName() + ": " + summaryLine();
|
||||
}
|
||||
|
||||
private String summaryLine() {
|
||||
return String.format("%d %d %.2f %d %d %.2f",
|
||||
nTi, nTv, ratio(nTi, nTv),
|
||||
nTiInStd, nTvInStd, ratio(nTiInStd, nTvInStd));
|
||||
}
|
||||
|
||||
private static List<String> HEADER =
|
||||
Arrays.asList("nTi", "nTv", "TiTvRatio", "nTiStandard", "nTvStandard", "TiTvRatioStandard");
|
||||
|
||||
// making it a table
|
||||
public List<String> getTableHeader() {
|
||||
return HEADER;
|
||||
}
|
||||
|
||||
public List<List<String>> getTableRows() {
|
||||
return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,398 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
||||
|
||||
//
|
||||
// todo -- write a simple column table system and have the evaluators return this instead of the list<list<string>> objects
|
||||
// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers)
|
||||
// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth)
|
||||
// todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval). Needs to process subset of samples in true (pools)
|
||||
// todo -- clustered SNP counter
|
||||
// todo -- HWEs
|
||||
// todo -- Validation data analysis from VE1? What is it and should we transition it over?
|
||||
// todo -- indel metrics [count of sizes in/del should be in CountVariants]
|
||||
// todo -- create JEXL context implementing object that simply looks up values for JEXL evaluations. Throws error for unknown fields
|
||||
//
|
||||
|
||||
//
|
||||
// Todo -- should really include argument parsing @annotations from subclass in this walker. Very
|
||||
// todo -- useful general capability. Right now you need to add arguments to VariantEval2 to handle new
|
||||
// todo -- evaluation arguments (which is better than passing a string!)
|
||||
//
|
||||
|
||||
//
|
||||
// todo -- the whole organization only supports a single eval x comp evaluation. We need to instantiate
|
||||
// todo -- new contexts for each comparison object too!
|
||||
//
|
||||
|
||||
/**
|
||||
* Test routine for new VariantContext object
|
||||
*/
|
||||
public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// walker arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
// todo -- add doc string
|
||||
@Argument(shortName="select", doc="", required=false)
|
||||
protected String[] SELECT_EXPS = {"QUAL > 500.0", "HARD_TO_VALIDATE==1", "GATK_STANDARD==1"};
|
||||
|
||||
// todo -- add doc string
|
||||
@Argument(shortName="selectName", doc="", required=false)
|
||||
protected String[] SELECT_NAMES = {"q500plus", "low_mapq", "gatk_std_filters"};
|
||||
|
||||
@Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
|
||||
protected String[] KNOWN_NAMES = {"dbsnp"};
|
||||
|
||||
//
|
||||
// Arguments for Mendelian Violation calculations
|
||||
//
|
||||
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
|
||||
protected String FAMILY_STRUCTURE;
|
||||
|
||||
@Argument(shortName="MVQ", fullName="MendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
|
||||
|
||||
@Argument(shortName="PI", fullName="PrintInterestingSites", doc="If provided, interesting sites in the unselected, called set will be printed", required=false)
|
||||
protected boolean PRINT_INTERESTING_SITES = false;
|
||||
|
||||
// @Argument(shortName="PIA", fullName="PrintAllInterestingSites", doc="If provided, interesting sites will be printed for all sets. Verbose", required=false)
|
||||
// protected boolean PRINT_ALL_INTERESTING_SITES = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// private walker data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/** private class holding all of the information about a single evaluation group (e.g., for eval ROD) */
|
||||
private class EvaluationContext extends HashMap<String, Set<VariantEvaluator>> {
|
||||
// useful for typing
|
||||
public String trackName, contextName;
|
||||
VariantContextUtils.MatchExp selectExp;
|
||||
|
||||
public EvaluationContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) {
|
||||
this.trackName = trackName;
|
||||
this.contextName = contextName;
|
||||
this.selectExp = selectExp;
|
||||
}
|
||||
}
|
||||
|
||||
private HashMap<String, EvaluationContext> contexts = new HashMap<String, EvaluationContext>();
|
||||
private Set<String> compNames = new HashSet<String>();
|
||||
|
||||
private static String RAW_SET_NAME = "raw";
|
||||
private static String RETAINED_SET_NAME = "called";
|
||||
private static String FILTERED_SET_NAME = "filtered";
|
||||
private static String ALL_SET_NAME = "all";
|
||||
private static String KNOWN_SET_NAME = "known";
|
||||
private static String NOVEL_SET_NAME = "novel";
|
||||
|
||||
// Dynamically determined variantEvaluation classes
|
||||
private List<String> variantEvaluationNames = new ArrayList<String>();
|
||||
private List<Class<? extends VariantEvaluator>> evaluationClasses = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
determineAllEvalations();
|
||||
List<VariantContextUtils.MatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);
|
||||
|
||||
for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if ( d.getName().startsWith("eval") ) {
|
||||
for ( VariantContextUtils.MatchExp e : selectExps ) {
|
||||
addNewContext(d.getName(), d.getName() + "." + e.name, e);
|
||||
}
|
||||
addNewContext(d.getName(), d.getName() + ".all", null);
|
||||
} else if ( d.getName().startsWith("dbsnp") || d.getName().startsWith("hapmap") || d.getName().startsWith("comp") ) {
|
||||
compNames.add(d.getName());
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
determineContextNamePartSizes();
|
||||
}
|
||||
|
||||
private void determineAllEvalations() {
|
||||
evaluationClasses = PackageUtils.getClassesImplementingInterface(VariantEvaluator.class);
|
||||
for ( VariantEvaluator e : instantiateEvalationsSet(false, null) ) {
|
||||
// for collecting purposes
|
||||
variantEvaluationNames.add(e.getName());
|
||||
logger.debug("Including VariantEvaluator " + e.getName() + " of class " + e.getClass());
|
||||
}
|
||||
|
||||
Collections.sort(variantEvaluationNames);
|
||||
}
|
||||
|
||||
private Set<VariantEvaluator> instantiateEvalationsSet(boolean baseline, String name) {
|
||||
Set<VariantEvaluator> evals = new HashSet<VariantEvaluator>();
|
||||
Object[] args = new Object[]{this};
|
||||
Class[] argTypes = new Class[]{this.getClass()};
|
||||
|
||||
for ( Class c : evaluationClasses ) {
|
||||
try {
|
||||
Constructor constructor = c.getConstructor(argTypes);
|
||||
VariantEvaluator eval = (VariantEvaluator)constructor.newInstance(args);
|
||||
if ( baseline ) eval.printInterestingSites(name);
|
||||
evals.add(eval);
|
||||
} catch (InstantiationException e) {
|
||||
throw new StingException(String.format("Cannot instantiate class '%s': must be concrete class", c.getSimpleName()));
|
||||
} catch (NoSuchMethodException e) {
|
||||
throw new StingException(String.format("Cannot find expected constructor for class '%s': must have constructor accepting a single VariantEval2Walker object", c.getSimpleName()));
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
|
||||
} catch (InvocationTargetException e) {
|
||||
throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
|
||||
}
|
||||
}
|
||||
|
||||
return evals;
|
||||
}
|
||||
|
||||
private void addNewContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) {
|
||||
EvaluationContext group = new EvaluationContext(trackName, contextName, selectExp);
|
||||
|
||||
for ( String filteredName : Arrays.asList(RAW_SET_NAME, RETAINED_SET_NAME, FILTERED_SET_NAME) ) {
|
||||
for ( String subname : Arrays.asList(ALL_SET_NAME, KNOWN_SET_NAME, NOVEL_SET_NAME) ) {
|
||||
String name = subname + "." + filteredName;
|
||||
group.put(name, instantiateEvalationsSet(subname == ALL_SET_NAME && filteredName == RETAINED_SET_NAME, trackName + "." + (selectExp == null ? "all" : selectExp.name) + "." + name));
|
||||
}
|
||||
}
|
||||
|
||||
contexts.put(contextName, group);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
//System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases());
|
||||
|
||||
Map<String, VariantContext> comps = getCompVariantContexts(tracker, context);
|
||||
|
||||
// to enable walking over pairs where eval or comps have no elements
|
||||
for ( EvaluationContext group : contexts.values() ) {
|
||||
VariantContext vc = getVariantContext(group.trackName, tracker, context);
|
||||
|
||||
//logger.debug(String.format("Updating %s of %s with variant", group.name, vc));
|
||||
for ( Map.Entry<String, Set<VariantEvaluator>> namedEvaluations : group.entrySet() ) {
|
||||
String evaluationName = namedEvaluations.getKey();
|
||||
Set<VariantEvaluator> evaluations = namedEvaluations.getValue();
|
||||
boolean evalWantsVC = applyVCtoEvaluation(evaluationName, vc, comps, group);
|
||||
|
||||
for ( VariantEvaluator evaluation : evaluations ) {
|
||||
// we always call update0 in case the evaluation tracks things like number of bases covered
|
||||
evaluation.update0(tracker, ref, context);
|
||||
|
||||
// now call the single or paired update function
|
||||
switch ( evaluation.getComparisonOrder() ) {
|
||||
case 1:
|
||||
if ( evalWantsVC && vc != null )
|
||||
evaluation.update1(vc, tracker, ref, context);
|
||||
break;
|
||||
case 2:
|
||||
for ( VariantContext comp : comps.values() ) {
|
||||
evaluation.update2( evalWantsVC ? vc : null, comp, tracker, ref, context);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new StingException("BUG: Unexpected evaluation order " + evaluation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private boolean applyVCtoEvaluation(String evaluationName, VariantContext vc, Map<String, VariantContext> comps, EvaluationContext group) {
|
||||
if ( vc == null )
|
||||
return true;
|
||||
|
||||
if ( evaluationName.contains(FILTERED_SET_NAME) && vc.isNotFiltered() )
|
||||
return false;
|
||||
|
||||
if ( evaluationName.contains(RETAINED_SET_NAME) && vc.isFiltered() )
|
||||
return false;
|
||||
|
||||
boolean vcKnown = vcIsKnown(vc, comps, KNOWN_NAMES);
|
||||
if ( evaluationName.contains(KNOWN_SET_NAME) && ! vcKnown )
|
||||
return false;
|
||||
else if ( evaluationName.contains(NOVEL_SET_NAME) && vcKnown )
|
||||
return false;
|
||||
|
||||
if ( group.selectExp != null && ! VariantContextUtils.match(vc, group.selectExp) )
|
||||
return false;
|
||||
|
||||
// nothing invalidated our membership in this set
|
||||
return true;
|
||||
}
|
||||
|
||||
private Map<String, VariantContext> getCompVariantContexts(RefMetaDataTracker tracker, AlignmentContext context) {
|
||||
Map<String, VariantContext> comps = new HashMap<String, VariantContext>();
|
||||
|
||||
for ( String compName : compNames ) {
|
||||
comps.put(compName, getVariantContext(compName, tracker, context));
|
||||
}
|
||||
|
||||
// todo -- remove me when the loop works correctly for comparisons of eval x comp for each comp
|
||||
if ( comps.size() > 1 ) throw new StingException("VariantEval2 currently only supports comparisons of N eval tracks vs. a single comparison track. Yes, I know...");
|
||||
return comps;
|
||||
}
|
||||
|
||||
private boolean vcIsKnown(VariantContext vc, Map<String, VariantContext> comps, String[] knownNames ) {
|
||||
for ( String knownName : knownNames ) {
|
||||
VariantContext known = comps.get(knownName);
|
||||
if ( known != null && known.isNotFiltered() && known.getType() == vc.getType() )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// can't handle this situation
|
||||
// todo -- warning, this leads to some missing SNPs at complex loci, such as:
|
||||
// todo -- 591 1 841619 841620 rs4970464 0 - A A -/C/T genomic mixed unknown 0 0 near-gene-3 exact 1
|
||||
// todo -- 591 1 841619 841620 rs62677860 0 + A A C/T genomic single unknown 0 0 near-gene-3 exact 1
|
||||
//
|
||||
//logger.info(String.format("Ignore second+ events at locus %s in rod %s => rec is %s", context.getLocation(), rodList.getName(), rec));
|
||||
|
||||
private VariantContext getVariantContext(String name, RefMetaDataTracker tracker, AlignmentContext context) {
|
||||
if ( tracker != null ) {
|
||||
RODRecordList<ReferenceOrderedDatum> rodList = tracker.getTrackData(name, null);
|
||||
if ( rodList != null ) {
|
||||
for ( ReferenceOrderedDatum rec : rodList.getRecords() ) {
|
||||
if ( rec.getLocation().getStart() == context.getLocation().getStart() ) {
|
||||
VariantContext vc = VariantContextAdaptors.convertToVariantContext(name, rec);
|
||||
if ( vc != null ) {
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer point, Integer sum) {
|
||||
return point + sum;
|
||||
}
|
||||
|
||||
public VariantEvaluator getEvalByName(String name, Set<VariantEvaluator> s) {
|
||||
for ( VariantEvaluator e : s )
|
||||
if ( e.getName().equals(name) )
|
||||
return e;
|
||||
return null;
|
||||
}
|
||||
|
||||
private <T extends Comparable<T>> List<T> sorted(Collection<T> c ) {
|
||||
List<T> l = new ArrayList<T>(c);
|
||||
Collections.sort(l);
|
||||
return l;
|
||||
}
|
||||
|
||||
private final static String CONTEXT_HEADER = "track.subset.novelty.filter";
|
||||
private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split("\\.").length;
|
||||
private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS];
|
||||
static {
|
||||
int i = 0;
|
||||
for ( String elt : CONTEXT_HEADER.split("\\.") )
|
||||
nameSizes[i++] = elt.length();
|
||||
}
|
||||
|
||||
private void determineContextNamePartSizes() {
|
||||
for ( String contextName : sorted(contexts.keySet()) ) {
|
||||
EvaluationContext group = contexts.get(contextName);
|
||||
for ( String evalSubgroupName : sorted(group.keySet()) ) {
|
||||
String keyWord = contextName + "." + evalSubgroupName;
|
||||
String[] parts = keyWord.split("\\.");
|
||||
if ( parts.length != N_CONTEXT_NAME_PARTS ) {
|
||||
throw new StingException("Unexpected number of eval name parts " + keyWord + " length = " + parts.length + ", expected " + N_CONTEXT_NAME_PARTS);
|
||||
} else {
|
||||
for ( int i = 0; i < parts.length; i++ )
|
||||
nameSizes[i] = Math.max(nameSizes[i], parts[i].length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String formatKeyword(String keyWord) {
|
||||
//System.out.printf("keyword %s%n", keyWord);
|
||||
|
||||
StringBuilder s = new StringBuilder();
|
||||
int i = 0;
|
||||
for ( String part : keyWord.split("\\.") ) {
|
||||
//System.out.printf("part %s %d%n", part, nameSizes[i]);
|
||||
s.append(String.format("%"+nameSizes[i]+"s ", part));
|
||||
i++;
|
||||
}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
// todo -- this really needs to be pretty printed; use some kind of table organization
|
||||
// todo -- so that we can load up all of the data in one place, analyze the widths of the columns
|
||||
// todo -- and pretty print it
|
||||
for ( String evalName : variantEvaluationNames ) {
|
||||
boolean first = true;
|
||||
out.printf("%n%n");
|
||||
// todo -- show that comp is dbsnp, etc. is columns
|
||||
for ( String contextName : sorted(contexts.keySet()) ) {
|
||||
EvaluationContext group = contexts.get(contextName);
|
||||
|
||||
out.printf("%s%n", Utils.dupString('-', 80));
|
||||
for ( String evalSubgroupName : sorted(group.keySet()) ) {
|
||||
Set<VariantEvaluator> evalSet = group.get(evalSubgroupName);
|
||||
VariantEvaluator eval = getEvalByName(evalName, evalSet);
|
||||
String keyWord = contextName + "." + evalSubgroupName;
|
||||
|
||||
if ( first ) {
|
||||
out.printf("%20s %s %s%n", evalName, formatKeyword(CONTEXT_HEADER), Utils.join("\t", eval.getTableHeader()));
|
||||
first = false;
|
||||
}
|
||||
|
||||
for ( List<String> row : eval.getTableRows() )
|
||||
out.printf("%20s %s %s%n", evalName, formatKeyword(keyWord), Utils.join("\t", row));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected Logger getLogger() { return logger; }
|
||||
}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
|
||||
|
||||
import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jan 29, 2010
|
||||
* Time: 3:38:02 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class VariantEvaluator {
|
||||
protected boolean accumulateInterestingSites = false, printInterestingSites = false;
|
||||
protected String interestingSitePrefix = null;
|
||||
protected boolean processedASite = false;
|
||||
protected List<VariantContext> interestingSites = new ArrayList<VariantContext>();
|
||||
|
||||
public abstract String getName();
|
||||
|
||||
// do we want to keep track of things that are interesting
|
||||
public void accumulateInterestingSites(boolean enable) { accumulateInterestingSites = enable; }
|
||||
public void printInterestingSites(String prefix) { printInterestingSites = true; interestingSitePrefix = prefix; }
|
||||
public boolean isAccumulatingInterestingSites() { return accumulateInterestingSites; }
|
||||
public List<VariantContext> getInterestingSites() { return interestingSites; }
|
||||
protected void addInterestingSite(String why, VariantContext vc) {
|
||||
if ( accumulateInterestingSites )
|
||||
interestingSites.add(vc);
|
||||
if ( printInterestingSites )
|
||||
System.out.printf("%40s %s%n", interestingSitePrefix, why);
|
||||
}
|
||||
|
||||
public boolean processedAnySites() { return processedASite; }
|
||||
protected void markSiteAsProcessed() { processedASite = true; }
|
||||
|
||||
/** Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 */
|
||||
public abstract int getComparisonOrder();
|
||||
|
||||
/** called at all sites, regardless of eval context itself; useful for counting processed bases */
|
||||
public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
|
||||
public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
|
||||
public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
|
||||
|
||||
public void finalize() {}
|
||||
|
||||
public abstract String toString();
|
||||
|
||||
// making it a table
|
||||
public abstract List<String> getTableHeader();
|
||||
public abstract List<List<String>> getTableRows();
|
||||
|
||||
//
|
||||
// useful common utility routines
|
||||
//
|
||||
protected double rate(long n, long d) {
|
||||
return n / (1.0 * Math.max(d, 1));
|
||||
}
|
||||
|
||||
protected long inverseRate(long n, long d) {
|
||||
return n == 0 ? 0 : d / Math.max(n, 1);
|
||||
}
|
||||
|
||||
protected double ratio(long num, long denom) {
|
||||
return ((double)num) / (Math.max(denom, 1));
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue