VariantEval now understands the difference between a population-level analysis and a genotype analysis, and handles both. All analyses are now annotated as supporting one, the other, or both. This is preparation for genotype chip concordance calculations, as well as called-sites and similar analyses.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1247 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-07-15 14:07:13 +00:00
parent 513d43b5f3
commit 1798aff01b
15 changed files with 282 additions and 39 deletions

View File

@ -8,6 +8,16 @@ import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public abstract class BasicVariantAnalysis implements VariantAnalysis {
protected String name;
protected PrintStream out;

View File

@ -13,13 +13,16 @@ import java.util.HashSet;
import java.io.PrintStream;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class ClusterCounterAnalysis extends BasicVariantAnalysis {
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
AllelicVariant lastVariant = null;

View File

@ -0,0 +1,29 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
* If an analysis implements this interface, it asserts that it performs a genotype based analysis, as
* opposed to a straight variant analysis. The difference here is that, in a straight variant analysis, variants
* are not asserted to be the actual genotype of a particular person, but are really just variation "out-there"
* in a population. A genotype analysis would be something like covered bases, confidently called bases,
* genotyping concordance, etc.
*
*/
public interface GenotypeAnalysis {
    // Marker interface: it declares no members. Implementing it only tags an
    // analysis as one that is applicable to genotype-level evaluation data.
}

View File

@ -0,0 +1,81 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class GenotypeConcordance extends BasicVariantAnalysis implements GenotypeAnalysis {
    /** Name of the reference track that is looked up in the tracker at every site. */
    private String dbName;

    /** Number of sites at which the reference track had a record. */
    private int nDBObs = 0;

    /** Number of sites at which an evaluation variant was present. */
    private int nEvalObs = 0;

    /** Number of sites at which both the reference track and the evaluation set had a record. */
    private int nOverlapping = 0;

    /**
     * Creates the analysis under the fixed analysis name "genotype_concordance".
     *
     * @param name name of the reference track to compare the evaluation variants against
     */
    public GenotypeConcordance(final String name) {
        super("genotype_concordance");
        dbName = name;
    }

    /**
     * Records one observed site in the counters.
     *
     * @param inDB   true if the reference track has a record at the site
     * @param inEval true if the evaluation set has a variant at the site
     */
    public void inc(boolean inDB, boolean inEval) {
        if (inDB) {
            nDBObs++;
            if (inEval) {
                nOverlapping++;
            }
        }
        if (inEval) {
            nEvalObs++;
        }
    }

    /** @return the number of sites counted as present in the reference track */
    public int nDBSites() {
        return nDBObs;
    }

    /** @return the number of sites counted as present in the evaluation set */
    public int nEvalSites() {
        return nEvalObs;
    }

    /** @return the number of sites counted as present in both the reference track and the evaluation set */
    public int nOverlappingSites() {
        return nOverlapping;
    }

    /** @return the number of evaluation sites that were not also found in the reference track */
    public int nNovelSites() {
        final int difference = nEvalSites() - nOverlappingSites();
        return Math.abs(difference);
    }

    /**
     * What fraction of the evaluated site variants were also found in the db?
     *
     * @return overlapping sites divided by evaluation sites; NaN when no evaluation site has been counted
     */
    public double fractionEvalSitesCoveredByDB() {
        return (double) nOverlappingSites() / nEvalSites();
    }

    /**
     * Looks up the reference track at the current site, updates the counters, and reports
     * evaluation variants that have no counterpart in the reference track.
     *
     * @param eval    the evaluation variant at this site, or null if there is none
     * @param tracker source of the reference-track record for this site
     * @param ref     the reference base (not used by this analysis)
     * @param context the locus context (not used by this analysis)
     * @return "Novel " followed by the variant when it is absent from the reference track, otherwise null
     */
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        // A site falls into one of four cases: present in both sets, only in the
        // reference track, only in the evaluation set, or in neither. Only the
        // "evaluation only" case produces a message.
        // NOTE(review): presumably lookup() hands back its second argument (null) when
        // the track has no record at this site -- confirm against RefMetaDataTracker.
        final AllelicVariant chipRecord = (AllelicVariant) tracker.lookup(dbName, null);
        final boolean presentInDB = chipRecord != null;
        final boolean presentInEval = eval != null;

        inc(presentInDB, presentInEval);

        if (presentInDB || !presentInEval) {
            return null;
        }
        return "Novel " + eval;
    }

    /**
     * What fraction of the DB sites were discovered in the evaluation calls?
     *
     * @return overlapping sites divided by reference-track sites; NaN when no reference-track site has been counted
     */
    public double fractionDBSitesDiscoveredInEval() {
        return (double) nOverlappingSites() / nDBSites();
    }

    /**
     * Builds the summary report: one tab-separated line holding all counts and both
     * fractions, followed by one "key value" line per statistic (fractions as percentages).
     *
     * @return the report lines, in output order
     */
    public List<String> done() {
        final int dbSites = nDBSites();
        final int evalSites = nEvalSites();
        final int sharedSites = nOverlappingSites();
        final double evalInDB = fractionEvalSitesCoveredByDB();
        final double dbInEval = fractionDBSitesDiscoveredInEval();

        final List<String> report = new ArrayList<String>();
        report.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", dbSites, evalSites, sharedSites, evalInDB, dbInEval));
        report.add(String.format("name %s", dbName));
        report.add(String.format("n_db_sites %d", dbSites));
        report.add(String.format("n_eval_sites %d", evalSites));
        report.add(String.format("n_overlapping_sites %d", sharedSites));
        report.add(String.format("n_novel_sites %d", nNovelSites()));
        report.add(String.format("per_eval_sites_in_db %.2f", 100 * evalInDB));
        report.add(String.format("per_db_sites_in_eval %.2f", 100 * dbInEval));
        return report;
    }
}

View File

@ -16,13 +16,16 @@ import java.util.List;
import cern.jet.math.Arithmetic;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:00 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis {
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis implements PopulationAnalysis {
private double threshold;
int nSites = 0;
int nViolations = 0;

View File

@ -3,11 +3,14 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: May 27, 2009
* Time: 2:37:56 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class Histogram<T> {
ArrayList<T> data;

View File

@ -14,13 +14,16 @@ import java.io.File;
import java.io.FileNotFoundException;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis {
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<Long> neighborWiseDistances;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};

View File

@ -0,0 +1,29 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
* If an analysis implements this interface, it asserts that it performs a population based analysis, as
* opposed to a genotype analysis. The difference here is that variants are not asserted to be
* the actual genotype of a particular person, but are really just variation "out-there" in a population.
* A genotype analysis, by contrast, would be something like covered bases, confidently called bases, genotyping
* concordance, etc.
*
*/
public interface PopulationAnalysis {
    // Marker interface: it declares no members. Implementing it only tags an
    // analysis as one that is applicable to population-level evaluation data.
}

View File

@ -10,13 +10,16 @@ import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:00 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class TransitionTranversionAnalysis extends BasicVariantAnalysis {
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
int N_TRANSITION_TRANVERSION_BINS = 100;
Histogram<Integer> transitions;
Histogram<Integer> transversions;

View File

@ -7,6 +7,16 @@ import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public interface VariantAnalysis {
public String getName();
public PrintStream getSummaryPrintStream();

View File

@ -8,7 +8,17 @@ import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
public class VariantCounter extends BasicVariantAnalysis {
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
long nBasesCovered = 0;
int nSNPs = 0;

View File

@ -9,7 +9,17 @@ import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
public class VariantDBCoverage extends BasicVariantAnalysis {
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
private String dbName;
private int nDBObs = 0;
private int nEvalObs = 0;

View File

@ -10,6 +10,16 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
import java.io.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
@By(DataSource.REFERENCE)
@Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE)
@ -23,11 +33,15 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
public double badHWEThreshold = 1e-3;
@Argument(shortName="evalContainsGenotypes", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
public boolean evalContainsGenotypes = false;
String analysisFilenameBase = null;
String COMMENT_STRING = "";
final String knownSNPDBName = "dbSNP";
final String genotypeChipName = "hapmap-chip";
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
@ -38,12 +52,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
final String TWOHIT_SNPS = "2plus_hit";
final String KNOWN_SNPS = "known";
final String NOVEL_SNPS = "novel";
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
final String[] POPULATION_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
final String[] GENOTYPE_ANALYSIS_NAMES = { ALL_SNPS, KNOWN_SNPS, NOVEL_SNPS };
String[] ALL_ANALYSIS_NAMES = null;
public void initialize() {
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
// setup the path to the analysis
if ( this.getToolkit().getArguments().outFileName != null ) {
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis.";
analysisFilenameBase = this.getToolkit().getArguments().outFileName + "."; // + ".analysis.";
}
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
@ -64,11 +82,29 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
//
analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage(knownSNPDBName));
analyses.add(new GenotypeConcordance(genotypeChipName));
analyses.add(new TransitionTranversionAnalysis());
analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
analyses.add(new ClusterCounterAnalysis());
//
// Filter out analyses inappropriate for our evaluation type (population or genotype)
//
Iterator<VariantAnalysis> iter = analyses.iterator();
while ( iter.hasNext() ) {
VariantAnalysis analysis = iter.next();
boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis);
boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis);
boolean disable = disableForGenotyping | disableForPopulation;
String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null);
if ( disable ) {
logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName));
iter.remove();
}
}
if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName));
for ( VariantAnalysis analysis : analyses ) {

View File

@ -9,7 +9,17 @@ import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
public class VariantMatcher extends BasicVariantAnalysis {
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantMatcher extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
String dbName;
public VariantMatcher(final String name) {

View File

@ -39,11 +39,14 @@ import java.io.File;
import java.io.FileNotFoundException;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Jun 4, 2009
* Time: 4:38:19 PM
* To change this template use File | Settings | File Templates.
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis {
PrintStream violationsOut = null;