VariantEval now understands the difference between a population-level analysis and a genotype analysis, and handles both. All analyses are now annotated as supporting one, the other, or both. This is preparation for genotype chip concordance calculations, as well as called-sites and similar analyses.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1247 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-07-15 14:07:13 +00:00
parent 513d43b5f3
commit 1798aff01b
15 changed files with 282 additions and 39 deletions

View File

@ -8,6 +8,16 @@ import java.io.PrintStream;
import java.util.List; import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public abstract class BasicVariantAnalysis implements VariantAnalysis { public abstract class BasicVariantAnalysis implements VariantAnalysis {
protected String name; protected String name;
protected PrintStream out; protected PrintStream out;

View File

@ -13,13 +13,16 @@ import java.util.HashSet;
import java.io.PrintStream; import java.io.PrintStream;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: Jun 4, 2009 * This software and its documentation are copyright 2009 by the
* Time: 4:38:19 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public class ClusterCounterAnalysis extends BasicVariantAnalysis { public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<HashSet<GenomeLoc>> variantsWithClusters; ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100}; int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
AllelicVariant lastVariant = null; AllelicVariant lastVariant = null;

View File

@ -0,0 +1,29 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
* Marker interface (it declares no methods). If an analysis implements this interface, it asserts
* that it performs a genotype-based analysis, as opposed to a straight variant analysis. The
* difference is that in a straight variant analysis the variants are not asserted to be the actual
* genotype of a particular person, but are really just variation "out there" in a population,
* whereas a genotype analysis works on calls that are assertions about one person's genotype.
* A genotype analysis would be something like covered bases, confidently called bases, genotyping
* concordance, etc.
*
*/
public interface GenotypeAnalysis {
}

View File

@ -0,0 +1,81 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Counts, site by site, how often an evaluation call and an entry of a named comparison
 * track are present, and reports their overlap.
 *
 * The comparison track is whatever the tracker returns for the name given to the constructor.
 */
public class GenotypeConcordance extends BasicVariantAnalysis implements GenotypeAnalysis {
    /** Name under which the comparison track is looked up in the tracker. */
    private String dbName;
    /** Number of sites at which the comparison track had an entry. */
    private int nDBObs = 0;
    /** Number of sites at which an evaluation variant was supplied. */
    private int nEvalObs = 0;
    /** Number of sites at which both were present. */
    private int nOverlapping = 0;

    /**
     * @param name name of the comparison track to look up on every call to update()
     */
    public GenotypeConcordance(final String name) {
        super("genotype_concordance");
        this.dbName = name;
    }

    /**
     * Records one site.
     *
     * @param inDB   true if the comparison track has an entry at this site
     * @param inEval true if the evaluation set has a variant at this site
     */
    public void inc(boolean inDB, boolean inEval) {
        if (inDB) {
            nDBObs++;
            // A site only counts as overlapping when both sources have it.
            if (inEval) {
                nOverlapping++;
            }
        }
        if (inEval) {
            nEvalObs++;
        }
    }

    public int nDBSites() {
        return nDBObs;
    }

    public int nEvalSites() {
        return nEvalObs;
    }

    public int nOverlappingSites() {
        return nOverlapping;
    }

    /**
     * @return the number of evaluation sites that were not also present in the comparison track,
     *         computed as the absolute difference between the eval and overlapping counts
     */
    public int nNovelSites() {
        final int evalMinusOverlap = nEvalSites() - nOverlappingSites();
        return Math.abs(evalMinusOverlap);
    }

    /**
     * What fraction of the evaluated site variants were also found in the db?
     *
     * @return overlapping sites divided by eval sites; the division is done in floating point,
     *         so the result is NaN (not an exception) while no eval site has been counted
     */
    public double fractionEvalSitesCoveredByDB() {
        return (double) nOverlappingSites() / nEvalSites();
    }

    /**
     * What fraction of the DB sites were discovered in the evaluation calls?
     *
     * @return overlapping sites divided by db sites; the division is done in floating point,
     *         so the result is NaN (not an exception) while no db site has been counted
     */
    public double fractionDBSitesDiscoveredInEval() {
        return (double) nOverlappingSites() / nDBSites();
    }

    /**
     * Updates the counters for one site.
     *
     * There are four cases here, by presence in the comparison track / in eval:
     *   both present     - db, eval and overlap counters all advance
     *   comparison only  - only the db counter advances
     *   eval only        - only the eval counter advances, and the site is reported as novel
     *   neither          - nothing changes
     *
     * @param eval    the evaluation variant at this site, or null if there is none
     * @param tracker source of the comparison track entry, looked up by dbName
     * @param ref     not used by this analysis
     * @param context not used by this analysis
     * @return "Novel " followed by the eval variant when eval is present but the comparison
     *         track has no entry; null in every other case
     */
    public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
        final AllelicVariant comparison = (AllelicVariant) tracker.lookup(dbName, null);
        final boolean inDB = comparison != null;
        final boolean inEval = eval != null;

        inc(inDB, inEval);

        if (inEval && !inDB) {
            return "Novel " + eval;
        }
        return null;
    }

    /**
     * Builds the final report.
     *
     * @return a new list whose first entry is a tab-separated summary (db sites, eval sites,
     *         overlapping sites, then the two fractions) followed by one "key value" line per
     *         statistic, with the two fractions expressed as percentages
     */
    public List<String> done() {
        final int dbSites = nDBSites();
        final int evalSites = nEvalSites();
        final int overlappingSites = nOverlappingSites();
        final double evalInDB = fractionEvalSitesCoveredByDB();
        final double dbInEval = fractionDBSitesDiscoveredInEval();

        final List<String> report = new ArrayList<String>();
        report.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", dbSites, evalSites, overlappingSites, evalInDB, dbInEval));
        report.add(String.format("name %s", dbName));
        report.add(String.format("n_db_sites %d", dbSites));
        report.add(String.format("n_eval_sites %d", evalSites));
        report.add(String.format("n_overlapping_sites %d", overlappingSites));
        report.add(String.format("n_novel_sites %d", nNovelSites()));
        report.add(String.format("per_eval_sites_in_db %.2f", 100 * evalInDB));
        report.add(String.format("per_db_sites_in_eval %.2f", 100 * dbInEval));
        return report;
    }
}

View File

@ -16,13 +16,16 @@ import java.util.List;
import cern.jet.math.Arithmetic; import cern.jet.math.Arithmetic;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: Jun 4, 2009 * This software and its documentation are copyright 2009 by the
* Time: 4:38:00 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis { public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis implements PopulationAnalysis {
private double threshold; private double threshold;
int nSites = 0; int nSites = 0;
int nViolations = 0; int nViolations = 0;

View File

@ -3,11 +3,14 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import java.util.ArrayList; import java.util.ArrayList;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: May 27, 2009 * This software and its documentation are copyright 2009 by the
* Time: 2:37:56 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public class Histogram<T> { public class Histogram<T> {
ArrayList<T> data; ArrayList<T> data;

View File

@ -14,13 +14,16 @@ import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: Jun 4, 2009 * This software and its documentation are copyright 2009 by the
* Time: 4:38:19 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis { public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<Long> neighborWiseDistances; ArrayList<Long> neighborWiseDistances;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000}; int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};

View File

@ -0,0 +1,29 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
* Marker interface (it declares no methods). If an analysis implements this interface, it asserts
* that it performs a population-level (straight variant) analysis, as opposed to a genotype-based
* analysis. The difference here is that the variants are not asserted to be the actual genotype of
* a particular person, but are really just variation "out there" in a population.
* A genotype analysis, by contrast, would be something like covered bases, confidently called
* bases, genotyping concordance, etc.
*
*/
public interface PopulationAnalysis {
}

View File

@ -10,13 +10,16 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: Jun 4, 2009 * This software and its documentation are copyright 2009 by the
* Time: 4:38:00 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public class TransitionTranversionAnalysis extends BasicVariantAnalysis { public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
int N_TRANSITION_TRANVERSION_BINS = 100; int N_TRANSITION_TRANVERSION_BINS = 100;
Histogram<Integer> transitions; Histogram<Integer> transitions;
Histogram<Integer> transversions; Histogram<Integer> transversions;

View File

@ -7,6 +7,16 @@ import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.List; import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public interface VariantAnalysis { public interface VariantAnalysis {
public String getName(); public String getName();
public PrintStream getSummaryPrintStream(); public PrintStream getSummaryPrintStream();

View File

@ -8,7 +8,17 @@ import java.io.PrintStream;
import java.util.List; import java.util.List;
import java.util.ArrayList; import java.util.ArrayList;
public class VariantCounter extends BasicVariantAnalysis { /**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
long nBasesCovered = 0; long nBasesCovered = 0;
int nSNPs = 0; int nSNPs = 0;

View File

@ -9,7 +9,17 @@ import java.util.List;
import java.util.Arrays; import java.util.Arrays;
import java.util.ArrayList; import java.util.ArrayList;
public class VariantDBCoverage extends BasicVariantAnalysis { /**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
private String dbName; private String dbName;
private int nDBObs = 0; private int nDBObs = 0;
private int nEvalObs = 0; private int nEvalObs = 0;

View File

@ -10,6 +10,16 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*; import java.util.*;
import java.io.*; import java.io.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
@By(DataSource.REFERENCE) @By(DataSource.REFERENCE)
@Requires(DataSource.REFERENCE) @Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE) @Allows(DataSource.REFERENCE)
@ -23,11 +33,15 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(shortName="badHWEThreshold", doc="XXX", required=false) @Argument(shortName="badHWEThreshold", doc="XXX", required=false)
public double badHWEThreshold = 1e-3; public double badHWEThreshold = 1e-3;
@Argument(shortName="evalContainsGenotypes", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
public boolean evalContainsGenotypes = false;
String analysisFilenameBase = null; String analysisFilenameBase = null;
String COMMENT_STRING = ""; String COMMENT_STRING = "";
final String knownSNPDBName = "dbSNP"; final String knownSNPDBName = "dbSNP";
final String genotypeChipName = "hapmap-chip";
HashMap<String, ArrayList<VariantAnalysis>> analysisSets; HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
@ -38,12 +52,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
final String TWOHIT_SNPS = "2plus_hit"; final String TWOHIT_SNPS = "2plus_hit";
final String KNOWN_SNPS = "known"; final String KNOWN_SNPS = "known";
final String NOVEL_SNPS = "novel"; final String NOVEL_SNPS = "novel";
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS }; final String[] POPULATION_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
final String[] GENOTYPE_ANALYSIS_NAMES = { ALL_SNPS, KNOWN_SNPS, NOVEL_SNPS };
String[] ALL_ANALYSIS_NAMES = null;
public void initialize() { public void initialize() {
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
// setup the path to the analysis // setup the path to the analysis
if ( this.getToolkit().getArguments().outFileName != null ) { if ( this.getToolkit().getArguments().outFileName != null ) {
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis."; analysisFilenameBase = this.getToolkit().getArguments().outFileName + "."; // + ".analysis.";
} }
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>(); analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
@ -64,11 +82,29 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
// //
analyses.add(new VariantCounter()); analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage(knownSNPDBName)); analyses.add(new VariantDBCoverage(knownSNPDBName));
analyses.add(new GenotypeConcordance(genotypeChipName));
analyses.add(new TransitionTranversionAnalysis()); analyses.add(new TransitionTranversionAnalysis());
analyses.add(new NeighborDistanceAnalysis()); analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold)); analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
analyses.add(new ClusterCounterAnalysis()); analyses.add(new ClusterCounterAnalysis());
//
// Filter out analyses inappropriate for our evaluation type (Population or Genotype)
//
Iterator<VariantAnalysis> iter = analyses.iterator();
while ( iter.hasNext() ) {
VariantAnalysis analysis = iter.next();
boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis);
boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis);
boolean disable = disableForGenotyping | disableForPopulation;
String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null);
if ( disable ) {
logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName));
iter.remove();
}
}
if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName)); if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName));
for ( VariantAnalysis analysis : analyses ) { for ( VariantAnalysis analysis : analyses ) {

View File

@ -9,7 +9,17 @@ import java.util.List;
import java.util.Arrays; import java.util.Arrays;
import java.util.ArrayList; import java.util.ArrayList;
public class VariantMatcher extends BasicVariantAnalysis { /**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantMatcher extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
String dbName; String dbName;
public VariantMatcher(final String name) { public VariantMatcher(final String name) {

View File

@ -39,11 +39,14 @@ import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
/** /**
* Created by IntelliJ IDEA. * The Broad Institute
* User: depristo * SOFTWARE COPYRIGHT NOTICE AGREEMENT
* Date: Jun 4, 2009 * This software and its documentation are copyright 2009 by the
* Time: 4:38:19 PM * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* To change this template use File | Settings | File Templates. *
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/ */
public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis { public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis {
PrintStream violationsOut = null; PrintStream violationsOut = null;