VariantEval now understands the difference between a population-level analysis and a genotype analysis, and handles both. All analyses are annotated as supporting one, the other, or both. This is preparation for genotype chip concordance calculations, as well as called-sites and similar analyses.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1247 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
513d43b5f3
commit
1798aff01b
|
|
@ -8,6 +8,16 @@ import java.io.PrintStream;
|
|||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public abstract class BasicVariantAnalysis implements VariantAnalysis {
|
||||
protected String name;
|
||||
protected PrintStream out;
|
||||
|
|
|
|||
|
|
@ -13,13 +13,16 @@ import java.util.HashSet;
|
|||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis {
|
||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||
AllelicVariant lastVariant = null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
* If an analysis implements this interface, it asserts that it performs a genotype based analysis, as
|
||||
* opposed to a straight variant analysis. The difference is that, in a straight variant analysis, variants are not asserted to be
|
||||
* the actual genotype of a particular person, but are really just variation "out-there" in a population.
|
||||
* A genotype analysis would be something like covered bases, confidently called bases, genotyping
|
||||
* concordance, etc.
|
||||
*
|
||||
*/
|
||||
// Marker interface with no members. VariantEvalWalker tests for it with instanceof and removes
// analyses that do not implement it when evalContainsGenotypes is true.
public interface GenotypeAnalysis {
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
// Counts, per call to update(), whether the track named dbName and the eval variant are present,
// and reports the raw counts and overlap fractions from done().
public class GenotypeConcordance extends BasicVariantAnalysis implements GenotypeAnalysis {
|
||||
// Name passed to tracker.lookup() in update() to fetch the comparison variant.
private String dbName;
|
||||
private int nDBObs = 0;
|
||||
private int nEvalObs = 0;
|
||||
private int nOverlapping = 0;
|
||||
|
||||
// The analysis name handed to the superclass is fixed; the 'name' argument is only the track to look up.
public GenotypeConcordance(final String name) {
|
||||
super("genotype_concordance");
|
||||
dbName = name;
|
||||
}
|
||||
|
||||
// Records one observation: bumps the db and/or eval counter, and the overlap counter when both flags are set.
public void inc(boolean inDB, boolean inEval) {
|
||||
if (inDB) nDBObs++;
|
||||
if (inEval) nEvalObs++;
|
||||
if (inDB && inEval) nOverlapping++;
|
||||
}
|
||||
|
||||
public int nDBSites() { return nDBObs; }
|
||||
public int nEvalSites() { return nEvalObs; }
|
||||
public int nOverlappingSites() { return nOverlapping; }
|
||||
// Eval observations with no db overlap. inc() only increments nOverlapping together with nEvalObs,
// so the difference is never negative and Math.abs has no effect here.
public int nNovelSites() { return Math.abs(nEvalSites() - nOverlappingSites()); }
|
||||
|
||||
/**
|
||||
* What fraction of the evaluated site variants were also found in the db?
|
||||
*
|
||||
* @return nOverlappingSites() divided by nEvalSites() as a double; NaN when no eval sites were counted
|
||||
*/
|
||||
public double fractionEvalSitesCoveredByDB() {
|
||||
return nOverlappingSites() / (1.0 * nEvalSites());
|
||||
}
|
||||
|
||||
// Looks up the dbName track at this site, updates the counters, and returns a "Novel ..." string
// only when eval is present and the db track has nothing; otherwise returns null.
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// There are four cases here -- presumably db present/absent x eval present/absent; inc() handles all of them
|
||||
// Named dbsnp, but this is whatever track dbName refers to.
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
|
||||
inc(dbsnp != null, eval != null);
|
||||
return dbsnp == null && eval != null ? "Novel " + eval : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* What fraction of the DB sites were discovered in the evaluation calls?
|
||||
*
|
||||
* @return nOverlappingSites() divided by nDBSites() as a double; NaN when no db sites were counted
|
||||
*/
|
||||
public double fractionDBSitesDiscoveredInEval() {
|
||||
return nOverlappingSites() / (1.0 * nDBSites());
|
||||
}
|
||||
|
||||
// Builds the report lines: first a tab-separated one-line summary, then one "key value" line per
// statistic. The per_* entries are the fractions above multiplied by 100.
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
|
||||
s.add(String.format("name %s", dbName));
|
||||
s.add(String.format("n_db_sites %d", nDBSites()));
|
||||
s.add(String.format("n_eval_sites %d", nEvalSites()));
|
||||
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
|
||||
s.add(String.format("n_novel_sites %d", nNovelSites()));
|
||||
s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
|
||||
s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -16,13 +16,16 @@ import java.util.List;
|
|||
import cern.jet.math.Arithmetic;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:00 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis {
|
||||
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis implements PopulationAnalysis {
|
||||
private double threshold;
|
||||
int nSites = 0;
|
||||
int nViolations = 0;
|
||||
|
|
|
|||
|
|
@ -3,11 +3,14 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
|||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: May 27, 2009
|
||||
* Time: 2:37:56 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class Histogram<T> {
|
||||
ArrayList<T> data;
|
||||
|
|
|
|||
|
|
@ -14,13 +14,16 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis {
|
||||
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
ArrayList<Long> neighborWiseDistances;
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
* If an analysis implements this interface, it asserts that it performs a population-level analysis, as
|
||||
* opposed to a genotype based analysis. The difference here is that variants are not asserted to be
|
||||
* the actual genotype of a particular person, but are really just variation "out-there" in a population.
|
||||
* A genotype analysis, by contrast, would be something like covered bases, confidently called bases, genotyping
|
||||
* concordance, etc.
|
||||
*
|
||||
*/
|
||||
// Marker interface with no members. VariantEvalWalker tests for it with instanceof and removes
// analyses that do not implement it when evalContainsGenotypes is false.
public interface PopulationAnalysis {
|
||||
|
||||
}
|
||||
|
|
@ -10,13 +10,16 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:00 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis {
|
||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
||||
Histogram<Integer> transitions;
|
||||
Histogram<Integer> transversions;
|
||||
|
|
|
|||
|
|
@ -7,6 +7,16 @@ import org.broadinstitute.sting.gatk.LocusContext;
|
|||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public interface VariantAnalysis {
|
||||
public String getName();
|
||||
public PrintStream getSummaryPrintStream();
|
||||
|
|
|
|||
|
|
@ -8,7 +8,17 @@ import java.io.PrintStream;
|
|||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantCounter extends BasicVariantAnalysis {
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
long nBasesCovered = 0;
|
||||
int nSNPs = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,17 @@ import java.util.List;
|
|||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantDBCoverage extends BasicVariantAnalysis {
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
private String dbName;
|
||||
private int nDBObs = 0;
|
||||
private int nEvalObs = 0;
|
||||
|
|
|
|||
|
|
@ -10,6 +10,16 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
@By(DataSource.REFERENCE)
|
||||
@Requires(DataSource.REFERENCE)
|
||||
@Allows(DataSource.REFERENCE)
|
||||
|
|
@ -23,11 +33,15 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
|
||||
public double badHWEThreshold = 1e-3;
|
||||
|
||||
@Argument(shortName="evalContainsGenotypes", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
|
||||
public boolean evalContainsGenotypes = false;
|
||||
|
||||
String analysisFilenameBase = null;
|
||||
|
||||
String COMMENT_STRING = "";
|
||||
|
||||
final String knownSNPDBName = "dbSNP";
|
||||
final String genotypeChipName = "hapmap-chip";
|
||||
|
||||
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
|
||||
|
||||
|
|
@ -38,12 +52,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
final String TWOHIT_SNPS = "2plus_hit";
|
||||
final String KNOWN_SNPS = "known";
|
||||
final String NOVEL_SNPS = "novel";
|
||||
final String[] ALL_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
|
||||
final String[] POPULATION_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
|
||||
final String[] GENOTYPE_ANALYSIS_NAMES = { ALL_SNPS, KNOWN_SNPS, NOVEL_SNPS };
|
||||
String[] ALL_ANALYSIS_NAMES = null;
|
||||
|
||||
public void initialize() {
|
||||
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
|
||||
|
||||
// setup the path to the analysis
|
||||
if ( this.getToolkit().getArguments().outFileName != null ) {
|
||||
analysisFilenameBase = this.getToolkit().getArguments().outFileName + ".analysis.";
|
||||
analysisFilenameBase = this.getToolkit().getArguments().outFileName + "."; // + ".analysis.";
|
||||
}
|
||||
|
||||
analysisSets = new HashMap<String, ArrayList<VariantAnalysis>>();
|
||||
|
|
@ -64,11 +82,29 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
//
|
||||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage(knownSNPDBName));
|
||||
analyses.add(new GenotypeConcordance(genotypeChipName));
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
analyses.add(new ClusterCounterAnalysis());
|
||||
|
||||
//
|
||||
// Filter out analyses inappropriate for our evaluation type (Population or Genotype)
|
||||
//
|
||||
Iterator<VariantAnalysis> iter = analyses.iterator();
|
||||
while ( iter.hasNext() ) {
|
||||
VariantAnalysis analysis = iter.next();
|
||||
boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis);
|
||||
boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis);
|
||||
boolean disable = disableForGenotyping | disableForPopulation;
|
||||
String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null);
|
||||
if ( disable ) {
|
||||
logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName));
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if ( printVariants ) analyses.add(new VariantMatcher(knownSNPDBName));
|
||||
|
||||
for ( VariantAnalysis analysis : analyses ) {
|
||||
|
|
|
|||
|
|
@ -9,7 +9,17 @@ import java.util.List;
|
|||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class VariantMatcher extends BasicVariantAnalysis {
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class VariantMatcher extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
String dbName;
|
||||
|
||||
public VariantMatcher(final String name) {
|
||||
|
|
|
|||
|
|
@ -39,11 +39,14 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Jun 4, 2009
|
||||
* Time: 4:38:19 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis {
|
||||
PrintStream violationsOut = null;
|
||||
|
|
|
|||
Loading…
Reference in New Issue