Significant output formatting improvements. SNPs as indels analysis. heterozygosity rate calculations

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1478 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-08-29 21:49:09 +00:00
parent bde67428fd
commit 6ab9ddf9f5
9 changed files with 151 additions and 124 deletions

View File

@ -20,14 +20,16 @@ import java.util.ArrayList;
*/
public abstract class BasicVariantAnalysis implements VariantAnalysis {
protected String name;
protected PrintStream out;
protected VariantEvalWalker master;
protected PrintStream out, callOut;
private VariantEvalWalker master;
protected String filename;
public BasicVariantAnalysis(String name) {
this.name = name;
}
public VariantEvalWalker getMaster() { return master; }
public String getName() {
return name;
}
@ -36,9 +38,10 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis {
return new ArrayList<String>();
}
public void initialize(VariantEvalWalker master, PrintStream out, String filename) {
public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename) {
this.master = master;
this.out = out;
this.callOut = callOut;
this.filename = filename;
}
@ -47,7 +50,7 @@ public abstract class BasicVariantAnalysis implements VariantAnalysis {
}
public PrintStream getCallPrintStream() {
return out;
return callOut;
}
public List<String> done() {

View File

@ -22,6 +22,7 @@ import java.util.HashSet;
*/
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<HashSet<GenomeLoc>> variantsWithClusters;
int minDistanceForFlagging = 5;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
AllelicVariant lastVariant = null;
GenomeLoc lastVariantInterval = null;
@ -60,7 +61,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
variantsWithClusters.get(i).add(eL);
}
}
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
r = d <= minDistanceForFlagging ? String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString()) : null;
}
}
}

View File

@ -22,7 +22,7 @@ import cern.jet.math.Arithmetic;
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class HardyWeinbergEquilibrium extends ViolationVariantAnalysis implements PopulationAnalysis {
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis implements PopulationAnalysis {
private double threshold;
int nSites = 0;
int nViolations = 0;

View File

@ -21,8 +21,9 @@ import java.io.PrintStream;
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
public class NeighborDistanceAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
ArrayList<Long> neighborWiseDistances;
int minDistanceForFlagging = 5;
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
AllelicVariant lastVariant = null;
@ -51,7 +52,7 @@ public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implement
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
} else {
neighborWiseDistances.add(d);
r = String.format("neighbor-distance %d %s %s", d, eL, lvL);
r = d <= minDistanceForFlagging ? String.format("neighbor-distance %d %s %s", d, eL, lvL) : null;
}
}
}

View File

@ -22,7 +22,7 @@ public interface VariantAnalysis {
public PrintStream getSummaryPrintStream();
public PrintStream getCallPrintStream();
public List<String> getParams();
public void initialize(VariantEvalWalker master, PrintStream out, String filename);
public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename);
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, AlignmentContext context);
public void finalize(long nSites);
public List<String> done();

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenotypeUtils;
import java.util.List;
import java.util.ArrayList;
@ -20,6 +21,7 @@ import java.util.ArrayList;
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
long nBasesCovered = 0;
int nSNPs = 0;
int nHets = 0;
public VariantCounter() {
super("variant_counts");
@ -27,6 +29,10 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
nSNPs += eval == null ? 0 : 1;
if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isSNP() && GenotypeUtils.isHet(eval) )
nHets++;
return null;
}
@ -38,12 +44,31 @@ public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnal
nBasesCovered = nSites;
}
private double variantRate(int n) {
return n / (1.0 * Math.max(nBasesCovered, 1));
}
private long variantRateInverse(int n) {
return nBasesCovered / Math.max(n, 1);
}
public List<String> done() {
List<String> s = new ArrayList<String>();
s.add(String.format("n bases covered: %d", nBasesCovered));
s.add(String.format("sites: %d", nSNPs));
s.add(String.format("variant rate: %.5f confident variants per base", nSNPs / (1.0 * Math.max(nBasesCovered, 1))));
s.add(String.format("variant rate: 1 / %d confident variants per base", nBasesCovered / Math.max(nSNPs, 1)));
s.add(String.format("n bases covered %d", nBasesCovered));
s.add(String.format("variants %d", nSNPs));
s.add(String.format("variant rate %.5f confident variants per base", variantRate(nSNPs)));
s.add(String.format("variant rate 1 / %d confident variants per base [human single sample genome-wide expectation is ~1 / 666]", variantRateInverse(nSNPs)));
if ( this.getMaster().evalContainsGenotypes ) {
s.add(String.format("heterozygotes %d", nHets));
s.add(String.format("homozygotes %d", nSNPs - nHets));
s.add(String.format("heterozygosity %.5f confident hets per base", variantRate(nHets)));
s.add(String.format("heterozygosity 1 / %d confident hets per base [human single sample expectation is ~1 / 1000]", variantRateInverse(nHets)));
s.add(String.format("het to hom ratio %.2f confident hets per confident homozygote non-refs [human single sample genome-wide expectation is 2:1]",
((double)nHets) / (Math.max(nSNPs - nHets, 1))));
}
return s;
}
}

View File

@ -19,10 +19,12 @@ import java.util.ArrayList;
*/
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
private String dbName;
private int nDBObs = 0;
private int nDBSNPs = 0;
private int nDBIndels = 0;
private int nEvalObs = 0;
private int nOverlapping = 0;
private int nConcordant = 0;
private int nSNPsCalledAtIndels = 0;
public VariantDBCoverage(final String name) {
super("db_coverage");
@ -33,33 +35,38 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
boolean inDB = dbSNP != null;
boolean inEval = eval != null;
if (inDB) nDBObs++;
if (inDB ) {
if ( dbSNP.isSNP() ) nDBSNPs++;
if ( dbSNP.isIndel() ) nDBIndels++;
//System.out.printf("snp=%b ins=%b del=%b indel=%b %s%n", dbSNP.isSNP(), dbSNP.isInsertion(), dbSNP.isDeletion(), dbSNP.isIndel(), dbSNP);
}
if (inEval) nEvalObs++;
if (inDB && inEval) {
nOverlapping++;
if ( dbSNP.isSNP() ) { // changes the calculation a bit
nOverlapping++;
if ( ! discordantP(dbSNP, eval) )
nConcordant++;
if ( ! discordantP(dbSNP, eval) )
nConcordant++;
}
if ( dbSNP.isIndel() && eval.isSNP() )
nSNPsCalledAtIndels++;
}
}
public int nDBSites() { return nDBObs; }
public int nDBSNPs() { return nDBSNPs; }
public int nDBIndels() { return nDBIndels; }
public int nEvalSites() { return nEvalObs; }
public int nOverlappingSites() { return nOverlapping; }
public int nConcordant() { return nConcordant; }
public int nNovelSites() { return Math.abs(nEvalSites() - nOverlappingSites()); }
public int nSNPsAtIndels() { return nSNPsCalledAtIndels; }
public boolean discordantP(AllelicVariant dbSNP, AllelicVariant eval) {
if (dbSNP != null && dbSNP.isSNP() && eval != null ) {
boolean concordant = dbSNP.getAltSnpFWD() == eval.getAltSnpFWD() || dbSNP.getRefSnpFWD() == eval.getAltSnpFWD();
//System.out.printf("dbSNP=%s | %c, eval=%s | %c, concordant=%b %s %s%n",
// dbSNP.getGenotype().get(0), dbSNP.getAltSnpFWD(),
// eval.getGenotype().get(0), eval.getAltSnpFWD(),
// concordant,
// dbSNP, eval);
return ! concordant;
return ! (dbSNP.getAltSnpFWD() == eval.getAltSnpFWD() || dbSNP.getRefSnpFWD() == eval.getAltSnpFWD());
} else {
return false;
}
@ -83,8 +90,20 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
// There are four cases here:
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
boolean isSNP = dbsnp != null && dbsnp.isSNP();
inc(isSNP ? dbsnp : null, eval);
return ! isSNP && eval != null ? "Novel " + eval : (discordantP(dbsnp, eval) ? (String.format("Discordant DBSNP=%s %s", dbsnp.getGenotype().get(0), eval)) : null);
inc(dbsnp, eval);
if ( dbsnp != null && eval != null ) {
if ( dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval) ) {
return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval);
} else if ( dbsnp.isIndel() && eval.isSNP() ) {
return String.format("SNP-at-indel DBSNP=%s %s", dbsnp.getGenotype().get(0), eval);
} else {
return null;
// return "Novel " + eval;
}
} else {
return null;
}
}
/**
@ -93,21 +112,30 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
* @return
*/
public double fractionDBSitesDiscoveredInEval() {
return nOverlappingSites() / (1.0 * nDBSites());
return nOverlappingSites() / (1.0 * nDBSNPs());
}
public List<String> done() {
List<String> s = new ArrayList<String>();
s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
s.add(String.format("name %s", dbName));
s.add(String.format("n_db_sites %d", nDBSites()));
s.add(String.format("n_eval_sites %d", nEvalSites()));
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
s.add(String.format("n_concordant %d", nConcordant()));
s.add(String.format("n_novel_sites %d", nNovelSites()));
s.add(String.format("per_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
s.add(String.format("concordance_rate %.2f", 100*concordanceRate()));
s.add(String.format("per_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
//s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSNPs(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
s.add(String.format("name %s", dbName));
s.add(String.format("n_db_sites %d", nDBSNPs() + nDBIndels()));
s.add(String.format("n_db_snps %d", nDBSNPs()));
s.add(String.format("n_db_indels %d", nDBIndels()));
s.add(String.format("n_eval_sites %d", nEvalSites()));
s.add(String.format("n_overlapping_sites %d", nOverlappingSites()));
s.add(String.format("n_concordant %d", nConcordant()));
s.add(String.format("n_novel_sites %d", nNovelSites()));
s.add(String.format("percent_eval_sites_in_db %.2f", 100*fractionEvalSitesCoveredByDB()));
s.add(String.format("concordance_rate %.2f", 100*concordanceRate()));
s.add(String.format("percent_db_sites_in_eval %.2f", 100*fractionDBSitesDiscoveredInEval()));
s.add(String.format("n_snp_calls_at_indels %d", nSNPsAtIndels()));
s.add(String.format("percent_calls_at_indels %.2f", nSNPsAtIndels() / (0.01 * nEvalSites())));
return s;
}
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
@ -27,8 +28,8 @@ import java.util.*;
*
*/
@By(DataSource.REFERENCE)
@Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE)
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=AllelicVariant.class)})
@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=AllelicVariant.class), @RMD(name="dbsnp",type=AllelicVariant.class),@RMD(name="hapmap-chip",type=AllelicVariant.class), @RMD(name="interval",type=IntervalRod.class)})
public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(shortName="minConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
public int minConfidenceScore = -1;
@ -39,18 +40,27 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(shortName="badHWEThreshold", doc="XXX", required=false)
public double badHWEThreshold = 1e-3;
@Argument(shortName="evalContainsGenotypes", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
@Argument(fullName="evalContainsGenotypes", shortName = "G", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
public boolean evalContainsGenotypes = false;
String analysisFilenameBase = null;
@Argument(fullName="explode", shortName = "E", doc="Old style formatting, with each analysis split into separate files.", required=false)
public boolean explode = false;
String COMMENT_STRING = "";
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
public boolean includeViolations = false;
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
public boolean extensiveSubsets = false;
String analysisFilenameBase = null;
final String knownSNPDBName = "dbSNP";
final String genotypeChipName = "hapmap-chip";
HashMap<String, ArrayList<VariantAnalysis>> analysisSets;
PrintStream perLocusStream = null;
long nSites = 0;
final String ALL_SNPS = "all";
@ -60,10 +70,13 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
final String NOVEL_SNPS = "novel";
final String[] POPULATION_ANALYSIS_NAMES = { ALL_SNPS, SINGLETON_SNPS, TWOHIT_SNPS, KNOWN_SNPS, NOVEL_SNPS };
final String[] GENOTYPE_ANALYSIS_NAMES = { ALL_SNPS, KNOWN_SNPS, NOVEL_SNPS };
final String[] SIMPLE_ANALYSIS_NAMES = { ALL_SNPS };
String[] ALL_ANALYSIS_NAMES = null;
public void initialize() {
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
ALL_ANALYSIS_NAMES = SIMPLE_ANALYSIS_NAMES;
if ( extensiveSubsets )
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
// setup the path to the analysis
if ( this.getToolkit().getArguments().outFileName != null ) {
@ -77,7 +90,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
private ArrayList<VariantAnalysis> getAnalysisSet(final String name) {
return analysisSets.get(name);
return analysisSets.containsKey(name) ? analysisSets.get(name) : null;
}
private ArrayList<VariantAnalysis> initializeAnalysisSet(final String setName) {
@ -137,12 +150,20 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
public void initializeAnalysisOutputStream(final String setName, VariantAnalysis analysis) {
final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());
if ( filename == null )
analysis.initialize(this, out, filename);
try {
if ( perLocusStream == null )
perLocusStream = filename == null ? out : new PrintStream(new File(analysisFilenameBase + "interesting_sites"));
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
}
if ( filename == null || ! explode )
analysis.initialize(this, out, perLocusStream, filename);
else {
File file = new File(filename);
try {
analysis.initialize(this, new PrintStream(new FileOutputStream(file)), filename);
analysis.initialize(this, new PrintStream(new FileOutputStream(file)), perLocusStream, filename);
} catch (FileNotFoundException e) {
throw new StingException("Couldn't open analysis output file " + filename, e);
}
@ -191,9 +212,13 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
public void updateAnalysisSet(final String analysisSetName, AllelicVariant eval,
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
// Iterate over each analysis, and update it
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
String s = analysis.update(eval, tracker, ref, context);
if ( s != null ) analysis.getCallPrintStream().println(s);
if ( getAnalysisSet(analysisSetName) != null ) {
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
String s = analysis.update(eval, tracker, ref, context);
if ( s != null && includeViolations ) {
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
}
}
}
}
@ -212,20 +237,26 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
}
private String getLineHeader( final String analysisSetName, final String keyword, final String analysis) {
String s = Utils.join(",", Arrays.asList(analysisSetName, keyword, analysis));
return s + Utils.dupString(' ', 50 - s.length());
}
private void printAnalysisSet( final String analysisSetName ) {
out.printf("Writing analysis set %s", analysisSetName);
//out.printf("Writing analysis set %s", analysisSetName);
Date now = new Date();
for ( VariantAnalysis analysis : getAnalysisSet(analysisSetName) ) {
String header = getLineHeader(analysisSetName, "summary", analysis.getName());
analysis.finalize(nSites);
PrintStream stream = analysis.getSummaryPrintStream();
stream.printf("%s%s%n", COMMENT_STRING, Utils.dupString('-', 78));
stream.printf("%sAnalysis set %s%n", COMMENT_STRING, analysisSetName);
stream.printf("%sAnalysis name %s%n", COMMENT_STRING, analysis.getName());
stream.printf("%sAnalysis params %s%n", COMMENT_STRING, Utils.join(" ", analysis.getParams()));
stream.printf("%sAnalysis class %s%n", COMMENT_STRING, analysis );
stream.printf("%sAnalysis time %s%n", COMMENT_STRING, now );
stream.printf("%s%s%n", header, Utils.dupString('-', 78));
//stream.printf("%s Analysis set %s%n", analysisSetName, , analysisSetName);
stream.printf("%sAnalysis name %s%n", header, analysis.getName());
stream.printf("%sAnalysis params %s%n", header, Utils.join(" ", analysis.getParams()));
stream.printf("%sAnalysis class %s%n", header, analysis);
stream.printf("%sAnalysis time %s%n", header, now);
for ( String line : analysis.done()) {
stream.printf("%s%s%n", COMMENT_STRING, line);
stream.printf("%s%s%n", header, line);
}
}
}

View File

@ -1,62 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import java.io.PrintStream;
import java.io.File;
import java.io.FileNotFoundException;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public abstract class ViolationVariantAnalysis extends BasicVariantAnalysis {
PrintStream violationsOut = null;
public ViolationVariantAnalysis(final String name) {
super(name);
}
public void initialize(VariantEvalWalker master, PrintStream out, String filename) {
super.initialize(master, out, filename);
try {
violationsOut = filename == null ? out : new PrintStream(new File(filename + ".violations"));
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
}
}
public PrintStream getCallPrintStream() {
return violationsOut;
}
}