Removing VariantEval (v1); everyone should be using VE2 now. Docs coming ASAP.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3177 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-04-15 19:53:02 +00:00
parent 3330e254a9
commit ac9dc0b4b4
26 changed files with 1 additions and 2321 deletions

View File

@ -1,59 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Sep 3, 2009
* Time: 5:04:40 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Base class for analyses of pooled variant calls. Stores the pool size alongside
 * the usual name / output-stream / walker bookkeeping, and provides empty defaults
 * for the optional {@link VariantAnalysis} hooks so that concrete subclasses only
 * need to implement {@link #update}.
 */
public abstract class BasicPoolVariantAnalysis implements VariantAnalysis {
    protected int numIndividualsInPool;
    protected String name;
    protected String filenames;
    protected PrintStream out, callOut;
    private VariantEvalWalker master;

    public BasicPoolVariantAnalysis(String name, int nIndividuals) {
        this.name = name;
        this.numIndividualsInPool = nIndividuals;
    }

    /** Name under which this analysis reports its results. */
    public String getName() {
        return name;
    }

    /** Number of individuals contributing to the pool. */
    public int getNumberOfIndividualsInPool() {
        return numIndividualsInPool;
    }

    /** Diploid allele count for the pool: two alleles per individual. */
    public int getNumberOfAllelesInPool() {
        return getNumberOfIndividualsInPool() * 2;
    }

    public PrintStream getSummaryPrintStream() {
        return out;
    }

    public PrintStream getCallPrintStream() {
        return callOut;
    }

    /** The walker driving this analysis. */
    public VariantEvalWalker getMaster() {
        return master;
    }

    /** No parameters by default. */
    public List<String> getParams() {
        return new ArrayList<String>();
    }

    /** No summary lines by default. */
    public List<String> done() {
        return new ArrayList<String>();
    }

    /** Records the owning walker, output streams, and backing filenames. */
    public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filenames) {
        this.master = master;
        this.out = out;
        this.callOut = callOut;
        this.filenames = filenames;
    }

    /** End-of-traversal hook; nothing to finalize by default. */
    public void finalize(long nSites) {
        // intentionally empty -- no need to finalize data in general
    }

    /** Processes one site; returns an "interesting site" string or null. */
    public abstract String update(Variation variant, RefMetaDataTracker tracker, char ref, AlignmentContext context);
}

View File

@ -1,69 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Convenience base class for variant analyses: stores the analysis name, the
 * owning walker, and the output streams handed over during initialization, and
 * supplies empty defaults for the optional {@link VariantAnalysis} hooks. Concrete
 * subclasses only need to implement {@link #update}.
 */
public abstract class BasicVariantAnalysis implements VariantAnalysis {
    protected String name;
    protected PrintStream out, callOut;
    private VariantEvalWalker master;
    protected String filename;

    public BasicVariantAnalysis(String name) {
        this.name = name;
    }

    /** Name under which this analysis reports its results. */
    public String getName() {
        return this.name;
    }

    /** The walker driving this analysis. */
    public VariantEvalWalker getMaster() {
        return this.master;
    }

    /** Records the owning walker, output streams, and backing filename. */
    public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename) {
        this.master = master;
        this.out = out;
        this.callOut = callOut;
        this.filename = filename;
    }

    public PrintStream getSummaryPrintStream() {
        return this.out;
    }

    public PrintStream getCallPrintStream() {
        return this.callOut;
    }

    /** No parameters by default. */
    public List<String> getParams() {
        return new ArrayList<String>();
    }

    /** No summary lines by default. */
    public List<String> done() {
        return new ArrayList<String>();
    }

    /**
     * End-of-traversal hook; nothing to finalize by default.
     * @param nSites number of sites traversed
     */
    public void finalize(long nSites) {
        // intentionally empty -- most analyses need no finalization
    }

    /** Processes one site; returns an "interesting site" string or null. */
    public abstract String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context);
}

View File

@ -1,103 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Tallies, per confidence threshold, how many sites are "discoverable" (the variant
 * call itself meets the threshold) and "genotypable" (at least one individual
 * genotype at the site meets the threshold), then reports percentages against the
 * total number of mapped sites.
 */
public class CallableBasesAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis {
    //long all_bases = 0;
    // total number of sites with a genotype-backed evaluation record
    long all_calls = 0;
    // confidence thresholds (in -log10(P(error)) units) evaluated in parallel
    final static double[] thresholds = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100};
    // per-threshold counters, indexed in parallel with `thresholds`
    long[] discoverable_bases = new long[thresholds.length];
    long[] genotypable_bases = new long[thresholds.length];

    public CallableBasesAnalysis() {
        super("callable_bases");
    }

    // denominator for the percentages: all mapped sites seen by the walker
    public long nSites() {
        return super.getMaster().getNMappedSites();
    }

    public long nCalls() {
        return all_calls;
    }

    public long nDiscoverable(int index) {
        return discoverable_bases[index];
    }

    // NOTE(review): yields NaN/Infinity if nSites() is 0 -- presumably never happens in practice
    public double percentDiscoverable(int index) {
        return (100.0 * nDiscoverable(index)) / nSites();
    }

    public long nGenotypable(int index) {
        return genotypable_bases[index];
    }

    public double percentGenotypable(int index) {
        return (100.0 * nGenotypable(index)) / nSites();
    }

    /**
     * Updates the per-threshold counters for a single site. Always returns null
     * (this analysis produces no per-site "interesting" strings).
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        //all_bases++;
        if (eval == null) // no data here!
            return null;
        // we actually have a record here
        if (!(eval instanceof VariantBackedByGenotype)) { // evaluation record isn't a genotype, die!
            return null;
            //throw new RuntimeException("Evaluation track isn't backed by a Genotype!");
        }
        all_calls++;
        // For every threshold, updated discoverable and callable
        for (int i = 0; i < thresholds.length; i++) {
            double threshold = thresholds[i];
            // update discoverable
            if ( eval.getNegLog10PError() >= threshold )
                discoverable_bases[i]++;
            // genotypable: count the site at most once per threshold -- the break fires
            // as soon as any single genotype clears the threshold
            List<Genotype> genotypes = ((VariantBackedByGenotype)eval).getGenotypes();
            for ( Genotype g : genotypes ) {
                if ( g.getNegLog10PError() >= threshold ) {
                    genotypable_bases[i]++;
                    break;
                }
            }
        }
        return null;
    }

    /** Emits the summary table: totals, then one row per confidence threshold. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("total_no_sites %d", nSites()));
        s.add(String.format("total_no_calls %d", nCalls()));
        s.add(String.format(""));
        s.add(String.format("confidence_threshold\tdiscoverable_bases\tdiscoverable_bases_percent\tgenotypable_bases\tgenotypable_bases_percent"));
        for (int i = 0; i < thresholds.length; i++) {
            double threshold = thresholds[i];
            s.add(String.format("%6.2f\t%d\t%.6f\t%d\t%.6f", threshold, nDiscoverable(i), percentDiscoverable(i), nGenotypable(i), percentGenotypable(i)));
        }
        return s;
    }
}

View File

@ -1,155 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Base class for concordance analyses comparing evaluated calls against "chip"
 * truth rods (e.g. genotyping arrays). Subclasses decide how truth tables map to
 * samples ({@link #createTruthTableMappings}), which evaluation data is acceptable
 * ({@link #assertVariationIsValid}), and how evaluated Genotypes are associated
 * with sample names ({@link #makeGenotypeHash}).
 */
public abstract class ChipConcordance extends BasicVariantAnalysis {
    // the array of truth tables used to store data in this analysis
    private ConcordanceTruthTable[] truthTables;
    // mapping from sample name to index into truthTables for the corresponding table
    private Map<String, Integer> sampleToArrayIndex;
    // list of all chip rods used
    private ArrayList<String> rodNames = new ArrayList<String>();

    /**
     * @param name           analysis name passed up to BasicVariantAnalysis
     * @param chipName       a chip rod name, or (if chipNameIsFile) a path to a file
     *                       listing one sample name per line
     * @param chipNameIsFile whether chipName is a file of sample names
     */
    public ChipConcordance(final String name, final String chipName, boolean chipNameIsFile) {
        super(name);
        // read sample names from file if appropriate, otherwise use the chip name
        if ( chipNameIsFile ) {
            List<String> sampleNames = readSampleNamesFromFile(chipName);
            rodNames.addAll(sampleNames);
        } else {
            rodNames.add(chipName);
        }
        // initialize the truth table storage data
        sampleToArrayIndex = new HashMap<String, Integer>();
        truthTables = createTruthTableMappings(rodNames, sampleToArrayIndex);
    }

    // reads one sample name per line; wraps I/O failures in StingException
    // NOTE(review): the reader is never closed -- leaks a file handle on every path
    private List<String> readSampleNamesFromFile(String file) {
        ArrayList<String> samples = new ArrayList<String>();
        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            String line = reader.readLine();
            while ( line != null ) {
                samples.add(line);
                line = reader.readLine();
            }
        } catch( FileNotFoundException e) {
            throw new StingException("Chip file at "+file+" was not found. Please check filepath.");
        } catch( IOException e) {
            throw new StingException(e.getMessage());
        }
        return samples;
    }

    // these methods need to be implemented by subclasses
    protected abstract ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex);
    protected abstract void assertVariationIsValid(Variation eval);
    protected abstract HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames);

    /**
     * Collects the chip genotypes present at this locus and, if there is either
     * truth data or a call, updates the truth tables via {@link #inc}.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        // get all of the chip rods at this locus
        HashMap<String, Genotype> chips = new HashMap<String, Genotype>();
        for ( String name : rodNames ) {
            Variation chip = tracker.lookup(name,Variation.class);
            if ( chip != null ) {
                // chips must be Genotypes
                if ( !(chip instanceof VariantBackedByGenotype) )
                    throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
                chips.put(name, ((VariantBackedByGenotype)chip).getCalledGenotype());
            }
        }
        // evaluate only if we have truth data or a call
        return ( eval != null || chips.size() > 0 ) ? inc(chips, eval, ref) : null;
    }

    /**
     * Pairs up chip and eval genotypes per sample and feeds them to the matching
     * truth tables. Returns the concatenation of any per-site strings the tables
     * produce, or null if there were none.
     */
    public String inc(Map<String, Genotype> chips, Variation eval, char ref) {
        // TODO -- needed to add this for now while we're moving over to VE2
        // (note: this also filters out eval == null, since instanceof is false for null)
        if ( !(eval instanceof VariantBackedByGenotype) )
            return null;
        // each implementing class can decide whether the Variation is valid
        assertVariationIsValid(eval);
        // This shouldn't happen, but let's check anyways to be safe
        if (BaseUtils.simpleBaseToBaseIndex(ref) == -1)
            return null;
        // create a hash of samples to their Genotypes
        List<Genotype> evals = (eval != null ? ((VariantBackedByGenotype)eval).getGenotypes() : new ArrayList<Genotype>());
        HashMap<String, Genotype> evalHash = makeGenotypeHash(evals, rodNames);
        // make chip/eval Genotype pairs
        // (unchecked generic array creation; confined to this method)
        List<Pair<Genotype, Genotype>>[] chipEvals = new List[truthTables.length];
        for ( String name : rodNames ) {
            Genotype chipG = chips.get(name);
            Genotype evalG = evalHash.get(name);
            if (chipG == null && evalG == null)
                continue;
            int index = sampleToArrayIndex.get(name);
            if ( chipEvals[index] == null )
                chipEvals[index] = new ArrayList<Pair<Genotype, Genotype>>();
            chipEvals[index].add(new Pair<Genotype, Genotype>(chipG, evalG));
        }
        // now we can finally update our truth tables with the truth vs. calls data
        StringBuilder s = new StringBuilder();
        for (int i = 0; i < truthTables.length; i++) {
            if ( chipEvals[i] != null ) {
                ConcordanceTruthTable table = truthTables[i];
                String x = table.addEntry(chipEvals[i], eval, ref);
                if ( x != null ) s.append(x);
            }
        }
        String toReturn = s.toString();
        return toReturn.equals("") ? null : toReturn;
    }

    /** Delegates all summary printing to the individual truth tables. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        // let the truth tables do all the printing
        for ( ConcordanceTruthTable table : truthTables ) {
            table.addAllStats(s);
            s.add("");
        }
        return s;
    }
}

View File

@ -1,98 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Counts SNPs that fall within various distance thresholds of the previous SNP on
 * the same contig (and, when interval data is present, within the same interval),
 * flagging tightly clustered SNPs as potentially suspicious.
 */
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    // SNPs at or below this distance from their predecessor are reported as violations
    int minDistanceForFlagging = 5;
    /**
     * Threshold distances between neighboring SNPs we will track and assess
     */
    int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
    // per-threshold counts of SNPs whose predecessor is within that distance
    int[] variantsWithClusters = new int[neighborWiseBoundries.length];
    /**
     * Keep track of the last variation we saw in the stream
     */
    Variation lastVariation = null;
    /**
     * The interval that the last variation occurred in. Don't think that SNPs from different intervals are
     * too close together in hybrid selection.
     */
    GenomeLoc lastVariantInterval = null;
    // number of SNP pairs whose distance was actually measured
    int nSeen = 0;

    public ClusterCounterAnalysis() {
        super("cluster_counter_analysis");
    }

    /**
     * Measures the distance from this SNP to the previous one and updates the
     * per-threshold cluster counters. Returns a violation string for SNPs within
     * minDistanceForFlagging bp of their predecessor (when the master walker has
     * violations enabled), otherwise null. Null and non-SNP evals are ignored
     * entirely -- they do not reset lastVariation/lastVariantInterval.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        String r = null;
        if ( eval != null && eval.isSNP() ) {
            IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class);
            GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
            if (lastVariation != null) {
                GenomeLoc eL = eval.getLocation();
                GenomeLoc lvL = lastVariation.getLocation();
                // if we are on the same contig, and we in the same interval
                if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) {
                    long d = eL.distance(lvL);
                    nSeen++;
                    // build a per-threshold marker string: the threshold value when the
                    // pair is within it, 0 otherwise
                    StringBuilder s = new StringBuilder();
                    for (int i = 0; i < neighborWiseBoundries.length; i++) {
                        int maxDist = neighborWiseBoundries[i];
                        s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
                        if ( d <= maxDist ) {
                            variantsWithClusters[i]++;
                        }
                    }
                    // lookup in master for performance reasons
                    if ( d <= minDistanceForFlagging && getMaster().includeViolations() )
                        r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
                }
            }
            // eval is now the last variation
            lastVariation = eval;
            lastVariantInterval = interval;
        }
        return r;
    }

    /** Emits the per-threshold cluster counts. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen));
        s.add(String.format("description maxDist count"));
        for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
            int maxDist = neighborWiseBoundries[i];
            int count = variantsWithClusters[i];
            s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
        }
        return s;
    }
}

View File

@ -1,21 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
/**
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This software and its documentation are copyright 2009 by the
 * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. Neither
 * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
 *
 * Marker interface: if an analysis implements this interface, it asserts that it performs a
 * genotype based analysis, as opposed to a straight variant analysis. The difference here is
 * that variants are not asserted to be the actual genotype of a particular person, but are
 * really just variation "out-there" in a population. A genotype analysis would be something
 * like covered bases, confidently called bases, genotyping concordance, etc.
 *
 */
public interface GenotypeAnalysis {
}

View File

@ -1,63 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Per-sample genotype concordance against chip truth data: each chip rod gets its
 * own truth table, keyed by sample name.
 */
public class GenotypeConcordance extends ChipConcordance implements GenotypeAnalysis {
    // sample name supplied directly on the command line; stays null when the
    // sample names came from a file instead
    private String providedChipName = null;

    public GenotypeConcordance(final String chipName, boolean chipNameIsFile) {
        super("genotype_concordance", chipName, chipNameIsFile);
        if ( !chipNameIsFile )
            providedChipName = chipName;
    }

    /** Genotype concordance requires genotype-backed evaluation data. */
    protected void assertVariationIsValid(Variation eval) {
        if ( eval != null && !(eval instanceof VariantBackedByGenotype) )
            throw new StingException("Failure: trying to analyze genotypes of non-genotype data");
    }

    /** One truth table per sample, with a parallel name-to-index mapping. */
    protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
        ConcordanceTruthTable[] tables = new ConcordanceTruthTable[rodNames.size()];
        int idx = 0;
        for ( String sample : rodNames ) {
            tables[idx] = new ConcordanceTruthTable(sample);
            sampleToArrayIndex.put(sample, idx);
            idx++;
        }
        return tables;
    }

    /** Associates each evaluated Genotype with the sample it belongs to. */
    protected HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames) {
        HashMap<String, Genotype> hash = new HashMap<String, Genotype>();
        for ( Genotype eval : evals ) {
            if ( providedChipName != null ) {
                // a single explicitly-named chip: everything maps to it
                hash.put(providedChipName, eval);
            }
            // TODO -- fix me in VE2
            //else if ( eval instanceof SampleBacked )
            //    hash.put(((SampleBacked)eval).getSampleName(), eval);
            else if ( rodNames.size() == 1 ) {
                hash.put(rodNames.get(0), eval);
            } else {
                throw new StingException("Genotype data has no associated samples but are multi-sample...?");
            }
        }
        return hash;
    }
}

View File

@ -1,101 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import cern.jet.math.Arithmetic;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Flags sites whose genotype counts violate Hardy-Weinberg equilibrium, using a
 * tailed exact test (see Emigh 1980 for the underlying point probability). A site
 * is reported when its HWE p-value falls below the configured threshold.
 */
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis implements PopulationAnalysis {
    // p-value below which a site is reported as an HWE violation
    private double threshold;
    int nSites = 0;       // number of genotype-backed SNP calls examined
    int nViolations = 0;  // number of those falling below the p-value threshold

    HardyWeinbergEquilibrium(double threshold) {
        super("hwe");
        this.threshold = threshold;
    }

    /**
     * Tests one site for HWE. Returns a violation string when p &lt; threshold,
     * otherwise null. Sites without genotype-backed SNP calls are ignored.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        String r = null;
        if ( eval != null &&
            eval instanceof SNPCallFromGenotypes ) {
            nSites++;
            SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
            // use the tailed test; the point probability alone (pointHWcalculate) is
            // overly strict for flagging violations
            double p = tailedHWcalculate(call);
            if ( p < threshold ) {
                r = String.format("HWE-violation %f < %f at %s", p, threshold, eval);
                nViolations++;
            }
        }
        return r;
    }

    /** Tailed HWE exact-test p-value from the observed genotype counts. */
    public double tailedHWcalculate(SNPCallFromGenotypes call) {
        int obsAA = call.nHomRefGenotypes();
        int obsAB = call.nHetGenotypes();
        int obsBB = call.nHomVarGenotypes();
        return HardyWeinbergCalculation.hwCalculate(obsAA, obsAB, obsBB);
    }

    /**
     * Exact point probability of the observed het count given the allele counts.
     *
     * From Emigh 1980:
     *   P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa
     * where nAA, nAa, naa are the observed numbers of the three genotypes AA, Aa,
     * and aa, nA = 2*nAA + nAa is the number of A alleles, and n is the number of
     * individuals.
     */
    public double pointHWcalculate(SNPCallFromGenotypes call) {
        int nAA = call.nHomRefGenotypes();
        int nAa = call.nHetGenotypes();
        int naa = call.nHomVarGenotypes();
        int nA = 2 * nAA + nAa;
        int n = nAA + nAa + naa;
        int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector
        double m = MathUtils.multinomial(mXs);
        double b = Arithmetic.binomial(2 * n, nA);
        double tosses = Math.pow(2, nAa);
        // (an unreachable `if (false)` debug printout was removed here)
        return (m / b) * tosses;
    }

    /** Emits call/violation totals and the violation rate. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("n_calls %d", nSites));
        s.add(String.format("n_violations %d", nViolations));
        // guard against a 0/0 NaN when no eligible sites were seen
        double rate = nSites == 0 ? 0.0 : (100.0 * nViolations) / nSites;
        s.add(String.format("violations_rate %.2f", rate));
        return s;
    }
}

View File

@ -1,58 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * A fixed-width binned container mapping doubles in [minValue, maxValue) to nBins
 * equally sized bins, each holding a value of type T. Bins are plain value holders:
 * this class does not count occurrences itself -- callers read and write bin
 * contents via setBin/getBin.
 */
public class Histogram<T> {
    ArrayList<T> data;          // per-bin values; data.get(i) is the value of bin i
    int nBins = 100;            // number of bins (overwritten by the constructor)
    double minValue, maxValue;  // binned range; x == maxValue maps past the last bin

    /**
     * @param nBins        number of equal-width bins
     * @param minValue     inclusive lower bound of the binned range
     * @param maxValue     exclusive upper bound of the binned range
     * @param initialValue value every bin starts with
     */
    public Histogram(int nBins, double minValue, double maxValue, T initialValue) {
        data = new ArrayList<T>(nBins);
        this.nBins = nBins;
        this.minValue = minValue;
        this.maxValue = maxValue;
        initialize(initialValue);
    }

    /** Resets every bin to initialValue. */
    public void initialize(T initialValue) {
        data.clear();
        for ( int i = 0; i < nBins; i++ ) {
            data.add(i, initialValue);
        }
    }

    /** Sets bin i to value. */
    public void setBin(int i, T value) {
        data.set(i, value);
    }

    /** Sets the bin containing x to value. */
    public void setBin(double x, T value) {
        setBin(x2bin(x), value);
    }

    /** Returns the value stored in bin binI. */
    public T getBin(int binI) {
        return data.get(binI);
    }

    /** Returns the value stored in the bin containing x. */
    public T getBin(double x) {
        return getBin(x2bin(x));
    }

    /**
     * Maps a value to its bin index.
     *
     * BUG FIX: the previous formula divided by ((maxValue - minValue) * nBins),
     * which collapsed almost the entire range into bin 0 and was not the inverse
     * of bin2x. The bin index is the fractional position of x within the range,
     * scaled by the number of bins. An x outside [minValue, maxValue) still yields
     * an out-of-range index, as before -- callers must respect the range.
     */
    public int x2bin(double x) {
        return (int)Math.floor((x - minValue) / (maxValue - minValue) * nBins);
    }

    /** Center x-value of the given bin (inverse of x2bin, up to bin width). */
    public double bin2x(int bin) {
        return (maxValue - minValue) * ((bin + 0.5) / nBins) + minValue;
    }
}

View File

@ -1,64 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Collects counts and a size distribution (by allele length, up to 99 bp) for
 * insertions and deletions.
 */
public class IndelMetricsAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    long insertions = 0;
    long deletions = 0;
    int maxSize = 0;                    // largest allele length observed (always < 100)
    long[][] sizes = new long[2][100];  // [0] = deletions, [1] = insertions; indexed by allele length

    public IndelMetricsAnalysis() {
        super("indel_metrics");
        for (int i = 0; i < 100; i++)
            sizes[0][i] = sizes[1][i] = 0;
    }

    /**
     * Tallies one site if it is an indel. Always returns null (this analysis
     * produces no per-site output).
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        // BUG FIX: this previously gated on eval.isInsertion() only, so deletions were
        // never counted even though the body below handles them; accept either type.
        if (eval != null && (eval.isInsertion() || eval.isDeletion())) {
            if (eval.isInsertion())
                insertions++;
            else if (eval.isDeletion())
                deletions++;
            else
                throw new RuntimeException("Variation is indel, but isn't insertion or deletion!");
            // update the size histogram; allele lengths >= 100 are ignored
            for (String allele : eval.getAlleleList())
                if (allele.length() < 100) {
                    sizes[eval.isDeletion() ? 0 : 1][allele.length()]++;
                    if (allele.length() > maxSize)
                        maxSize = allele.length();
                }
        }
        return null;
    }

    /** Emits totals followed by the per-length size distribution table. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("total_deletions %d", deletions));
        s.add(String.format("total_insertions %d", insertions));
        s.add(String.format(""));
        s.add("Size Distribution");
        s.add("size\tdeletions\tinsertions");
        for (int i = 1; i <= maxSize; i++)
            s.add(String.format("%d\t%d\t\t%d", i, sizes[0][i], sizes[1][i]));
        return s;
    }
}

View File

@ -1,95 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Records the distance from each SNP to the previous SNP on the same contig (and
 * interval, when interval data is present), then reports a cumulative histogram of
 * those distances at the end of the traversal.
 */
public class NeighborDistanceAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    // every measured neighbor distance, accumulated for the final histogram
    ArrayList<Long> neighborWiseDistances;
    // SNPs at or below this distance from their predecessor get a per-site report
    int minDistanceForFlagging = 5;
    // histogram bucket upper bounds, in bp
    int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
    // last SNP seen and the interval it occurred in; distances are only measured
    // between SNPs in the same interval
    Variation lastVariation = null;
    GenomeLoc lastVariantInterval = null;
    // NOTE(review): never assigned or read -- candidate for removal
    PrintStream violationsOut = null;

    public NeighborDistanceAnalysis() {
        super("neighbor_distances");
        neighborWiseDistances = new ArrayList<Long>();
    }

    /**
     * Measures the distance from this SNP to the previous one on the same contig
     * and interval. Returns a per-site string when the distance is within
     * minDistanceForFlagging, otherwise null. Null and non-SNP evals are ignored
     * entirely -- they do not reset lastVariation/lastVariantInterval.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        String r = null;
        if ( eval != null && eval.isSNP() ) {
            IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class);
            GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
            if (lastVariation != null) {
                GenomeLoc eL = eval.getLocation();
                GenomeLoc lvL = lastVariation.getLocation();
                if (eL.getContigIndex() == lvL.getContigIndex()) {
                    long d = eL.distance(lvL);
                    if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
                        // we're on different intervals
                        //out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
                    } else {
                        neighborWiseDistances.add(d);
                        r = d <= minDistanceForFlagging ? String.format("neighbor-distance %d %s %s", d, eL, lvL) : null;
                    }
                }
            }
            lastVariation = eval;
            lastVariantInterval = interval;
        }
        return r;
    }

    /**
     * Buckets the recorded distances (each distance lands in the first bucket that
     * can hold it) and emits counts plus a running cumulative total.
     */
    public List<String> done() {
        int[] pairCounts = new int[neighborWiseBoundries.length];
        Arrays.fill(pairCounts, 0);
        for ( long dist : neighborWiseDistances ) {
            boolean done = false;
            for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) {
                int maxDist = neighborWiseBoundries[i];
                if ( dist <= maxDist ) {
                    pairCounts[i]++;
                    done = true;
                }
            }
        }
        List<String> s = new ArrayList<String>();
        s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size()));
        int cumulative = 0;
        s.add(String.format("description maxDist count cumulative"));
        for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
            int maxDist = neighborWiseBoundries[i];
            int count = pairCounts[i];
            cumulative += count;
            s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative));
        }
        return s;
    }
}

View File

@ -1,12 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
/**
 * Marker interface for analyses that operate on pooled variant calls; a pool
 * analysis is treated as both a population-level and a genotype-based analysis.
 */
public interface PoolAnalysis extends PopulationAnalysis, GenotypeAnalysis {
}

View File

@ -1,54 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.*;
import java.util.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Pool-wide concordance against chip truth data: all chip samples share a single
 * truth table, and every sample is associated with the one pooled eval genotype.
 */
public class PooledConcordance extends ChipConcordance implements PoolAnalysis {

    public PooledConcordance(final String chipName, boolean chipNameIsFile) {
        super("pooled_concordance", chipName, chipNameIsFile);
    }

    /** Pooled analysis currently accepts VCF-backed evaluation data only. */
    protected void assertVariationIsValid(Variation eval) {
        if ( eval != null && !(eval instanceof RodVCF) )
            throw new StingException("Failure: we currently only support analyzing pooled data in VCF format");
    }

    /** A single pool-wide truth table; every sample points at index 0. */
    protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
        ConcordanceTruthTable[] tables = { new ConcordanceTruthTable(rodNames.size()) };
        for ( String sample : rodNames ) {
            sampleToArrayIndex.put(sample, 0);
        }
        return tables;
    }

    /** The eval Genotype is pool-wide, so it is associated with every sample. */
    protected HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames) {
        HashMap<String, Genotype> hash = new HashMap<String, Genotype>();
        if ( evals.size() > 0 ) {
            Genotype pooled = evals.get(0);
            for ( String sample : rodNames ) {
                hash.put(sample, pooled);
            }
        }
        return hash;
    }
}

View File

@ -1,84 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import java.util.List;
import java.util.ArrayList;
import java.io.PrintStream;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Oct 20, 2009
* Time: 4:03:23 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Maintains parallel arrays of basic variant analyses (dbSNP coverage, variant
 * counts, ti/tv) broken down by estimated allele frequency, where the bin index
 * is the rounded number of non-reference alleles in the pool
 * (0 .. 2 * poolSize, inclusive).
 */
public class PooledFrequencyAnalysis extends BasicPoolVariantAnalysis implements PoolAnalysis, GenotypeAnalysis {

    // one sub-analysis per possible allele count in the pool;
    // NOTE(review): these stay null when poolSize <= 0, so update()/done() would
    // NPE in that case -- callers are assumed to pass a positive pool size; confirm.
    private VariantDBCoverage[] coverageAnalysisByFrequency;
    private VariantCounter[] variantCounterByFrequency;
    private TransitionTranversionAnalysis[] transitionTransversionByFrequency;

    public PooledFrequencyAnalysis(int poolSize, String knownDBSNPName ) {
        super("Pooled_Frequency_Analysis",poolSize);
        if ( poolSize > 0 ) {
            coverageAnalysisByFrequency = new VariantDBCoverage[getNumberOfAllelesInPool()+1];
            variantCounterByFrequency = new VariantCounter[getNumberOfAllelesInPool()+1];
            transitionTransversionByFrequency = new TransitionTranversionAnalysis[getNumberOfAllelesInPool()+1];
            for ( int j = 0; j < getNumberOfAllelesInPool()+1; j ++ ) {
                coverageAnalysisByFrequency[j] = new VariantDBCoverage(knownDBSNPName);
                variantCounterByFrequency[j] = new VariantCounter();
                transitionTransversionByFrequency[j] = new TransitionTranversionAnalysis();
            }
        }
    }

    // NOTE: the previous redundant initialize() override, which only forwarded to
    // super.initialize(), was removed; the inherited implementation is identical.

    /**
     * Updates each per-frequency sub-analysis for this site and returns the first
     * "interesting" string produced, tagged with the frequency bin, or null.
     */
    public String update( Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context ) {
        int f = calculateFrequencyFromVariation(eval);
        // NOTE(review): f can exceed the array bounds if the rod ever reports a
        // non-ref frequency > 1.0 -- confirm upstream guarantees.
        String interest1 = coverageAnalysisByFrequency[f].update(eval,tracker,ref,context);
        String interest2 = variantCounterByFrequency[f].update(eval,tracker,ref,context);
        String interest3 = transitionTransversionByFrequency[f].update(eval,tracker,ref,context);
        if ( interest1 != null ) {
            return "coverageAnalysis_Frequency_"+f+"\t"+interest1;
        }
        if ( interest2 != null ) {
            return "VariantCounter_Frequency_"+f+"\t"+interest2;
        }
        if ( interest3 != null ) {
            return "transitionTransversion_Frequency_"+f+"\t"+interest3;
        }
        return null;
    }

    /** Rounds the rod's non-ref allele frequency to an allele count in the pool. */
    public int calculateFrequencyFromVariation(Variation eval) {
        double freq = eval.getNonRefAlleleFrequency();
        return (int) Math.round(getNumberOfAllelesInPool()*freq);
    }

    /** Emits the summary lines of every sub-analysis, grouped by frequency bin. */
    @Override
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        for ( int j = 0; j < getNumberOfAllelesInPool()+1; j ++ ) {
            s.add("Pool_Frequency_Analysis_Frequency:\t"+j);
            s.addAll(coverageAnalysisByFrequency[j].done());
            s.addAll(variantCounterByFrequency[j].done());
            s.addAll(transitionTransversionByFrequency[j].done());
        }
        return s;
    }
}

View File

@ -1,21 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
 * If an analysis implements this interface, it asserts that it performs a population-level
 * variant analysis, as opposed to a genotype-based analysis. The difference here is that
 * variants are not asserted to be the actual genotype of a particular person, but are really
 * just variation "out there" in a population. (A genotype-based analysis, by contrast, would
 * be something like covered bases, confidently called bases, or genotyping concordance.)
 *
*/
// Marker interface: analyses tagged with this are kept by VariantEvalWalker when the
// eval track is population-level variation (i.e. evalContainsGenotypes is false).
public interface PopulationAnalysis {
}

View File

@ -1,47 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Counts transitions and transversions among evaluated SNPs and reports the
 * transition/transversion (ti/tv) ratio in its summary.
 */
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    long nTransitions = 0, nTransversions = 0;

    public TransitionTranversionAnalysis() {
        super("transitions_transversions");
    }

    /**
     * Classifies the ref -> alt substitution of each SNP; any non-transition
     * substitution is counted as a transversion. Never flags a site (returns null).
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        if (eval != null && eval.isSNP()) {
            char altBase = eval.getAlternativeBaseForSNP();
            BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType((byte)ref, (byte)altBase);
            if (subType == BaseUtils.BaseSubstitutionType.TRANSITION) {
                nTransitions++;
            } else {
                nTransversions++;
            }
        }
        return null;
    }

    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("transitions %d", nTransitions));
        s.add(String.format("transversions %d", nTransversions));
        // denominator clamped to 1 (same Math.max(n, 1) convention as VariantCounter);
        // previously this printed "Infinity" when no transversions were observed
        s.add(String.format("ratio %.2f", nTransitions / (1.0 * Math.max(nTransversions, 1))));
        return s;
    }
}

View File

@ -1,60 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: andrewk
* Date: Sep 30, 2009
* Time: 5:28:57 PM
* To change this template use File | Settings | File Templates.
*/
// @Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=VariationRod.class), @RMD(name=rodDbSNP.STANDARD_DBSNP_TRACK_NAME,type= rodDbSNP.class),@RMD(name="hapmap-chip",type=RodGenotypeChipAsGFF.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
/**
 * Compares eval calls against the "validation" ROD track and reports how often
 * calls coincide with validation data.
 */
public class ValidationDataAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    // NOTE(review): the counter names look inconsistent with how update() increments them:
    // calls_at_sites_validated_true is bumped for every call (eval != null) regardless of
    // validation data, while calls_at_validated_sites is bumped only when validation data
    // is present. The disabled debug prints below suggest the two may be swapped -- confirm
    // intent before reusing these numbers.
    private int calls_at_validated_sites = 0;
    private int calls_at_sites_validated_true = 0;
    private int validated_sites = 0;

    public ValidationDataAnalysis() {
        super("validation_data_analysis");
    }

    /** Updates the counters for one site; never flags a site (returns null). */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        // counts every visited site, not only sites with validation data -- see NOTE above
        validated_sites++;
        // first "validation" record at this site, if any
        List<Object> objects = tracker.getReferenceMetaData("validation");
        Object val_data = (objects.size() > 0) ? objects.get(0) : null;
        if (eval != null) {
            calls_at_sites_validated_true++;
            //out.format("Has validaiton data: %s\n", val_data.getLocation());
            if (val_data != null) {
                //out.format("Validated true: %s\n", val_data.getLocation());
                calls_at_validated_sites++;
            }
        }
        //out.println(context.getLocation());
        return null;
    }

    /** Summary lines; substantive output only if any call overlapped validation data. */
    public List<String> done() {
        List<String> s = new ArrayList<String>();
        if (calls_at_validated_sites > 0) { // only output info if there were any validation sites encountered
            s.add(String.format("validated sites %d", validated_sites));
            s.add(String.format("calls at validated sites %d", calls_at_validated_sites));
            s.add(String.format("calls at sites validated true %d", calls_at_sites_validated_true));
            s.add(String.format("%% validated true %f", (float) calls_at_validated_sites / calls_at_sites_validated_true));
        }else{
            s.add("No validation data encountered");
        }
        return s;
    }
}

View File

@ -1,29 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.PrintStream;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Contract for a single VariantEval analysis module. The walker initializes each
 * module with its output streams, calls update() once per site, calls finalize()
 * with the total number of mapped sites, and finally collects summary lines via done().
 */
public interface VariantAnalysis {
    /** Short name used to label this analysis in output headers and filenames. */
    public String getName();
    /** Stream that receives the summary lines produced by done(). */
    public PrintStream getSummaryPrintStream();
    /** Stream that receives per-site "interesting" lines returned by update(). */
    public PrintStream getCallPrintStream();
    /** Parameter strings folded into the analysis output filename. */
    public List<String> getParams();
    /** Wires this analysis to its owning walker and output destinations. */
    public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename);
    /** Processes one site; a non-null return flags the site as interesting. */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context);
    /** Called once after traversal with the total number of mapped sites. */
    public void finalize(long nSites);
    /** Summary lines for the final report. */
    public List<String> done();
}

View File

@ -1,81 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
 * Counts confident variant sites (and, for genotype-bearing eval tracks, single-sample
 * hets) and reports per-base variant and heterozygosity rates in its summary.
 */
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    long nBasesCovered = 0;
    int nSNPs = 0;
    int nHets = 0;

    public VariantCounter() {
        super("variant_counts");
    }

    /** Tallies this site into the counters; never flags a site (returns null). */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        if ( eval != null && ! eval.isReference() )
            nSNPs++;
        // TODO -- break the het check out to a different module used only for single samples
        if ( isSingleSampleHet(eval) )
            nHets++;
        return null;
    }

    // true when genotypes are enabled and eval is a biallelic SNP backed by exactly one het genotype
    private boolean isSingleSampleHet(Variation eval) {
        if ( ! this.getMaster().evalContainsGenotypes )
            return false;
        if ( eval == null || ! eval.isBiallelic() || ! eval.isSNP() || ! (eval instanceof VariantBackedByGenotype) )
            return false;
        List<Genotype> genotypes = ((VariantBackedByGenotype)eval).getGenotypes();
        return genotypes.size() == 1 && genotypes.get(0).isHet();
    }

    /**
     * Records the total covered-base count once traversal is complete.
     * @param nSites total mapped sites seen by the walker
     */
    public void finalize(long nSites) {
        nBasesCovered = nSites;
    }

    // events per covered base; denominator clamped to 1 to avoid division by zero
    private double variantRate(int n) {
        return n / (1.0 * Math.max(nBasesCovered, 1));
    }

    // covered bases per event; denominator clamped to 1 to avoid division by zero
    private long variantRateInverse(int n) {
        return nBasesCovered / Math.max(n, 1);
    }

    public List<String> done() {
        final List<String> lines = new ArrayList<String>();
        lines.add(String.format("n bases covered %d", nBasesCovered));
        lines.add(String.format("variants %d", nSNPs));
        lines.add(String.format("variant rate %.5f confident variants per base", variantRate(nSNPs)));
        lines.add(String.format("variant rate 1 / %d confident variants per base", variantRateInverse(nSNPs)));
        if ( this.getMaster().evalContainsGenotypes ) {
            lines.add(String.format("heterozygotes %d", nHets));
            lines.add(String.format("homozygotes %d", nSNPs - nHets));
            lines.add(String.format("heterozygosity %.5f confident hets per base", variantRate(nHets)));
            lines.add(String.format("heterozygosity 1 / %d confident hets per base", variantRateInverse(nHets)));
            lines.add(String.format("het to hom ratio %.2f confident hets per confident homozygote non-refs [single sample genome-wide expectation is 2:1]",
                    ((double)nHets) / (Math.max(nSNPs - nHets, 1))));
        }
        return lines;
    }
}

View File

@ -1,121 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.util.ArrayList;
import java.util.List;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
/**
 * Measures overlap and allele concordance between eval SNPs and a variant DB
 * track (e.g. dbSNP or 1kg), reporting dbSNP rate and concordance rate.
 */
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    private String dbName;          // name of the DB ROD track to compare against
    private int nDBSNPs = 0;        // real (unfiltered) DB SNP records seen
    private int nEvalObs = 0;       // eval SNPs seen
    private int nSNPsAtdbSNPs = 0;  // eval SNPs overlapping a DB SNP
    private int nConcordant = 0;    // overlapping sites whose eval allele appears in the DB record

    public VariantDBCoverage(final String name) {
        super("db_coverage");
        dbName = name;
    }

    public int nDBSNPs() { return nDBSNPs; }
    public int nEvalSites() { return nEvalObs; }
    public int nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; }
    public int nConcordant() { return nConcordant; }
    public int nNovelSites() { return Math.abs(nEvalSites() - nSNPsAtdbSNPs()); }

    /** True when both records are present and the eval allele is absent from the DB record's allele list. */
    public boolean discordantP(Variation dbSNP, Variation eval) {
        if (eval != null) {
            char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : Utils.stringToChar(eval.getReference());
            if (dbSNP != null && dbSNP.isSNP())
                return !dbSNP.getAlleleList().contains(String.valueOf(alt));
        }
        return false;
    }

    /**
     * What fraction of the evaluated site variants were also found in the db?
     * Denominator clamped to 1 so an empty eval set yields 0.0 rather than NaN.
     */
    public double dbSNPRate() {
        return nSNPsAtdbSNPs() / (1.0 * Math.max(nEvalSites(), 1));
    }

    /** Fraction of overlapping sites with a matching allele; 0.0 (not NaN) when there is no overlap. */
    public double concordanceRate() {
        return nConcordant() / (1.0 * Math.max(nSNPsAtdbSNPs(), 1));
    }

    /** Returns the first SNP record in the list that is not a filtered VCF record, or null. */
    public static Variation getFirstRealSNP(List<Object> dbsnpList) {
        if (dbsnpList == null)
            return null;
        Variation dbsnp = null;
        for (Object d : dbsnpList) {
            if (((Variation) d).isSNP() && (!(d instanceof RodVCF) || !((RodVCF) d).isFiltered())) {
                dbsnp = (Variation) d;
                break;
            }
        }
        return dbsnp;
    }

    /** Updates the counters for one site; returns a description of discordant sites, else null. */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        Variation dbSNP = getFirstRealSNP(tracker.getReferenceMetaData( dbName, false ));
        String result = null;
        if ( dbSNP != null ) nDBSNPs++; // count the number of real dbSNP events
        if ( eval != null && eval.isSNP() ) { // ignore indels right now
            nEvalObs++; // count the number of eval snps we've seen
            if (dbSNP != null) { // both eval and dbSNP have real snps
                nSNPsAtdbSNPs++;
                if (!discordantP(dbSNP, eval)) // count whether we're concordant or not with the dbSNP value
                    nConcordant++;
                else
                    result = String.format("Discordant [DBSNP %s] [EVAL %s]", dbSNP, eval);
            }
        }
        return result;
    }

    public List<String> done() {
        List<String> s = new ArrayList<String>();
        s.add(String.format("name %s", dbName));
        s.add(String.format("n_db_snps %d", nDBSNPs()));
        s.add(String.format("n_eval_sites %d", nEvalSites()));
        s.add(String.format("n_overlapping_sites %d", nSNPsAtdbSNPs()));
        s.add(String.format("n_concordant %d", nConcordant()));
        s.add(String.format("n_novel_sites %d", nNovelSites()));
        s.add(String.format("%s_rate %.2f # percent eval snps at dbsnp snps", dbName.toLowerCase(), 100 * dbSNPRate()));
        s.add(String.format("concordance_rate %.2f", 100 * concordanceRate()));
        return s;
    }
}

View File

@ -1,441 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
import java.util.regex.Pattern;
/**
* A robust and general purpose tool for characterizing the quality of SNPs, Indels, and other variants that includes basic
* counting, ti/tv, dbSNP% (if -D is provided), concordance to chip or validation data, and will show interesting sites (-V)
* that are FNs, FP, etc.
*/
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
//@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
public class VariantEvalWalker extends RodWalker<Integer, Integer> {
    @Argument(shortName="minPhredConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
    public double minConfidenceScore = -1.0;

    @Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
    public boolean printVariants = false;

    @Argument(shortName="badHWEThreshold", doc="Only sites with deviations from Hardy-Weinberg equilibrium with P-values < than this threshold are flagged", required=false)
    public double badHWEThreshold = 1e-3;

    @Argument(fullName="evalContainsGenotypes", shortName = "G", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
    public boolean evalContainsGenotypes = false;

    @Argument(fullName="explode", shortName = "E", doc="Old style formatting, with each analysis split into separate files.", required=false)
    public boolean explode = false;

    @Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
    public boolean mIncludeViolations = false;

    @Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
    public boolean extensiveSubsets = false;

    @Argument(fullName="supressDateInformation", doc="This flag indicates that we want to suppress the date information from the output, so that if can be diff'ed against previous evals.", required=false)
    public boolean supressDateInformation = false;

    @Argument(fullName="includeFilteredRecords", doc="If true, variation record with filter fields at are true will be included in the analysis", required=false)
    public boolean includeFilteredRecords = false;

    @Argument(fullName = "samplesFile", shortName="samples", doc="When running an analysis on one or more individuals with truth data, this field provides a filepath to the listing of which samples are used (and are used to name corresponding rods with -B)", required=false)
    public String samplesFile = null;

    @Argument(fullName = "sampleName", shortName="sampleName", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
    public String sampleName = null;

    @Argument(fullName = "vcfInfoSelector", shortName="vcfInfoSelector", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
    public String vcfInfoSelector = null;

    // prefix (derived from the toolkit's -o argument) used to build per-analysis
    // output filenames; null means summaries go to 'out'
    String analysisFilenameBase = null;

    // well-known ROD track names looked up by this walker
    final String knownSNPDBName = "dbSNP";
    final String One1KGSNPNames = "1kg";
    final String genotypeChipName = "hapmap-chip";

    // one list of analysis modules per subset (ALL_SNPS, KNOWN_SNPS, ...)
    HashMap<ANALYSIS_TYPE, ArrayList<VariantAnalysis>> analysisSets;

    // shared stream for per-site "interesting sites" output
    PrintStream perLocusStream = null;

    // running count of mapped sites (including skipped bases), passed to finalize()
    long nMappedSites = 0;

    // the types of analysis we support, and the string tags we associate with the enumerated value
    enum ANALYSIS_TYPE {
        // todo -- differeniate into three classes -- snps at known snp sites, snps not at known snp site but covered by known indel, and novel
        ALL_SNPS("all"),
        SINGLETON_SNPS("singletons"),
        TWOHIT_SNPS("2plus_hit"),
        KNOWN_SNPS("known"),
        SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"),
        NOVEL_SNPS("novel"),
        FILTERED_SNPS("filtered");

        private final String value;
        ANALYSIS_TYPE(String value) { this.value = value;}
        public String toString() { return value; }
    }

    //    final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
    //            ANALYSIS_TYPE.SINGLETON_SNPS,
    //            ANALYSIS_TYPE.TWOHIT_SNPS,
    //            ANALYSIS_TYPE.KNOWN_SNPS,
    //            ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
    //            ANALYSIS_TYPE.NOVEL_SNPS};
    //    final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
    //            ANALYSIS_TYPE.KNOWN_SNPS,
    //            ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
    //            ANALYSIS_TYPE.NOVEL_SNPS};

    // subsets used when -A is given and the eval track is population-level
    final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
            ANALYSIS_TYPE.SINGLETON_SNPS,
            ANALYSIS_TYPE.TWOHIT_SNPS,
            ANALYSIS_TYPE.KNOWN_SNPS,
            ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
            ANALYSIS_TYPE.NOVEL_SNPS,
            ANALYSIS_TYPE.FILTERED_SNPS};
    // subsets used when -A is given and the eval track contains genotypes (-G)
    final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
            ANALYSIS_TYPE.KNOWN_SNPS,
            ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
            ANALYSIS_TYPE.NOVEL_SNPS,
            ANALYSIS_TYPE.FILTERED_SNPS};
    // default: only the ALL_SNPS subset
    final ANALYSIS_TYPE[] SIMPLE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS};
    // chosen in initialize() based on -A and -G
    ANALYSIS_TYPE[] ALL_ANALYSIS_NAMES = null;

    /** Selects the active subsets and builds the per-subset analysis lists. */
    public void initialize() {
        ALL_ANALYSIS_NAMES = SIMPLE_ANALYSIS_NAMES;
        if (extensiveSubsets)
            ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;

        // setup the path to the analysis
        if (this.getToolkit().getArguments().outFileName != null) {
            analysisFilenameBase = this.getToolkit().getArguments().outFileName + "."; // + ".analysis.";
        }

        analysisSets = new HashMap<ANALYSIS_TYPE, ArrayList<VariantAnalysis>>();
        for (ANALYSIS_TYPE setName : ALL_ANALYSIS_NAMES) {
            analysisSets.put(setName, initializeAnalysisSet(setName));
        }

        // THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
        // hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
        // whether masked by overlapping indels/other events or not.
        //TODO process correctly all the returned dbSNP rods at each location
        //BrokenRODSimulator.attach("dbSNP");
    }

    /** Total mapped sites seen so far (including skipped bases). */
    public long getNMappedSites() {
        return nMappedSites;
    }

    // returns the analysis list for the given subset, or null if that subset is inactive
    private ArrayList<VariantAnalysis> getAnalysisSet(final ANALYSIS_TYPE name) {
        return analysisSets.containsKey(name) ? analysisSets.get(name) : null;
    }

    /**
     * Creates, filters, and wires up the analysis modules for one subset: modules
     * inappropriate for the current evaluation mode (population vs. genotype vs.
     * pooled) are removed before output streams are attached.
     */
    private ArrayList<VariantAnalysis> initializeAnalysisSet(final ANALYSIS_TYPE setName) {
        ArrayList<VariantAnalysis> analyses = new ArrayList<VariantAnalysis>();

        //
        // Add new analyses here!
        //
        analyses.add(new ValidationDataAnalysis());
        analyses.add(new VariantCounter());
        analyses.add(new VariantDBCoverage(knownSNPDBName));
        analyses.add(new VariantDBCoverage(One1KGSNPNames));
        if ( samplesFile != null ) {
            //if ( numPeopleInPool < 1 )
            //    analyses.add(new GenotypeConcordance(samplesFile, true));
            //else
            analyses.add(new PooledConcordance(samplesFile, true));
        } else {
            analyses.add(new GenotypeConcordance(genotypeChipName, false));
        }
        analyses.add(new TransitionTranversionAnalysis());
        analyses.add(new NeighborDistanceAnalysis());
        analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
        analyses.add(new ClusterCounterAnalysis());
        analyses.add(new CallableBasesAnalysis());
        analyses.add(new IndelMetricsAnalysis());

        //
        // Filter out analyses inappropriate for our evaluation type Population or Genotype
        //
        Iterator<VariantAnalysis> iter = analyses.iterator();
        while (iter.hasNext()) {
            VariantAnalysis analysis = iter.next();
            VariantAnalysis removed = null;
            if ( evalContainsGenotypes ) {
                // genotype mode: only GenotypeAnalysis modules survive
                if ( ! (analysis instanceof GenotypeAnalysis ) ) {
                    removed = analysis;
                    iter.remove();
                }
            } else if ( ! (analysis instanceof PopulationAnalysis || analysis instanceof PoolAnalysis) ) {
                // population mode: drop genotype-only modules
                removed = analysis;
                iter.remove();
            // } else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
            } else if ( analysis instanceof PoolAnalysis && samplesFile == null ) {
                // pool-only modules require a samples file
                removed = analysis;
                iter.remove();
            }

            if ( removed != null ) {
                logger.info(String.format("Disabling analysis %s in set %s", removed, setName));
            }
        }

        if (printVariants) analyses.add(new VariantMatcher(knownSNPDBName));

        for (VariantAnalysis analysis : analyses) {
            initializeAnalysisOutputStream(setName, analysis);
        }

        return analyses;
    }

    /**
     * Returns the filename of the analysis output file where output for an analysis with
     *
     * @param name
     * @param params
     *
     * @return the joined filename, or null when no output file base was set
     */
    public String getAnalysisFilename(final String name, final List<String> params) {
        if (analysisFilenameBase == null)
            return null;
        else
            return analysisFilenameBase + Utils.join(".", Utils.cons(name, params));
    }

    /**
     * Attaches output streams to one analysis: the shared "interesting_sites"
     * stream (lazily created on first call) and either the walker's 'out' stream
     * or, with --explode, a dedicated per-analysis file.
     */
    public void initializeAnalysisOutputStream(final ANALYSIS_TYPE setName, VariantAnalysis analysis) {
        final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());

        try {
            if (perLocusStream == null)
                perLocusStream = filename == null ? out : new PrintStream(new File(analysisFilenameBase + "interesting_sites"));
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }

        if (filename == null || !explode)
            analysis.initialize(this, out, perLocusStream, filename);
        else {
            File file = new File(filename);
            try {
                analysis.initialize(this, new PrintStream(new FileOutputStream(file)), perLocusStream, filename);
            } catch (FileNotFoundException e) {
                throw new StingException("Couldn't open analysis output file " + filename, e);
            }
        }
    }

    /**
     * Per-site traversal: selects/filters the eval record (biallelic, confidence,
     * sample, INFO selector, VCF filter field) and feeds it to every active subset.
     * Returns 1 for processed sites, 0 for the trailing null-reference site.
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        //System.out.printf("Tracker at %s is %s, ref is %s, skip is %d mapped is %d%n", context.getLocation(), tracker, ref, context.getSkippedBases(), nMappedSites);
        nMappedSites += context.getSkippedBases();

        //System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref);
        //if ( ref == null )
        //    out.printf("Last position was %s: skipping %d bases%n",
        //            context.getLocation(), context.getSkippedBases() );

        if ( ref == null ) { // we are seeing the last site
            return 0;
        }

        nMappedSites++;

        int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
        if (nBoundGoodRods > 0) {
            //System.out.printf("%s: n = %d%n", context.getLocation(), nBoundGoodRods );

            // Iterate over each analysis, and update it
            Variation eval = tracker.lookup("eval",Variation.class);
            Variation evalForFilter = null;

            // ensure that the variation we're looking at is bi-allelic
            if ( eval != null && ! eval.isBiallelic() )
                eval = null;

            // drop calls below the confidence threshold (phred = 10 * -log10(Perror))
            if (eval != null)
                if (eval.getNegLog10PError() * 10.0 < minConfidenceScore) eval = null;

            if ( eval != null && (eval instanceof RodVCF) ) {
                if ( sampleName != null ) {
                    // code to grab a particular sample from a VCF
                    Variation evalOld = eval;
                    //System.out.printf("original is %s%n", evalOld);
                    eval = fakeVCFForSample((RodVCF)eval, ref, sampleName);
                    if ( eval != null && eval.isSNP() ) {
                        //System.out.printf("sample is %s%n", eval);
                        //System.out.printf("Replacing %s with %s%n", evalOld, eval);
                        //System.out.printf("%n");
                    } else {
                        eval = null;
                    }
                }

                // keep only records whose INFO field matches the key=regex selector
                if ( vcfInfoSelector != null && eval != null ) {
                    String[] keyValue = vcfInfoSelector.split("=");
                    Map<String, String> map = ((RodVCF)eval).getRecord().getInfoValues();
                    if ( map.containsKey(keyValue[0]) && ! Pattern.matches(keyValue[1], map.get(keyValue[0]) ) )
                        eval = null;
                }

                // filtered records are kept for the FILTERED_SNPS subset only
                // (unless --includeFilteredRecords keeps them everywhere)
                if ( eval != null && ((RodVCF)eval).mCurrentRecord.isFiltered() ) {
                    evalForFilter = eval;
                    if ( ! includeFilteredRecords ) {
                        eval = null; // we are not including filtered records, so set eval to null
                    }
                    //System.out.printf("Rejecting filtered record %s%n", eval);
                }
            }

            // update stats about all of the SNPs
            updateAnalysisSet(ANALYSIS_TYPE.ALL_SNPS, eval, tracker, ref.getBase(), context);

            // update the known / novel set by checking whether the knownSNPDBName track has an entry here
            if (eval != null) {
                ANALYSIS_TYPE noveltySet = getNovelAnalysisType(tracker);
                updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
            }

            if (evalForFilter != null) {
                updateAnalysisSet(ANALYSIS_TYPE.FILTERED_SNPS, evalForFilter, tracker, ref.getBase(), context);
            }

            // are we a population backed call? then update
            if (eval instanceof SNPCallFromGenotypes) {
                SNPCallFromGenotypes call = (SNPCallFromGenotypes) eval;
                int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
                //System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
                final ANALYSIS_TYPE s = nVarGenotypes == 1 ? ANALYSIS_TYPE.SINGLETON_SNPS : ANALYSIS_TYPE.TWOHIT_SNPS;
                updateAnalysisSet(s, eval, tracker, ref.getBase(), context);
            }
        }

        return 1;
    }

    /**
     * Builds a single-sample VCF record for sampleName from a multi-sample record,
     * carrying over alt alleles, filter string, and INFO fields; returns null when
     * the sample's genotype has no error estimate (getNegLog10PError() <= 0).
     */
    private RodVCF fakeVCFForSample(RodVCF eval, ReferenceContext ref, final String sampleName) {
        VCFGenotypeRecord genotype = (VCFGenotypeRecord)eval.getGenotype(sampleName);
        if ( genotype.getNegLog10PError() > 0 ) {
            VCFRecord record = new VCFRecord(Character.toString(ref.getBase()), ref.getLocus(), "GT");
            record.setAlternateBases(eval.getRecord().getAlternateAlleles());
            record.addGenotypeRecord(genotype);
            record.setQual(10*genotype.getNegLog10PError());
            record.setFilterString(eval.getFilterString());
            record.addInfoFields(eval.getInfoValues());
            return new RodVCF("fakeVCFForSample", record, eval.getReader());
        } else {
            return null;
        }
    }

    /**
     * Classifies the site by its "dbsnp" track content: no records = NOVEL_SNPS,
     * any SNP record = KNOWN_SNPS, only non-SNP records = SNPS_AT_NON_SNP_SITES.
     */
    private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
        List<Object> dbsnpList = tracker.getReferenceMetaData("dbsnp");
        if (dbsnpList.size() == 0)
            return ANALYSIS_TYPE.NOVEL_SNPS;
        for (Object d : dbsnpList) {
            if (((rodDbSNP) d).isSNP()) {
                return ANALYSIS_TYPE.KNOWN_SNPS;
            }
        }
        return ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;

        // old and busted way of doing this
        //        Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
        //
        //        ANALYSIS_TYPE noveltySet = null;
        //        if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
        //        else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
        //        else { // if ( dbsnp.isIndel() )
        //            noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
    }

    /** Whether -V was given, i.e. flagged sites should be written out. */
    public boolean includeViolations() { return mIncludeViolations; }

    /**
     * Feeds one site to every analysis in the named subset; when -V is set,
     * non-null ("interesting") results are echoed to the call stream.
     */
    public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
                                  RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        // Iterate over each analysis, and update it
        ArrayList<VariantAnalysis> set = getAnalysisSet(analysisSetName);
        if ( set != null ) {
            for ( VariantAnalysis analysis : set ) {
                String s = analysis.update(eval, tracker, ref, context);
                if ( s != null && includeViolations() ) {
                    analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
                }
            }
        }
    }

    // Given result of map function
    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer value, Integer sum) {
        return treeReduce(sum, value);
    }

    public Integer treeReduce(Integer lhs, Integer rhs) {
        return lhs + rhs;
    }

    /** Prints the summary of every active subset once traversal completes. */
    public void onTraversalDone(Integer result) {
        for (ANALYSIS_TYPE analysisSetName : ALL_ANALYSIS_NAMES) {
            printAnalysisSet(analysisSetName);
        }
    }

    // builds the "subset,keyword,analysis" prefix, padded to a fixed column width
    private String getLineHeader(final ANALYSIS_TYPE analysisSetName, final String keyword, final String analysis) {
        String s = Utils.join(",", Arrays.asList(analysisSetName, keyword, analysis));
        return s + Utils.dupString(' ', Math.max(50 - s.length(), 1));
    }

    /** Finalizes each analysis in the subset and writes its header plus summary lines. */
    private void printAnalysisSet(final ANALYSIS_TYPE analysisSetName) {
        //out.printf("Writing analysis set %s", analysisSetName);
        Date now = new Date();
        for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) {
            String header = getLineHeader(analysisSetName, "summary", analysis.getName());
            analysis.finalize(getNMappedSites());
            PrintStream stream = analysis.getSummaryPrintStream();
            stream.printf("%s%s%n", header, Utils.dupString('-', 78));
            //stream.printf("%s Analysis set %s%n", analysisSetName, , analysisSetName);
            stream.printf("%sAnalysis name %s%n", header, analysis.getName());
            stream.printf("%sAnalysis params %s%n", header, Utils.join(" ", analysis.getParams()));
            stream.printf("%sAnalysis class %s%n", header, analysis.getClass().getName());
            if (!supressDateInformation) stream.printf("%sAnalysis time %s%n", header, now);
            for (String line : analysis.done()) {
                stream.printf("%s%s%n", header, line);
            }
        }
    }
}

View File

@ -1,40 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.genotype.Variation;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class VariantMatcher extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
    // Name of the reference-ordered-data track to match eval calls against.
    String dbName;

    public VariantMatcher(final String name) {
        super("variant_matches");
        dbName = name;
    }

    /**
     * Compares the eval call against the bound database track at this site.
     * Returns null when neither track has a variant here; otherwise returns a
     * line whose flag marks concordance: "*** " both present, ">>> " database
     * only, "<<< " eval only.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        final Variation db = tracker.lookup(dbName, Variation.class);
        if ( eval == null && db == null )
            return null;

        final String matchFlag;
        if ( eval != null && db != null )
            matchFlag = "*** ";
        else if ( db != null )
            matchFlag = ">>> ";
        else
            matchFlag = "<<< ";

        return String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval);
    }
}

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
package org.broadinstitute.sting.oneoffprojects.walkers;
import org.broadinstitute.sting.utils.genotype.Genotype;

View File

@ -4,7 +4,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.ConcordanceTruthTable;
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;

View File

@ -1,222 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.io.File;
import java.util.*;
/**
*
* @author aaron
*
* Class VariantEvalPerformanceTest
*
* TODO: remove me Eric
*/
/**
 * NOTE(review): this class is a near byte-for-byte duplicate of
 * VariantEvalWalkerIntegrationTest and, per the TODO above, is slated for
 * removal. Every test launches the v1 VariantEval walker end-to-end via the
 * WalkerTest harness and compares the MD5 of the output file against a
 * previously recorded value, so any change to walker output fails these tests.
 */
public class VariantEvalPerformanceTest extends WalkerTest {
    @Test
    public void testEvalVariantROD() {
        // extra command-line arguments -> expected output-file MD5
        HashMap<String, String> md5 = new HashMap<String, String>();
        md5.put("", "9cfda40f521d75a3e8bafc44a663c14a");
        md5.put("-A", "8fea7cc25f551ce170636fc35c5ae0fe");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
         *
         */
        for ( Map.Entry<String, String> e : md5.entrySet() ) {
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                    " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                    " -T VariantEval" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -L 1:10,000,000-11,000,000" +
                    " --outerr %s" +
                    " --supressDateInformation " + e.getKey(),
                    1, // just one output file
                    Arrays.asList(e.getValue()));
            // return value unused: executeTest asserts MD5 equality internally
            List<File> result = executeTest("testEvalVariantROD", spec).getFirst();
        }
    }
    @Test
    public void testEvalVariantRODConfSix() {
        // NOTE(review): method name says "ConfSix", the comment below says
        // -minConfidenceScore 6, but the command actually passes
        // -minPhredConfidenceScore 60 and reports as "ConfSixty" — confirm
        // which threshold was intended before reusing this as a template.
        List<String> md5 = new ArrayList<String>();
        md5.add("11d636d105f902680c46b9f2e330d922");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
         *   -minConfidenceScore 6
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " -minPhredConfidenceScore 60",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalVariantRODConfSixty", spec).getFirst();
    }
    @Test
    public void testEvalVariantRODOutputViolations() {
        List<String> md5 = new ArrayList<String>();
        md5.add("12ecd457d62329e9d4e593de904a457d");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
         *   --includeViolations
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " --includeViolations",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalVariantRODOutputViolations", spec).getFirst();
    }
    @Test
    public void testEvalGenotypeROD() {
        List<String> md5 = new ArrayList<String>();
        md5.add("6ed44fd586c89dafd40cb8e0194dc456");
        /**
         * the above MD5 was calculated after running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --evalContainsGenotypes \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls \
         *   --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " --evalContainsGenotypes" +
                " --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalGenotypeROD", spec).getFirst();
    }
    @Test
    public void testEvalMarksGenotypingExample() {
        List<String> md5 = new ArrayList<String>();
        md5.add("c0396cfe89a63948aebbbae0a0e06678");
        /**
         * Run with the following commands:
         *
         * java -Xmx2048m -jar /humgen/gsa-hphome1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar
         *   -T VariantEval -R /broad/1KG/reference/human_b36_both.fasta -l INFO
         *   -B eval,Variants,/humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls
         *   -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
         *   -G -L 1 -o /humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls.eval
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
                "-B eval,Variants," + validationDataLocation + "UMichVsBroad.venn.set1Only.calls " +
                "-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff " +
                "-G " +
                "--supressDateInformation " +
                "-L 1:1-10,000,000 " +
                "--outerr %s",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalMarksGenotypingExample", spec).getFirst();
    }
    @Test
    public void testEvalRuntimeWithLotsOfIntervals() {
        List<String> md5 = new ArrayList<String>();
        md5.add("6a90341517fc3c5026529301d9970c7b");
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
                "-B eval,Variants," + validationDataLocation + "NA12878.pilot_3.all.geli.calls " +
                "-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod " +
                "--supressDateInformation " +
                "-L /humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list " +
                "--outerr %s",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalRuntimeWithLotsOfIntervals", spec).getFirst();
    }
    @Test
    public void testVCFVariantEvals() {
        // extra command-line arguments -> expected output-file MD5
        HashMap<String, String> md5 = new HashMap<String, String>();
        md5.put("", "ee6b096169d6c5e2ce49d394fbec799b");
        md5.put("-A", "a443193c0810363f85278b1cfaed2fff");
        md5.put("-A --includeFilteredRecords", "812d7f2ecac28b1be7e7028af17df9c0");
        md5.put("-A --sampleName NA12878", "a443193c0810363f85278b1cfaed2fff");
        md5.put("-A -vcfInfoSelector AF=0.50", "afed4bf0c9f11b86f6e5356012f9cf2d");
        for ( Map.Entry<String, String> e : md5.entrySet() ) {
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                    " --rodBind eval,VCF," + validationDataLocation + "NA12878.example1.vcf" +
                    " -T VariantEval" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff" +
                    " -G" +
                    " -L 1:1-10,000" +
                    " --outerr %s" +
                    " --supressDateInformation " + e.getKey(),
                    1, // just one output file
                    Arrays.asList(e.getValue()));
            // return value unused: executeTest asserts MD5 equality internally
            List<File> result = executeTest("testVCFVariantEvals", spec).getFirst();
        }
    }
}

View File

@ -1,221 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.io.File;
import java.util.*;
/**
* @author aaron
* <p/>
* Class VariantEvalWalkerTest
* <p/>
* test out the variant eval walker under different runtime conditions.
*/
/**
 * End-to-end tests for the v1 VariantEval walker. Each test runs the walker
 * through the WalkerTest harness and compares the MD5 of the output file
 * against a previously recorded value, so any change to walker output fails
 * these tests. NOTE(review): this class is a near byte-for-byte duplicate of
 * VariantEvalPerformanceTest.
 */
public class VariantEvalWalkerIntegrationTest extends WalkerTest {
    @Test
    public void testEvalVariantROD() {
        // extra command-line arguments -> expected output-file MD5
        HashMap<String, String> md5 = new HashMap<String, String>();
        md5.put("", "9cfda40f521d75a3e8bafc44a663c14a");
        md5.put("-A", "8fea7cc25f551ce170636fc35c5ae0fe");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
         *
         */
        for ( Map.Entry<String, String> e : md5.entrySet() ) {
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                    " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                    " -T VariantEval" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -L 1:10,000,000-11,000,000" +
                    " --outerr %s" +
                    " --supressDateInformation " + e.getKey(),
                    1, // just one output file
                    Arrays.asList(e.getValue()));
            // return value unused: executeTest asserts MD5 equality internally
            List<File> result = executeTest("testEvalVariantROD", spec).getFirst();
        }
    }
    @Test
    public void testEvalVariantRODConfSix() {
        // NOTE(review): method name says "ConfSix", the comment below says
        // -minConfidenceScore 6, but the command actually passes
        // -minPhredConfidenceScore 60 and reports as "ConfSixty" — confirm
        // which threshold was intended before reusing this as a template.
        List<String> md5 = new ArrayList<String>();
        md5.add("11d636d105f902680c46b9f2e330d922");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
         *   -minConfidenceScore 6
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " -minPhredConfidenceScore 60",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalVariantRODConfSixty", spec).getFirst();
    }
    @Test
    public void testEvalVariantRODOutputViolations() {
        List<String> md5 = new ArrayList<String>();
        md5.add("12ecd457d62329e9d4e593de904a457d");
        /**
         * the above MD5 was calculated from running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
         *   --includeViolations
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " --includeViolations",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalVariantRODOutputViolations", spec).getFirst();
    }
    @Test
    public void testEvalGenotypeROD() {
        List<String> md5 = new ArrayList<String>();
        md5.add("6ed44fd586c89dafd40cb8e0194dc456");
        /**
         * the above MD5 was calculated after running the following command:
         *
         * java -jar ./dist/GenomeAnalysisTK.jar \
         *   -R /broad/1KG/reference/human_b36_both.fasta \
         *   -T VariantEval \
         *   --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
         *   -L 1:10,000,000-11,000,000 \
         *   --outerr myVariantEval \
         *   --supressDateInformation \
         *   --evalContainsGenotypes \
         *   --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls \
         *   --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
                " -T VariantEval" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -L 1:10,000,000-11,000,000" +
                " --outerr %s" +
                " --supressDateInformation" +
                " --evalContainsGenotypes" +
                " --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalGenotypeROD", spec).getFirst();
    }
    @Test
    public void testEvalMarksGenotypingExample() {
        List<String> md5 = new ArrayList<String>();
        md5.add("c0396cfe89a63948aebbbae0a0e06678");
        /**
         * Run with the following commands:
         *
         * java -Xmx2048m -jar /humgen/gsa-hphome1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar
         *   -T VariantEval -R /broad/1KG/reference/human_b36_both.fasta -l INFO
         *   -B eval,Variants,/humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls
         *   -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
         *   -G -L 1 -o /humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls.eval
         */
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
                "-B eval,Variants," + validationDataLocation + "UMichVsBroad.venn.set1Only.calls " +
                "-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff " +
                "-G " +
                "--supressDateInformation " +
                "-L 1:1-10,000,000 " +
                "--outerr %s",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalMarksGenotypingExample", spec).getFirst();
    }
    @Test
    public void testEvalRuntimeWithLotsOfIntervals() {
        List<String> md5 = new ArrayList<String>();
        md5.add("6a90341517fc3c5026529301d9970c7b");
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
                "-B eval,Variants," + validationDataLocation + "NA12878.pilot_3.all.geli.calls " +
                "-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod " +
                "--supressDateInformation " +
                "-L /humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list " +
                "--outerr %s",
                1, // just one output file
                md5);
        // return value unused: executeTest asserts MD5 equality internally
        List<File> result = executeTest("testEvalRuntimeWithLotsOfIntervals", spec).getFirst();
    }
    @Test
    public void testVCFVariantEvals() {
        // extra command-line arguments -> expected output-file MD5
        HashMap<String, String> md5 = new HashMap<String, String>();
        md5.put("", "ee6b096169d6c5e2ce49d394fbec799b");
        md5.put("-A", "a443193c0810363f85278b1cfaed2fff");
        md5.put("-A --includeFilteredRecords", "812d7f2ecac28b1be7e7028af17df9c0");
        md5.put("-A --sampleName NA12878", "a443193c0810363f85278b1cfaed2fff");
        md5.put("-A -vcfInfoSelector AF=0.50", "afed4bf0c9f11b86f6e5356012f9cf2d");
        for ( Map.Entry<String, String> e : md5.entrySet() ) {
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                    " --rodBind eval,VCF," + validationDataLocation + "NA12878.example1.vcf" +
                    " -T VariantEval" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff" +
                    " -G" +
                    " -L 1:1-10,000" +
                    " --outerr %s" +
                    " --supressDateInformation " + e.getKey(),
                    1, // just one output file
                    Arrays.asList(e.getValue()));
            // return value unused: executeTest asserts MD5 equality internally
            List<File> result = executeTest("testVCFVariantEvals", spec).getFirst();
        }
    }
}