Removing VariantEval (v1); everyone should be using VE2 now. Docs coming ASAP.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3177 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3330e254a9
commit
ac9dc0b4b4
|
|
@ -1,59 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 3, 2009
|
||||
* Time: 5:04:40 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class BasicPoolVariantAnalysis implements VariantAnalysis{
|
||||
protected int numIndividualsInPool;
|
||||
protected String name;
|
||||
protected String filenames;
|
||||
protected PrintStream out, callOut;
|
||||
private VariantEvalWalker master;
|
||||
|
||||
public BasicPoolVariantAnalysis(String name, int nIndividuals) {
|
||||
this.name = name;
|
||||
this.numIndividualsInPool = nIndividuals;
|
||||
}
|
||||
|
||||
public String getName() { return this.name; }
|
||||
|
||||
public int getNumberOfIndividualsInPool() { return this.numIndividualsInPool; }
|
||||
|
||||
public PrintStream getSummaryPrintStream() { return this.out; }
|
||||
|
||||
public PrintStream getCallPrintStream() { return this.callOut; }
|
||||
|
||||
public VariantEvalWalker getMaster() { return this.master; }
|
||||
|
||||
public List<String> getParams() { return new ArrayList<String>(); }
|
||||
|
||||
public List<String> done() { return new ArrayList<String>(); }
|
||||
|
||||
public int getNumberOfAllelesInPool() { return 2*getNumberOfIndividualsInPool(); }
|
||||
|
||||
public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filenames) {
|
||||
this.master = master;
|
||||
this.out = out;
|
||||
this.callOut = callOut;
|
||||
this.filenames = filenames;
|
||||
}
|
||||
|
||||
public void finalize(long nSites) {
|
||||
// no need to finalize data in general
|
||||
}
|
||||
|
||||
public abstract String update(Variation variant, RefMetaDataTracker tracker, char ref, AlignmentContext context);
|
||||
|
||||
}
|
||||
|
|
@ -1,69 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public abstract class BasicVariantAnalysis implements VariantAnalysis {
|
||||
protected String name;
|
||||
protected PrintStream out, callOut;
|
||||
private VariantEvalWalker master;
|
||||
protected String filename;
|
||||
|
||||
public BasicVariantAnalysis(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public VariantEvalWalker getMaster() { return master; }
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public List<String> getParams() {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename) {
|
||||
this.master = master;
|
||||
this.out = out;
|
||||
this.callOut = callOut;
|
||||
this.filename = filename;
|
||||
}
|
||||
|
||||
public PrintStream getSummaryPrintStream() {
|
||||
return out;
|
||||
}
|
||||
|
||||
public PrintStream getCallPrintStream() {
|
||||
return callOut;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
|
||||
/**
|
||||
* No need to finalize the data in general
|
||||
* @param nSites
|
||||
*/
|
||||
public void finalize(long nSites) {
|
||||
|
||||
}
|
||||
|
||||
public abstract String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context);
|
||||
}
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class CallableBasesAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis {
|
||||
//long all_bases = 0;
|
||||
long all_calls = 0;
|
||||
final static double[] thresholds = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100};
|
||||
long[] discoverable_bases = new long[thresholds.length];
|
||||
long[] genotypable_bases = new long[thresholds.length];
|
||||
|
||||
public CallableBasesAnalysis() {
|
||||
super("callable_bases");
|
||||
}
|
||||
|
||||
public long nSites() {
|
||||
return super.getMaster().getNMappedSites();
|
||||
}
|
||||
|
||||
public long nCalls() {
|
||||
return all_calls;
|
||||
}
|
||||
|
||||
public long nDiscoverable(int index) {
|
||||
return discoverable_bases[index];
|
||||
}
|
||||
|
||||
public double percentDiscoverable(int index) {
|
||||
return (100.0 * nDiscoverable(index)) / nSites();
|
||||
}
|
||||
|
||||
public long nGenotypable(int index) {
|
||||
return genotypable_bases[index];
|
||||
}
|
||||
|
||||
public double percentGenotypable(int index) {
|
||||
return (100.0 * nGenotypable(index)) / nSites();
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
//all_bases++;
|
||||
|
||||
if (eval == null) // no data here!
|
||||
return null;
|
||||
|
||||
// we actually have a record here
|
||||
if (!(eval instanceof VariantBackedByGenotype)) { // evaluation record isn't a genotype, die!
|
||||
return null;
|
||||
//throw new RuntimeException("Evaluation track isn't backed by a Genotype!");
|
||||
}
|
||||
|
||||
all_calls++;
|
||||
// For every threshold, updated discoverable and callable
|
||||
for (int i = 0; i < thresholds.length; i++) {
|
||||
double threshold = thresholds[i];
|
||||
|
||||
// update discoverable
|
||||
if ( eval.getNegLog10PError() >= threshold )
|
||||
discoverable_bases[i]++;
|
||||
|
||||
List<Genotype> genotypes = ((VariantBackedByGenotype)eval).getGenotypes();
|
||||
for ( Genotype g : genotypes ) {
|
||||
if ( g.getNegLog10PError() >= threshold ) {
|
||||
genotypable_bases[i]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("total_no_sites %d", nSites()));
|
||||
s.add(String.format("total_no_calls %d", nCalls()));
|
||||
s.add(String.format(""));
|
||||
s.add(String.format("confidence_threshold\tdiscoverable_bases\tdiscoverable_bases_percent\tgenotypable_bases\tgenotypable_bases_percent"));
|
||||
|
||||
for (int i = 0; i < thresholds.length; i++) {
|
||||
double threshold = thresholds[i];
|
||||
s.add(String.format("%6.2f\t%d\t%.6f\t%d\t%.6f", threshold, nDiscoverable(i), percentDiscoverable(i), nGenotypable(i), percentGenotypable(i)));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,155 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public abstract class ChipConcordance extends BasicVariantAnalysis {
|
||||
|
||||
// the array of truth tables used to store data in this analysis
|
||||
private ConcordanceTruthTable[] truthTables;
|
||||
|
||||
// mapping from sample name to index into truthTables for the corresponding table
|
||||
private Map<String, Integer> sampleToArrayIndex;
|
||||
|
||||
// list of all chip rods used
|
||||
private ArrayList<String> rodNames = new ArrayList<String>();
|
||||
|
||||
public ChipConcordance(final String name, final String chipName, boolean chipNameIsFile) {
|
||||
super(name);
|
||||
|
||||
// read sample names from file if appropriate, otherwise use the chip name
|
||||
if ( chipNameIsFile ) {
|
||||
List<String> sampleNames = readSampleNamesFromFile(chipName);
|
||||
rodNames.addAll(sampleNames);
|
||||
} else {
|
||||
rodNames.add(chipName);
|
||||
}
|
||||
|
||||
// initialize the truth table storage data
|
||||
sampleToArrayIndex = new HashMap<String, Integer>();
|
||||
truthTables = createTruthTableMappings(rodNames, sampleToArrayIndex);
|
||||
}
|
||||
|
||||
private List<String> readSampleNamesFromFile(String file) {
|
||||
ArrayList<String> samples = new ArrayList<String>();
|
||||
|
||||
try {
|
||||
BufferedReader reader = new BufferedReader(new FileReader(file));
|
||||
String line = reader.readLine();
|
||||
while ( line != null ) {
|
||||
samples.add(line);
|
||||
line = reader.readLine();
|
||||
}
|
||||
} catch( FileNotFoundException e) {
|
||||
throw new StingException("Chip file at "+file+" was not found. Please check filepath.");
|
||||
} catch( IOException e) {
|
||||
throw new StingException(e.getMessage());
|
||||
}
|
||||
|
||||
return samples;
|
||||
}
|
||||
|
||||
// these methods need to be implemented by subclasses
|
||||
protected abstract ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex);
|
||||
protected abstract void assertVariationIsValid(Variation eval);
|
||||
protected abstract HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames);
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
// get all of the chip rods at this locus
|
||||
HashMap<String, Genotype> chips = new HashMap<String, Genotype>();
|
||||
for ( String name : rodNames ) {
|
||||
Variation chip = tracker.lookup(name,Variation.class);
|
||||
if ( chip != null ) {
|
||||
// chips must be Genotypes
|
||||
if ( !(chip instanceof VariantBackedByGenotype) )
|
||||
throw new StingException("Failure: trying to analyze genotypes using non-genotype truth data");
|
||||
chips.put(name, ((VariantBackedByGenotype)chip).getCalledGenotype());
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate only if we have truth data or a call
|
||||
return ( eval != null || chips.size() > 0 ) ? inc(chips, eval, ref) : null;
|
||||
}
|
||||
|
||||
public String inc(Map<String, Genotype> chips, Variation eval, char ref) {
|
||||
// TODO -- needed to add this for now while we're moving over to VE2
|
||||
if ( !(eval instanceof VariantBackedByGenotype) )
|
||||
return null;
|
||||
|
||||
// each implementing class can decide whether the Variation is valid
|
||||
assertVariationIsValid(eval);
|
||||
|
||||
// This shouldn't happen, but let's check anyways to be safe
|
||||
if (BaseUtils.simpleBaseToBaseIndex(ref) == -1)
|
||||
return null;
|
||||
|
||||
// create a hash of samples to their Genotypes
|
||||
List<Genotype> evals = (eval != null ? ((VariantBackedByGenotype)eval).getGenotypes() : new ArrayList<Genotype>());
|
||||
HashMap<String, Genotype> evalHash = makeGenotypeHash(evals, rodNames);
|
||||
|
||||
// make chip/eval Genotype pairs
|
||||
List<Pair<Genotype, Genotype>>[] chipEvals = new List[truthTables.length];
|
||||
for ( String name : rodNames ) {
|
||||
Genotype chipG = chips.get(name);
|
||||
Genotype evalG = evalHash.get(name);
|
||||
|
||||
if (chipG == null && evalG == null)
|
||||
continue;
|
||||
|
||||
int index = sampleToArrayIndex.get(name);
|
||||
if ( chipEvals[index] == null )
|
||||
chipEvals[index] = new ArrayList<Pair<Genotype, Genotype>>();
|
||||
chipEvals[index].add(new Pair<Genotype, Genotype>(chipG, evalG));
|
||||
}
|
||||
|
||||
// now we can finally update our truth tables with the truth vs. calls data
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int i = 0; i < truthTables.length; i++) {
|
||||
if ( chipEvals[i] != null ) {
|
||||
ConcordanceTruthTable table = truthTables[i];
|
||||
String x = table.addEntry(chipEvals[i], eval, ref);
|
||||
if ( x != null ) s.append(x);
|
||||
}
|
||||
}
|
||||
|
||||
String toReturn = s.toString();
|
||||
return toReturn.equals("") ? null : toReturn;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
|
||||
// let the truth tables do all the printing
|
||||
for ( ConcordanceTruthTable table : truthTables ) {
|
||||
table.addAllStats(s);
|
||||
s.add("");
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,98 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class ClusterCounterAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
int minDistanceForFlagging = 5;
|
||||
|
||||
/**
|
||||
* Threshold distances between neighboring SNPs we will track and assess
|
||||
*/
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100};
|
||||
int[] variantsWithClusters = new int[neighborWiseBoundries.length];
|
||||
/**
|
||||
* Keep track of the last variation we saw in the stream
|
||||
*/
|
||||
Variation lastVariation = null;
|
||||
|
||||
/**
|
||||
* The interval that the last variation occurred in. Don't think that SNPs from different intervals are
|
||||
* too close together in hybrid selection.
|
||||
*/
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
int nSeen = 0;
|
||||
|
||||
|
||||
|
||||
public ClusterCounterAnalysis() {
|
||||
super("cluster_counter_analysis");
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class);
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
if (lastVariation != null) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariation.getLocation();
|
||||
|
||||
// if we are on the same contig, and we in the same interval
|
||||
if (eL.getContigIndex() == lvL.getContigIndex() && ! (lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0)) {
|
||||
long d = eL.distance(lvL);
|
||||
nSeen++;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int i = 0; i < neighborWiseBoundries.length; i++) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
s.append(String.format("%d ", d <= maxDist ? maxDist : 0));
|
||||
if ( d <= maxDist ) {
|
||||
variantsWithClusters[i]++;
|
||||
}
|
||||
}
|
||||
|
||||
// lookup in master for performance reasons
|
||||
if ( d <= minDistanceForFlagging && getMaster().includeViolations() )
|
||||
r = String.format("snp_within_cluster %d %s %s %s", d, eL, lvL, s.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// eval is now the last variation
|
||||
lastVariation = eval;
|
||||
lastVariantInterval = interval;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("snps_counted_for_neighbor_distances %d", nSeen));
|
||||
s.add(String.format("description maxDist count"));
|
||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
int count = variantsWithClusters[i];
|
||||
s.add(String.format("snps_within_clusters_of_size %10d %10d", maxDist, count));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
* If an analysis implements this interface, it asserts that it performs a genotype based analysis, as
|
||||
* opposed a straight variant analysis. The difference here is that variants are not asserted to be
|
||||
* the actual genotype of a particular person, but are really just variation "out-there" in a population.
|
||||
* A genotype analysis would be something like covered bases, confidently called bases, genotyping
|
||||
* concordance, etc.
|
||||
*
|
||||
*/
|
||||
public interface GenotypeAnalysis {
|
||||
|
||||
}
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class GenotypeConcordance extends ChipConcordance implements GenotypeAnalysis {
|
||||
|
||||
private String providedChipName = null;
|
||||
|
||||
public GenotypeConcordance(final String chipName, boolean chipNameIsFile) {
|
||||
super("genotype_concordance", chipName, chipNameIsFile);
|
||||
if ( !chipNameIsFile )
|
||||
providedChipName = chipName;
|
||||
}
|
||||
|
||||
protected void assertVariationIsValid(Variation eval) {
|
||||
// must be a genotype
|
||||
if ( eval != null && !(eval instanceof VariantBackedByGenotype) )
|
||||
throw new StingException("Failure: trying to analyze genotypes of non-genotype data");
|
||||
}
|
||||
|
||||
protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
|
||||
ConcordanceTruthTable[] tables = new ConcordanceTruthTable[rodNames.size()];
|
||||
|
||||
// each sample gets its own truth table
|
||||
for (int i = 0; i < rodNames.size(); i++) {
|
||||
String name = rodNames.get(i);
|
||||
tables[i] = new ConcordanceTruthTable(name);
|
||||
sampleToArrayIndex.put(name, i);
|
||||
}
|
||||
|
||||
return tables;
|
||||
}
|
||||
|
||||
protected HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames) {
|
||||
HashMap<String, Genotype> hash = new HashMap<String, Genotype>();
|
||||
|
||||
// associate each Genotype with the appropriate sample
|
||||
for ( Genotype eval : evals ) {
|
||||
if ( providedChipName != null )
|
||||
hash.put(providedChipName, eval);
|
||||
// TODO -- fix me in VE2
|
||||
//else if ( eval instanceof SampleBacked )
|
||||
// hash.put(((SampleBacked)eval).getSampleName(), eval);
|
||||
else if ( rodNames.size() == 1 )
|
||||
hash.put(rodNames.get(0), eval);
|
||||
else
|
||||
throw new StingException("Genotype data has no associated samples but are multi-sample...?");
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import cern.jet.math.Arithmetic;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.SNPCallFromGenotypes;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class HardyWeinbergEquilibrium extends BasicVariantAnalysis implements PopulationAnalysis {
|
||||
private double threshold;
|
||||
int nSites = 0;
|
||||
int nViolations = 0;
|
||||
|
||||
HardyWeinbergEquilibrium(double threshold) {
|
||||
super("hwe");
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null &&
|
||||
eval instanceof SNPCallFromGenotypes ) {
|
||||
nSites++;
|
||||
SNPCallFromGenotypes call = (SNPCallFromGenotypes)eval;
|
||||
//double pPoint = pointHWcalculate(call);
|
||||
double pTailed = tailedHWcalculate(call);
|
||||
double p = pTailed;
|
||||
|
||||
//System.out.printf("HWE point=%.4e %s vs. tailed=%.4e %s for %s%n", pPoint, pPoint < threshold ? "***" : " ", pTailed, pTailed < threshold ? "***" : " ", call);
|
||||
|
||||
if ( p < threshold ) {
|
||||
r = String.format("HWE-violation %f < %f at %s", p, threshold, eval);
|
||||
nViolations++;
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public double tailedHWcalculate(SNPCallFromGenotypes call) {
|
||||
int obsAA = call.nHomRefGenotypes();
|
||||
int obsAB = call.nHetGenotypes();
|
||||
int obsBB = call.nHomVarGenotypes();
|
||||
return HardyWeinbergCalculation.hwCalculate(obsAA, obsAB, obsBB);
|
||||
}
|
||||
|
||||
public double pointHWcalculate(SNPCallFromGenotypes call) {
|
||||
int nAA = call.nHomRefGenotypes();
|
||||
int nAa = call.nHetGenotypes();
|
||||
int naa = call.nHomVarGenotypes();
|
||||
int nA = 2 * nAA + nAa;
|
||||
int n = nAA + nAa + naa;
|
||||
|
||||
//
|
||||
// from Emigh 1980
|
||||
//
|
||||
// P = pr[nAa | nA] = multinomial[n over nAA, nAa, naa] / binomial[2n over nA] * 2^nAa
|
||||
//
|
||||
// where nAA, nAa, naa are the observed numbers of the three genotypes, AA, Aa, and aa,
|
||||
// respectively, and nA is the number of A alleles, where nA = 2nAA + nAa, and n is the number of alleles
|
||||
//
|
||||
int[] mXs = { nAA, nAa, naa }; // counts of each genotype as vector
|
||||
double m = MathUtils.multinomial(mXs);
|
||||
double b = Arithmetic.binomial(2 * n, nA);
|
||||
double tosses = Math.pow(2, nAa);
|
||||
double p = (m / b) * tosses;
|
||||
|
||||
if ( false ) {
|
||||
System.out.printf("HWE-violation at %s %f < %f %1.2f %5d %5d %5d %5d %5d %.2e %.2e %.2e => %.6e [%s]%n",
|
||||
call.getLocation(), p, threshold, call.getNonRefAlleleFrequency(), nAA, nAa, naa, nA, n, m, b, tosses, p, call);
|
||||
System.out.printf("(factorial(%d) / (factorial(%d) * factorial(%d) * factorial(%d))) / choose(%d, %d) * 2^%d - %f < 1e-3%n",
|
||||
nAA + nAa + naa, nAA, nAa, naa, 2 * n, nA, nAa, p);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("n_calls %d", nSites));
|
||||
s.add(String.format("n_violations %d", nViolations));
|
||||
s.add(String.format("violations_rate %.2f", (100.0*nViolations) / nSites));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class Histogram<T> {
|
||||
ArrayList<T> data;
|
||||
int nBins = 100;
|
||||
double minValue, maxValue;
|
||||
|
||||
public Histogram(int nBins, double minValue, double maxValue, T initialValue) {
|
||||
data = new ArrayList<T>(nBins);
|
||||
this.nBins = nBins;
|
||||
this.minValue = minValue;
|
||||
this.maxValue = maxValue;
|
||||
initialize(initialValue);
|
||||
}
|
||||
|
||||
public void initialize(T initialValue) {
|
||||
data.clear();
|
||||
for ( int i = 0; i < nBins; i++ ) {
|
||||
data.add(i, initialValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void setBin(int i, T value) {
|
||||
data.set(i, value);
|
||||
}
|
||||
|
||||
public void setBin(double x, T value) {
|
||||
setBin(x2bin(x), value);
|
||||
}
|
||||
|
||||
public T getBin(int binI) {
|
||||
return data.get(binI);
|
||||
}
|
||||
|
||||
public T getBin(double x) {
|
||||
return getBin(x2bin(x));
|
||||
}
|
||||
|
||||
public int x2bin(double x) {
|
||||
return (int)Math.floor((x - minValue) / ((maxValue - minValue) * nBins));
|
||||
}
|
||||
|
||||
public double bin2x(int bin) {
|
||||
return (maxValue - minValue) * ((bin + 0.5) / nBins) + minValue;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class IndelMetricsAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
long insertions = 0;
|
||||
long deletions = 0;
|
||||
int maxSize = 0;
|
||||
long[][] sizes = new long[2][100];
|
||||
|
||||
public IndelMetricsAnalysis() {
|
||||
super("indel_metrics");
|
||||
for (int i = 0; i < 100; i++)
|
||||
sizes[0][i] = sizes[1][i] = 0;
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
if (eval != null && eval.isInsertion()) {
|
||||
if (eval.isInsertion())
|
||||
insertions++;
|
||||
else if (eval.isDeletion())
|
||||
deletions++;
|
||||
else
|
||||
throw new RuntimeException("Variation is indel, but isn't insertion or deletion!");
|
||||
|
||||
for (String allele : eval.getAlleleList())
|
||||
if (allele.length() < 100) {
|
||||
sizes[eval.isDeletion() ? 0 : 1][allele.length()]++;
|
||||
if (allele.length() > maxSize)
|
||||
maxSize = allele.length();
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("total_deletions %d", deletions));
|
||||
s.add(String.format("total_insertions %d", insertions));
|
||||
s.add(String.format(""));
|
||||
|
||||
s.add("Size Distribution");
|
||||
s.add("size\tdeletions\tinsertions");
|
||||
for (int i = 1; i <= maxSize; i++)
|
||||
s.add(String.format("%d\t%d\t\t%d", i, sizes[0][i], sizes[1][i]));
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,95 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.IntervalRod;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class NeighborDistanceAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
ArrayList<Long> neighborWiseDistances;
|
||||
int minDistanceForFlagging = 5;
|
||||
int[] neighborWiseBoundries = {1, 2, 5, 10, 20, 50, 100, 1000, 10000};
|
||||
|
||||
Variation lastVariation = null;
|
||||
GenomeLoc lastVariantInterval = null;
|
||||
PrintStream violationsOut = null;
|
||||
|
||||
public NeighborDistanceAnalysis() {
|
||||
super("neighbor_distances");
|
||||
neighborWiseDistances = new ArrayList<Long>();
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class);
|
||||
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
if (lastVariation != null) {
|
||||
GenomeLoc eL = eval.getLocation();
|
||||
GenomeLoc lvL = lastVariation.getLocation();
|
||||
if (eL.getContigIndex() == lvL.getContigIndex()) {
|
||||
long d = eL.distance(lvL);
|
||||
if ( lastVariantInterval != null && lastVariantInterval.compareTo(interval) != 0) {
|
||||
// we're on different intervals
|
||||
//out.printf("# Excluding %d %s %s vs. %s %s%n", d, eL, interval, lvL, lastVariantInterval);
|
||||
} else {
|
||||
neighborWiseDistances.add(d);
|
||||
r = d <= minDistanceForFlagging ? String.format("neighbor-distance %d %s %s", d, eL, lvL) : null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastVariation = eval;
|
||||
lastVariantInterval = interval;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
int[] pairCounts = new int[neighborWiseBoundries.length];
|
||||
Arrays.fill(pairCounts, 0);
|
||||
for ( long dist : neighborWiseDistances ) {
|
||||
boolean done = false;
|
||||
for ( int i = 0; i < neighborWiseBoundries.length && ! done ; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
if ( dist <= maxDist ) {
|
||||
pairCounts[i]++;
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("snps_counted_for_neighbor_distances %d", neighborWiseDistances.size()));
|
||||
int cumulative = 0;
|
||||
s.add(String.format("description maxDist count cumulative"));
|
||||
for ( int i = 0; i < neighborWiseBoundries.length; i++ ) {
|
||||
int maxDist = neighborWiseBoundries[i];
|
||||
int count = pairCounts[i];
|
||||
cumulative += count;
|
||||
s.add(String.format("snps_immediate_neighbors_within_bp %d %d %d", maxDist, count, cumulative));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 3, 2009
|
||||
* Time: 4:39:12 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public interface PoolAnalysis extends PopulationAnalysis, GenotypeAnalysis {
|
||||
|
||||
}
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class PooledConcordance extends ChipConcordance implements PoolAnalysis {
|
||||
|
||||
public PooledConcordance(final String chipName, boolean chipNameIsFile) {
|
||||
super("pooled_concordance", chipName, chipNameIsFile);
|
||||
}
|
||||
|
||||
protected void assertVariationIsValid(Variation eval) {
|
||||
// we only support VCF for now
|
||||
if ( eval != null && !(eval instanceof RodVCF) )
|
||||
throw new StingException("Failure: we currently only support analyzing pooled data in VCF format");
|
||||
}
|
||||
|
||||
protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
|
||||
// there is only one truth table - the pool-wide one
|
||||
ConcordanceTruthTable[] tables = new ConcordanceTruthTable[1];
|
||||
tables[0] = new ConcordanceTruthTable(rodNames.size());
|
||||
|
||||
for ( String name : rodNames ) {
|
||||
sampleToArrayIndex.put(name, 0);
|
||||
}
|
||||
|
||||
return tables;
|
||||
}
|
||||
|
||||
protected HashMap<String, Genotype> makeGenotypeHash(List<Genotype> evals, List<String> rodNames) {
|
||||
|
||||
HashMap<String, Genotype> hash = new HashMap<String, Genotype>();
|
||||
|
||||
// te Genotype is pool-wide so all samples are associated with it
|
||||
if ( evals.size() > 0 ) {
|
||||
for ( String name : rodNames )
|
||||
hash.put(name, evals.get(0));
|
||||
}
|
||||
|
||||
return hash;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Oct 20, 2009
|
||||
* Time: 4:03:23 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PooledFrequencyAnalysis extends BasicPoolVariantAnalysis implements PoolAnalysis, GenotypeAnalysis {
|
||||
/*
|
||||
Maintains an array of basic variant analyses broken down by estimated frequency
|
||||
*/
|
||||
private VariantDBCoverage[] coverageAnalysisByFrequency;
|
||||
private VariantCounter[] variantCounterByFrequency;
|
||||
private TransitionTranversionAnalysis[] transitionTransversionByFrequency;
|
||||
|
||||
public PooledFrequencyAnalysis(int poolSize, String knownDBSNPName ) {
|
||||
super("Pooled_Frequency_Analysis",poolSize);
|
||||
if ( poolSize > 0 ) {
|
||||
coverageAnalysisByFrequency = new VariantDBCoverage[getNumberOfAllelesInPool()+1];
|
||||
variantCounterByFrequency = new VariantCounter[getNumberOfAllelesInPool()+1];
|
||||
transitionTransversionByFrequency = new TransitionTranversionAnalysis[getNumberOfAllelesInPool()+1];
|
||||
for ( int j = 0; j < getNumberOfAllelesInPool()+1; j ++ ) {
|
||||
coverageAnalysisByFrequency[j] = new VariantDBCoverage(knownDBSNPName);
|
||||
variantCounterByFrequency[j] = new VariantCounter();
|
||||
transitionTransversionByFrequency[j] = new TransitionTranversionAnalysis();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize(VariantEvalWalker master, PrintStream out1, PrintStream out2, String name) {
|
||||
super.initialize(master,out1,out2,name);
|
||||
|
||||
}
|
||||
|
||||
public String update( Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context ) {
|
||||
int f = calculateFrequencyFromVariation(eval);
|
||||
String interest1 = coverageAnalysisByFrequency[f].update(eval,tracker,ref,context);
|
||||
String interest2 = variantCounterByFrequency[f].update(eval,tracker,ref,context);
|
||||
String interest3 = transitionTransversionByFrequency[f].update(eval,tracker,ref,context);
|
||||
|
||||
if ( interest1 != null ) {
|
||||
return "coverageAnalysis_Frequency_"+f+"\t"+interest1;
|
||||
}
|
||||
|
||||
if ( interest2 != null ) {
|
||||
return "VariantCounter_Frequency_"+f+"\t"+interest2;
|
||||
}
|
||||
|
||||
if ( interest3 != null ) {
|
||||
return "transitionTransversion_Frequency_"+f+"\t"+interest3;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public int calculateFrequencyFromVariation(Variation eval) {
|
||||
double freq = eval.getNonRefAlleleFrequency();
|
||||
return (int) Math.round(getNumberOfAllelesInPool()*freq);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
for ( int j = 0; j < getNumberOfAllelesInPool()+1; j ++ ) {
|
||||
s.add("Pool_Frequency_Analysis_Frequency:\t"+j);
|
||||
s.addAll(coverageAnalysisByFrequency[j].done());
|
||||
s.addAll(variantCounterByFrequency[j].done());
|
||||
s.addAll(transitionTransversionByFrequency[j].done());
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
* If an analysis implements this interface, it asserts that it performs a genotype based analysis, as
|
||||
* opposed a straight variant analysis. The difference here is that variants are not asserted to be
|
||||
* the actual genotype of a particular person, but are really just variation "out-there" in a population.
|
||||
* A genotype analysis would be something like covered bases, confidently called bases, genotyping
|
||||
* concordance, etc.
|
||||
*
|
||||
*/
|
||||
public interface PopulationAnalysis {
|
||||
|
||||
}
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
long nTransitions = 0, nTransversions = 0;
|
||||
|
||||
public TransitionTranversionAnalysis() {
|
||||
super("transitions_transversions");
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
if (eval != null && eval.isSNP()) {
|
||||
char altBase = eval.getAlternativeBaseForSNP();
|
||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType((byte)ref, (byte)altBase);
|
||||
if (subType == BaseUtils.BaseSubstitutionType.TRANSITION) {
|
||||
nTransitions++;
|
||||
} else {
|
||||
nTransversions++;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("transitions %d", nTransitions));
|
||||
s.add(String.format("transversions %d", nTransversions));
|
||||
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: andrewk
|
||||
* Date: Sep 30, 2009
|
||||
* Time: 5:28:57 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
// @Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=VariationRod.class), @RMD(name=rodDbSNP.STANDARD_DBSNP_TRACK_NAME,type= rodDbSNP.class),@RMD(name="hapmap-chip",type=RodGenotypeChipAsGFF.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
|
||||
|
||||
public class ValidationDataAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
private int calls_at_validated_sites = 0;
|
||||
private int calls_at_sites_validated_true = 0;
|
||||
private int validated_sites = 0;
|
||||
|
||||
public ValidationDataAnalysis() {
|
||||
super("validation_data_analysis");
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
|
||||
validated_sites++;
|
||||
List<Object> objects = tracker.getReferenceMetaData("validation");
|
||||
Object val_data = (objects.size() > 0) ? objects.get(0) : null;
|
||||
|
||||
if (eval != null) {
|
||||
calls_at_sites_validated_true++;
|
||||
//out.format("Has validaiton data: %s\n", val_data.getLocation());
|
||||
if (val_data != null) {
|
||||
//out.format("Validated true: %s\n", val_data.getLocation());
|
||||
calls_at_validated_sites++;
|
||||
}
|
||||
}
|
||||
//out.println(context.getLocation());
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
if (calls_at_validated_sites > 0) { // only output info if there were any validation sites encountered
|
||||
s.add(String.format("validated sites %d", validated_sites));
|
||||
s.add(String.format("calls at validated sites %d", calls_at_validated_sites));
|
||||
s.add(String.format("calls at sites validated true %d", calls_at_sites_validated_true));
|
||||
s.add(String.format("%% validated true %f", (float) calls_at_validated_sites / calls_at_sites_validated_true));
|
||||
}else{
|
||||
s.add("No validation data encountered");
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public interface VariantAnalysis {
|
||||
public String getName();
|
||||
public PrintStream getSummaryPrintStream();
|
||||
public PrintStream getCallPrintStream();
|
||||
public List<String> getParams();
|
||||
public void initialize(VariantEvalWalker master, PrintStream out, PrintStream callOut, String filename);
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context);
|
||||
public void finalize(long nSites);
|
||||
public List<String> done();
|
||||
}
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class VariantCounter extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
long nBasesCovered = 0;
|
||||
int nSNPs = 0;
|
||||
int nHets = 0;
|
||||
|
||||
public VariantCounter() {
|
||||
super("variant_counts");
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
nSNPs += eval == null || eval.isReference() ? 0 : 1;
|
||||
|
||||
// TODO -- break the het check out to a different module used only for single samples
|
||||
|
||||
if ( this.getMaster().evalContainsGenotypes && eval != null && eval.isBiallelic() && eval.isSNP() && eval instanceof VariantBackedByGenotype ) {
|
||||
List<Genotype> genotypes = ((VariantBackedByGenotype)eval).getGenotypes();
|
||||
if ( genotypes.size() == 1 && genotypes.get(0).isHet() ) {
|
||||
nHets++;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* No need to finalize the data in general
|
||||
* @param nSites
|
||||
*/
|
||||
public void finalize(long nSites) {
|
||||
nBasesCovered = nSites;
|
||||
}
|
||||
|
||||
private double variantRate(int n) {
|
||||
return n / (1.0 * Math.max(nBasesCovered, 1));
|
||||
}
|
||||
|
||||
private long variantRateInverse(int n) {
|
||||
return nBasesCovered / Math.max(n, 1);
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("n bases covered %d", nBasesCovered));
|
||||
s.add(String.format("variants %d", nSNPs));
|
||||
s.add(String.format("variant rate %.5f confident variants per base", variantRate(nSNPs)));
|
||||
s.add(String.format("variant rate 1 / %d confident variants per base", variantRateInverse(nSNPs)));
|
||||
|
||||
if ( this.getMaster().evalContainsGenotypes ) {
|
||||
s.add(String.format("heterozygotes %d", nHets));
|
||||
s.add(String.format("homozygotes %d", nSNPs - nHets));
|
||||
|
||||
s.add(String.format("heterozygosity %.5f confident hets per base", variantRate(nHets)));
|
||||
s.add(String.format("heterozygosity 1 / %d confident hets per base", variantRateInverse(nHets)));
|
||||
|
||||
s.add(String.format("het to hom ratio %.2f confident hets per confident homozygote non-refs [single sample genome-wide expectation is 2:1]",
|
||||
((double)nHets) / (Math.max(nSNPs - nHets, 1))));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
private String dbName;
|
||||
private int nDBSNPs = 0;
|
||||
private int nEvalObs = 0;
|
||||
private int nSNPsAtdbSNPs = 0;
|
||||
private int nConcordant = 0;
|
||||
|
||||
public VariantDBCoverage(final String name) {
|
||||
super("db_coverage");
|
||||
dbName = name;
|
||||
}
|
||||
|
||||
public int nDBSNPs() { return nDBSNPs; }
|
||||
public int nEvalSites() { return nEvalObs; }
|
||||
public int nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; }
|
||||
public int nConcordant() { return nConcordant; }
|
||||
public int nNovelSites() { return Math.abs(nEvalSites() - nSNPsAtdbSNPs()); }
|
||||
|
||||
public boolean discordantP(Variation dbSNP, Variation eval) {
|
||||
if (eval != null) {
|
||||
char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : Utils.stringToChar(eval.getReference());
|
||||
if (dbSNP != null && dbSNP.isSNP())
|
||||
return !dbSNP.getAlleleList().contains(String.valueOf(alt));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* What fraction of the evaluated site variants were also found in the db?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double dbSNPRate() {
|
||||
return nSNPsAtdbSNPs() / (1.0 * nEvalSites());
|
||||
}
|
||||
|
||||
public double concordanceRate() {
|
||||
return nConcordant() / (1.0 * nSNPsAtdbSNPs());
|
||||
}
|
||||
|
||||
public static Variation getFirstRealSNP(List<Object> dbsnpList) {
|
||||
if (dbsnpList == null)
|
||||
return null;
|
||||
|
||||
Variation dbsnp = null;
|
||||
for (Object d : dbsnpList) {
|
||||
if (((Variation) d).isSNP() && (!(d instanceof RodVCF) || !((RodVCF) d).isFiltered())) {
|
||||
dbsnp = (Variation) d;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return dbsnp;
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
Variation dbSNP = getFirstRealSNP(tracker.getReferenceMetaData( dbName, false ));
|
||||
String result = null;
|
||||
|
||||
if ( dbSNP != null ) nDBSNPs++; // count the number of real dbSNP events
|
||||
if ( eval != null && eval.isSNP() ) { // ignore indels right now
|
||||
nEvalObs++; // count the number of eval snps we've seen
|
||||
|
||||
if (dbSNP != null) { // both eval and dbSNP have real snps
|
||||
nSNPsAtdbSNPs++;
|
||||
|
||||
if (!discordantP(dbSNP, eval)) // count whether we're concordant or not with the dbSNP value
|
||||
nConcordant++;
|
||||
else
|
||||
result = String.format("Discordant [DBSNP %s] [EVAL %s]", dbSNP, eval);
|
||||
}
|
||||
}
|
||||
|
||||
// if ( dbSNP != null && dbSNP.isSNP() ) {
|
||||
// BrokenRODSimulator.attach("dbSNP");
|
||||
// rodDbSNP dbsnp = (rodDbSNP) BrokenRODSimulator.simulate_lookup("dbSNP", context.getLocation(), tracker);
|
||||
// if ( ! dbSNP.getRS_ID().equals(dbsnp.getRS_ID()) && dbsnp.isSNP() ) {
|
||||
// System.out.printf("Discordant site! %n%s%n vs.%n%s%n", dbSNP, dbsnp);
|
||||
// }
|
||||
// }
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("name %s", dbName));
|
||||
|
||||
s.add(String.format("n_db_snps %d", nDBSNPs()));
|
||||
s.add(String.format("n_eval_sites %d", nEvalSites()));
|
||||
s.add(String.format("n_overlapping_sites %d", nSNPsAtdbSNPs()));
|
||||
s.add(String.format("n_concordant %d", nConcordant()));
|
||||
s.add(String.format("n_novel_sites %d", nNovelSites()));
|
||||
|
||||
s.add(String.format("%s_rate %.2f # percent eval snps at dbsnp snps", dbName.toLowerCase(), 100 * dbSNPRate()));
|
||||
s.add(String.format("concordance_rate %.2f", 100 * concordanceRate()));
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,441 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* A robust and general purpose tool for characterizing the quality of SNPs, Indels, and other variants that includes basic
|
||||
* counting, ti/tv, dbSNP% (if -D is provided), concordance to chip or validation data, and will show interesting sites (-V)
|
||||
* that are FNs, FP, etc.
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="eval",type=ReferenceOrderedDatum.class)}) // right now we have no base variant class for rods, this should change
|
||||
//@Allows(value={DataSource.REFERENCE},referenceMetaData = {@RMD(name="eval",type=ReferenceOrderedDatum.class), @RMD(name="dbsnp",type=rodDbSNP.class),@RMD(name="hapmap-chip",type=ReferenceOrderedDatum.class), @RMD(name="interval",type=IntervalRod.class), @RMD(name="validation",type=RodGenotypeChipAsGFF.class)})
|
||||
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
||||
public class VariantEvalWalker extends RodWalker<Integer, Integer> {
|
||||
@Argument(shortName="minPhredConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false)
|
||||
public double minConfidenceScore = -1.0;
|
||||
|
||||
@Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
|
||||
public boolean printVariants = false;
|
||||
|
||||
@Argument(shortName="badHWEThreshold", doc="Only sites with deviations from Hardy-Weinberg equilibrium with P-values < than this threshold are flagged", required=false)
|
||||
public double badHWEThreshold = 1e-3;
|
||||
|
||||
@Argument(fullName="evalContainsGenotypes", shortName = "G", doc="If true, the input list of variants will be treated as a genotyping file, containing assertions of actual genotype values for a particular person. Analyses that only make sense on at the population level will be disabled, while those operating on genotypes will be enabled", required=false)
|
||||
public boolean evalContainsGenotypes = false;
|
||||
|
||||
@Argument(fullName="explode", shortName = "E", doc="Old style formatting, with each analysis split into separate files.", required=false)
|
||||
public boolean explode = false;
|
||||
|
||||
@Argument(fullName="includeViolations", shortName = "V", doc="If provided, violations will be written out along with summary information", required=false)
|
||||
public boolean mIncludeViolations = false;
|
||||
|
||||
@Argument(fullName="extensiveSubsets", shortName = "A", doc="If provided, output will be calculated over a lot of subsets, by default we only operate over all variants", required=false)
|
||||
public boolean extensiveSubsets = false;
|
||||
|
||||
@Argument(fullName="supressDateInformation", doc="This flag indicates that we want to suppress the date information from the output, so that if can be diff'ed against previous evals.", required=false)
|
||||
public boolean supressDateInformation = false;
|
||||
|
||||
@Argument(fullName="includeFilteredRecords", doc="If true, variation record with filter fields at are true will be included in the analysis", required=false)
|
||||
public boolean includeFilteredRecords = false;
|
||||
|
||||
|
||||
@Argument(fullName = "samplesFile", shortName="samples", doc="When running an analysis on one or more individuals with truth data, this field provides a filepath to the listing of which samples are used (and are used to name corresponding rods with -B)", required=false)
|
||||
public String samplesFile = null;
|
||||
|
||||
@Argument(fullName = "sampleName", shortName="sampleName", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
|
||||
public String sampleName = null;
|
||||
@Argument(fullName = "vcfInfoSelector", shortName="vcfInfoSelector", doc="When running an analysis on one or more individuals with truth data, provide this parameter to only analyze the genotype of this sample", required=false)
|
||||
public String vcfInfoSelector = null;
|
||||
|
||||
String analysisFilenameBase = null;
|
||||
|
||||
final String knownSNPDBName = "dbSNP";
|
||||
final String One1KGSNPNames = "1kg";
|
||||
final String genotypeChipName = "hapmap-chip";
|
||||
|
||||
HashMap<ANALYSIS_TYPE, ArrayList<VariantAnalysis>> analysisSets;
|
||||
|
||||
PrintStream perLocusStream = null;
|
||||
|
||||
long nMappedSites = 0;
|
||||
|
||||
// the types of analysis we support, and the string tags we associate with the enumerated value
|
||||
enum ANALYSIS_TYPE {
|
||||
// todo -- differeniate into three classes -- snps at known snp sites, snps not at known snp site but covered by known indel, and novel
|
||||
ALL_SNPS("all"),
|
||||
SINGLETON_SNPS("singletons"),
|
||||
TWOHIT_SNPS("2plus_hit"),
|
||||
KNOWN_SNPS("known"),
|
||||
SNPS_AT_NON_SNP_SITES("snp_at_known_non_snps"),
|
||||
NOVEL_SNPS("novel"),
|
||||
FILTERED_SNPS("filtered");
|
||||
|
||||
private final String value;
|
||||
ANALYSIS_TYPE(String value) { this.value = value;}
|
||||
|
||||
public String toString() { return value; }
|
||||
|
||||
}
|
||||
|
||||
// final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||
// ANALYSIS_TYPE.SINGLETON_SNPS,
|
||||
// ANALYSIS_TYPE.TWOHIT_SNPS,
|
||||
// ANALYSIS_TYPE.KNOWN_SNPS,
|
||||
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||
// ANALYSIS_TYPE.NOVEL_SNPS};
|
||||
// final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||
// ANALYSIS_TYPE.KNOWN_SNPS,
|
||||
// ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||
// ANALYSIS_TYPE.NOVEL_SNPS};
|
||||
|
||||
final ANALYSIS_TYPE[] POPULATION_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||
ANALYSIS_TYPE.SINGLETON_SNPS,
|
||||
ANALYSIS_TYPE.TWOHIT_SNPS,
|
||||
ANALYSIS_TYPE.KNOWN_SNPS,
|
||||
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||
ANALYSIS_TYPE.NOVEL_SNPS,
|
||||
ANALYSIS_TYPE.FILTERED_SNPS};
|
||||
final ANALYSIS_TYPE[] GENOTYPE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS,
|
||||
ANALYSIS_TYPE.KNOWN_SNPS,
|
||||
ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES,
|
||||
ANALYSIS_TYPE.NOVEL_SNPS,
|
||||
ANALYSIS_TYPE.FILTERED_SNPS};
|
||||
|
||||
final ANALYSIS_TYPE[] SIMPLE_ANALYSIS_NAMES = {ANALYSIS_TYPE.ALL_SNPS};
|
||||
ANALYSIS_TYPE[] ALL_ANALYSIS_NAMES = null;
|
||||
|
||||
public void initialize() {
|
||||
ALL_ANALYSIS_NAMES = SIMPLE_ANALYSIS_NAMES;
|
||||
if (extensiveSubsets)
|
||||
ALL_ANALYSIS_NAMES = evalContainsGenotypes ? GENOTYPE_ANALYSIS_NAMES : POPULATION_ANALYSIS_NAMES;
|
||||
|
||||
// setup the path to the analysis
|
||||
if (this.getToolkit().getArguments().outFileName != null) {
|
||||
analysisFilenameBase = this.getToolkit().getArguments().outFileName + "."; // + ".analysis.";
|
||||
}
|
||||
|
||||
analysisSets = new HashMap<ANALYSIS_TYPE, ArrayList<VariantAnalysis>>();
|
||||
for (ANALYSIS_TYPE setName : ALL_ANALYSIS_NAMES) {
|
||||
analysisSets.put(setName, initializeAnalysisSet(setName));
|
||||
}
|
||||
// THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
|
||||
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
|
||||
// whether masked by overlapping indels/other events or not.
|
||||
//TODO process correctly all the returned dbSNP rods at each location
|
||||
//BrokenRODSimulator.attach("dbSNP");
|
||||
}
|
||||
|
||||
|
||||
public long getNMappedSites() {
|
||||
return nMappedSites;
|
||||
}
|
||||
|
||||
private ArrayList<VariantAnalysis> getAnalysisSet(final ANALYSIS_TYPE name) {
|
||||
return analysisSets.containsKey(name) ? analysisSets.get(name) : null;
|
||||
}
|
||||
|
||||
private ArrayList<VariantAnalysis> initializeAnalysisSet(final ANALYSIS_TYPE setName) {
|
||||
ArrayList<VariantAnalysis> analyses = new ArrayList<VariantAnalysis>();
|
||||
|
||||
//
|
||||
// Add new analyses here!
|
||||
//
|
||||
analyses.add(new ValidationDataAnalysis());
|
||||
|
||||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage(knownSNPDBName));
|
||||
analyses.add(new VariantDBCoverage(One1KGSNPNames));
|
||||
|
||||
if ( samplesFile != null ) {
|
||||
//if ( numPeopleInPool < 1 )
|
||||
// analyses.add(new GenotypeConcordance(samplesFile, true));
|
||||
//else
|
||||
analyses.add(new PooledConcordance(samplesFile, true));
|
||||
} else {
|
||||
analyses.add(new GenotypeConcordance(genotypeChipName, false));
|
||||
}
|
||||
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
analyses.add(new ClusterCounterAnalysis());
|
||||
analyses.add(new CallableBasesAnalysis());
|
||||
analyses.add(new IndelMetricsAnalysis());
|
||||
|
||||
//
|
||||
// Filter out analyses inappropriate for our evaluation type Population or Genotype
|
||||
//
|
||||
Iterator<VariantAnalysis> iter = analyses.iterator();
|
||||
while (iter.hasNext()) {
|
||||
VariantAnalysis analysis = iter.next();
|
||||
VariantAnalysis removed = null;
|
||||
if ( evalContainsGenotypes ) {
|
||||
if ( ! (analysis instanceof GenotypeAnalysis ) ) {
|
||||
removed = analysis;
|
||||
iter.remove();
|
||||
}
|
||||
} else if ( ! (analysis instanceof PopulationAnalysis || analysis instanceof PoolAnalysis) ) {
|
||||
removed = analysis;
|
||||
iter.remove();
|
||||
// } else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
|
||||
} else if ( analysis instanceof PoolAnalysis && samplesFile == null ) {
|
||||
removed = analysis;
|
||||
iter.remove();
|
||||
}
|
||||
|
||||
if ( removed != null ) {
|
||||
logger.info(String.format("Disabling analysis %s in set %s", removed, setName));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (printVariants) analyses.add(new VariantMatcher(knownSNPDBName));
|
||||
|
||||
for (VariantAnalysis analysis : analyses) {
|
||||
initializeAnalysisOutputStream(setName, analysis);
|
||||
}
|
||||
|
||||
return analyses;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the filename of the analysis output file where output for an analysis with
|
||||
*
|
||||
* @param name
|
||||
* @param params
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String getAnalysisFilename(final String name, final List<String> params) {
|
||||
if (analysisFilenameBase == null)
|
||||
return null;
|
||||
else
|
||||
return analysisFilenameBase + Utils.join(".", Utils.cons(name, params));
|
||||
}
|
||||
|
||||
public void initializeAnalysisOutputStream(final ANALYSIS_TYPE setName, VariantAnalysis analysis) {
|
||||
final String filename = getAnalysisFilename(setName + "." + analysis.getName(), analysis.getParams());
|
||||
|
||||
try {
|
||||
if (perLocusStream == null)
|
||||
perLocusStream = filename == null ? out : new PrintStream(new File(analysisFilenameBase + "interesting_sites"));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
if (filename == null || !explode)
|
||||
analysis.initialize(this, out, perLocusStream, filename);
|
||||
else {
|
||||
File file = new File(filename);
|
||||
try {
|
||||
analysis.initialize(this, new PrintStream(new FileOutputStream(file)), perLocusStream, filename);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Couldn't open analysis output file " + filename, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
//System.out.printf("Tracker at %s is %s, ref is %s, skip is %d mapped is %d%n", context.getLocation(), tracker, ref, context.getSkippedBases(), nMappedSites);
|
||||
nMappedSites += context.getSkippedBases();
|
||||
|
||||
//System.out.printf("Tracker at %s is %s, ref is %s%n", context.getLocation(), tracker, ref);
|
||||
//if ( ref == null )
|
||||
// out.printf("Last position was %s: skipping %d bases%n",
|
||||
// context.getLocation(), context.getSkippedBases() );
|
||||
if ( ref == null ) { // we are seeing the last site
|
||||
return 0;
|
||||
}
|
||||
|
||||
nMappedSites++;
|
||||
|
||||
int nBoundGoodRods = tracker.getNBoundRodTracks("interval");
|
||||
if (nBoundGoodRods > 0) {
|
||||
// System.out.printf("%s: n = %d%n", context.getLocation(), nBoundGoodRods );
|
||||
|
||||
// Iterate over each analysis, and update it
|
||||
Variation eval = tracker.lookup("eval",Variation.class);
|
||||
Variation evalForFilter = null;
|
||||
|
||||
// ensure that the variation we're looking at is bi-allelic
|
||||
if ( eval != null && ! eval.isBiallelic() )
|
||||
eval = null;
|
||||
|
||||
if (eval != null)
|
||||
if (eval.getNegLog10PError() * 10.0 < minConfidenceScore) eval = null;
|
||||
|
||||
if ( eval != null && (eval instanceof RodVCF) ) {
|
||||
if ( sampleName != null ) {
|
||||
// code to grab a particular sample from a VCF
|
||||
Variation evalOld = eval;
|
||||
// System.out.printf("original is %s%n", evalOld);
|
||||
eval = fakeVCFForSample((RodVCF)eval, ref, sampleName);
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
// System.out.printf("sample is %s%n", eval);
|
||||
// System.out.printf("Replacing %s with %s%n", evalOld, eval);
|
||||
// System.out.printf("%n");
|
||||
} else {
|
||||
eval = null;
|
||||
}
|
||||
}
|
||||
|
||||
if ( vcfInfoSelector != null && eval != null ) {
|
||||
String[] keyValue = vcfInfoSelector.split("=");
|
||||
Map<String, String> map = ((RodVCF)eval).getRecord().getInfoValues();
|
||||
if ( map.containsKey(keyValue[0]) && ! Pattern.matches(keyValue[1], map.get(keyValue[0]) ) )
|
||||
eval = null;
|
||||
}
|
||||
|
||||
if ( eval != null && ((RodVCF)eval).mCurrentRecord.isFiltered() ) {
|
||||
evalForFilter = eval;
|
||||
if ( ! includeFilteredRecords ) {
|
||||
eval = null; // we are not including filtered records, so set eval to null
|
||||
}
|
||||
//System.out.printf("Rejecting filtered record %s%n", eval);
|
||||
}
|
||||
}
|
||||
|
||||
// update stats about all of the SNPs
|
||||
updateAnalysisSet(ANALYSIS_TYPE.ALL_SNPS, eval, tracker, ref.getBase(), context);
|
||||
|
||||
// update the known / novel set by checking whether the knownSNPDBName track has an entry here
|
||||
if (eval != null) {
|
||||
ANALYSIS_TYPE noveltySet = getNovelAnalysisType(tracker);
|
||||
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
|
||||
}
|
||||
|
||||
if (evalForFilter != null) {
|
||||
updateAnalysisSet(ANALYSIS_TYPE.FILTERED_SNPS, evalForFilter, tracker, ref.getBase(), context);
|
||||
}
|
||||
|
||||
|
||||
// are we a population backed call? then update
|
||||
if (eval instanceof SNPCallFromGenotypes) {
|
||||
SNPCallFromGenotypes call = (SNPCallFromGenotypes) eval;
|
||||
int nVarGenotypes = call.nHetGenotypes() + call.nHomVarGenotypes();
|
||||
//System.out.printf("%d variant genotypes at %s%n", nVarGenotypes, calls);
|
||||
final ANALYSIS_TYPE s = nVarGenotypes == 1 ? ANALYSIS_TYPE.SINGLETON_SNPS : ANALYSIS_TYPE.TWOHIT_SNPS;
|
||||
updateAnalysisSet(s, eval, tracker, ref.getBase(), context);
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
private RodVCF fakeVCFForSample(RodVCF eval, ReferenceContext ref, final String sampleName) {
|
||||
VCFGenotypeRecord genotype = (VCFGenotypeRecord)eval.getGenotype(sampleName);
|
||||
if ( genotype.getNegLog10PError() > 0 ) {
|
||||
VCFRecord record = new VCFRecord(Character.toString(ref.getBase()), ref.getLocus(), "GT");
|
||||
record.setAlternateBases(eval.getRecord().getAlternateAlleles());
|
||||
record.addGenotypeRecord(genotype);
|
||||
record.setQual(10*genotype.getNegLog10PError());
|
||||
record.setFilterString(eval.getFilterString());
|
||||
record.addInfoFields(eval.getInfoValues());
|
||||
return new RodVCF("fakeVCFForSample", record, eval.getReader());
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
|
||||
List<Object> dbsnpList = tracker.getReferenceMetaData("dbsnp");
|
||||
|
||||
if (dbsnpList.size() == 0)
|
||||
return ANALYSIS_TYPE.NOVEL_SNPS;
|
||||
|
||||
for (Object d : dbsnpList) {
|
||||
if (((rodDbSNP) d).isSNP()) {
|
||||
return ANALYSIS_TYPE.KNOWN_SNPS;
|
||||
}
|
||||
}
|
||||
|
||||
return ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
|
||||
|
||||
// old and busted way of doing this
|
||||
// Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
|
||||
//
|
||||
// ANALYSIS_TYPE noveltySet = null;
|
||||
// if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
|
||||
// else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
|
||||
// else { // if ( dbsnp.isIndel() )
|
||||
// noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
|
||||
}
|
||||
|
||||
public boolean includeViolations() { return mIncludeViolations; }
|
||||
|
||||
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
|
||||
RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
// Iterate over each analysis, and update it
|
||||
ArrayList<VariantAnalysis> set = getAnalysisSet(analysisSetName);
|
||||
if ( set != null ) {
|
||||
for ( VariantAnalysis analysis : set ) {
|
||||
String s = analysis.update(eval, tracker, ref, context);
|
||||
if ( s != null && includeViolations() ) {
|
||||
analysis.getCallPrintStream().println(getLineHeader(analysisSetName, "flagged", analysis.getName()) + s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return treeReduce(sum, value);
|
||||
}
|
||||
|
||||
public Integer treeReduce(Integer lhs, Integer rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
for (ANALYSIS_TYPE analysisSetName : ALL_ANALYSIS_NAMES) {
|
||||
printAnalysisSet(analysisSetName);
|
||||
}
|
||||
}
|
||||
|
||||
private String getLineHeader(final ANALYSIS_TYPE analysisSetName, final String keyword, final String analysis) {
|
||||
String s = Utils.join(",", Arrays.asList(analysisSetName, keyword, analysis));
|
||||
return s + Utils.dupString(' ', Math.max(50 - s.length(), 1));
|
||||
}
|
||||
|
||||
private void printAnalysisSet(final ANALYSIS_TYPE analysisSetName) {
|
||||
//out.printf("Writing analysis set %s", analysisSetName);
|
||||
Date now = new Date();
|
||||
for (VariantAnalysis analysis : getAnalysisSet(analysisSetName)) {
|
||||
String header = getLineHeader(analysisSetName, "summary", analysis.getName());
|
||||
analysis.finalize(getNMappedSites());
|
||||
PrintStream stream = analysis.getSummaryPrintStream();
|
||||
stream.printf("%s%s%n", header, Utils.dupString('-', 78));
|
||||
//stream.printf("%s Analysis set %s%n", analysisSetName, , analysisSetName);
|
||||
stream.printf("%sAnalysis name %s%n", header, analysis.getName());
|
||||
stream.printf("%sAnalysis params %s%n", header, Utils.join(" ", analysis.getParams()));
|
||||
stream.printf("%sAnalysis class %s%n", header, analysis.getClass().getName());
|
||||
if (!supressDateInformation) stream.printf("%sAnalysis time %s%n", header, now);
|
||||
for (String line : analysis.done()) {
|
||||
stream.printf("%s%s%n", header, line);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class VariantMatcher extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
String dbName;
|
||||
|
||||
public VariantMatcher(final String name) {
|
||||
super("variant_matches");
|
||||
dbName = name;
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
String r = null;
|
||||
|
||||
Variation db = tracker.lookup(dbName,Variation.class);
|
||||
|
||||
if ( eval != null || db != null ) {
|
||||
String matchFlag = " ";
|
||||
if ( eval != null && db != null ) matchFlag = "*** ";
|
||||
if ( eval == null && db != null ) matchFlag = ">>> ";
|
||||
if ( eval != null && db == null ) matchFlag = "<<< ";
|
||||
|
||||
r = String.format("%s %s: %s <=> EVAL: %s", dbName, matchFlag, db, eval);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
|
|
@ -4,7 +4,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.varianteval.ConcordanceTruthTable;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.poolseq.PowerBelowFrequencyWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
|
|||
|
|
@ -1,222 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VariantEvalPerformanceTest
|
||||
*
|
||||
* TODO: remove me Eric
|
||||
*/
|
||||
public class VariantEvalPerformanceTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void testEvalVariantROD() {
|
||||
HashMap<String, String> md5 = new HashMap<String, String>();
|
||||
md5.put("", "9cfda40f521d75a3e8bafc44a663c14a");
|
||||
md5.put("-A", "8fea7cc25f551ce170636fc35c5ae0fe");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
||||
*
|
||||
*/
|
||||
for ( Map.Entry<String, String> e : md5.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation " + e.getKey(),
|
||||
1, // just one output file
|
||||
Arrays.asList(e.getValue()));
|
||||
List<File> result = executeTest("testEvalVariantROD", spec).getFirst();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalVariantRODConfSix() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("11d636d105f902680c46b9f2e330d922");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
|
||||
* -minConfidenceScore 6
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" -minPhredConfidenceScore 60",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalVariantRODConfSixty", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalVariantRODOutputViolations() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("12ecd457d62329e9d4e593de904a457d");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
|
||||
* --includeViolations
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" --includeViolations",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalVariantRODOutputViolations", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalGenotypeROD() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("6ed44fd586c89dafd40cb8e0194dc456");
|
||||
/**
|
||||
* the above MD5 was calculated after running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --evalContainsGenotypes \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls \
|
||||
* --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
|
||||
*/
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" --evalContainsGenotypes" +
|
||||
" --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalGenotypeROD", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalMarksGenotypingExample() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("c0396cfe89a63948aebbbae0a0e06678");
|
||||
/**
|
||||
* Run with the following commands:
|
||||
*
|
||||
* java -Xmx2048m -jar /humgen/gsa-hphome1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar
|
||||
* -T VariantEval -R /broad/1KG/reference/human_b36_both.fasta -l INFO
|
||||
* -B eval,Variants,/humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls
|
||||
* -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
|
||||
* -G -L 1 -o /humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls.eval
|
||||
*/
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
|
||||
"-B eval,Variants," + validationDataLocation + "UMichVsBroad.venn.set1Only.calls " +
|
||||
"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff " +
|
||||
"-G " +
|
||||
"--supressDateInformation " +
|
||||
"-L 1:1-10,000,000 " +
|
||||
"--outerr %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalMarksGenotypingExample", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalRuntimeWithLotsOfIntervals() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("6a90341517fc3c5026529301d9970c7b");
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
|
||||
"-B eval,Variants," + validationDataLocation + "NA12878.pilot_3.all.geli.calls " +
|
||||
"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod " +
|
||||
"--supressDateInformation " +
|
||||
"-L /humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list " +
|
||||
"--outerr %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalRuntimeWithLotsOfIntervals", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCFVariantEvals() {
|
||||
HashMap<String, String> md5 = new HashMap<String, String>();
|
||||
md5.put("", "ee6b096169d6c5e2ce49d394fbec799b");
|
||||
md5.put("-A", "a443193c0810363f85278b1cfaed2fff");
|
||||
md5.put("-A --includeFilteredRecords", "812d7f2ecac28b1be7e7028af17df9c0");
|
||||
md5.put("-A --sampleName NA12878", "a443193c0810363f85278b1cfaed2fff");
|
||||
md5.put("-A -vcfInfoSelector AF=0.50", "afed4bf0c9f11b86f6e5356012f9cf2d");
|
||||
|
||||
for ( Map.Entry<String, String> e : md5.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,VCF," + validationDataLocation + "NA12878.example1.vcf" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff" +
|
||||
" -G" +
|
||||
" -L 1:1-10,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation " + e.getKey(),
|
||||
1, // just one output file
|
||||
Arrays.asList(e.getValue()));
|
||||
List<File> result = executeTest("testVCFVariantEvals", spec).getFirst();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,221 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VariantEvalWalkerTest
|
||||
* <p/>
|
||||
* test out the variant eval walker under different runtime conditions.
|
||||
*/
|
||||
public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
||||
|
||||
@Test
|
||||
public void testEvalVariantROD() {
|
||||
HashMap<String, String> md5 = new HashMap<String, String>();
|
||||
md5.put("", "9cfda40f521d75a3e8bafc44a663c14a");
|
||||
md5.put("-A", "8fea7cc25f551ce170636fc35c5ae0fe");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
||||
*
|
||||
*/
|
||||
for ( Map.Entry<String, String> e : md5.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation " + e.getKey(),
|
||||
1, // just one output file
|
||||
Arrays.asList(e.getValue()));
|
||||
List<File> result = executeTest("testEvalVariantROD", spec).getFirst();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalVariantRODConfSix() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("11d636d105f902680c46b9f2e330d922");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
|
||||
* -minConfidenceScore 6
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" -minPhredConfidenceScore 60",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalVariantRODConfSixty", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalVariantRODOutputViolations() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("12ecd457d62329e9d4e593de904a457d");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls \
|
||||
* --includeViolations
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" --includeViolations",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalVariantRODOutputViolations", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalGenotypeROD() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("6ed44fd586c89dafd40cb8e0194dc456");
|
||||
/**
|
||||
* the above MD5 was calculated after running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --evalContainsGenotypes \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls \
|
||||
* --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
|
||||
*/
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation" +
|
||||
" --evalContainsGenotypes" +
|
||||
" --rodBind hapmap-chip,GFF,/humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalGenotypeROD", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalMarksGenotypingExample() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("c0396cfe89a63948aebbbae0a0e06678");
|
||||
/**
|
||||
* Run with the following commands:
|
||||
*
|
||||
* java -Xmx2048m -jar /humgen/gsa-hphome1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar
|
||||
* -T VariantEval -R /broad/1KG/reference/human_b36_both.fasta -l INFO
|
||||
* -B eval,Variants,/humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls
|
||||
* -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff
|
||||
* -G -L 1 -o /humgen/gsa-scr1/ebanks/concordanceForMark/UMichVsBroad.venn.set1Only.calls.eval
|
||||
*/
|
||||
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
|
||||
"-B eval,Variants," + validationDataLocation + "UMichVsBroad.venn.set1Only.calls " +
|
||||
"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff " +
|
||||
"-G " +
|
||||
"--supressDateInformation " +
|
||||
"-L 1:1-10,000,000 " +
|
||||
"--outerr %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalMarksGenotypingExample", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEvalRuntimeWithLotsOfIntervals() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("6a90341517fc3c5026529301d9970c7b");
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T VariantEval -R " + oneKGLocation + "reference/human_b36_both.fasta " +
|
||||
"-B eval,Variants," + validationDataLocation + "NA12878.pilot_3.all.geli.calls " +
|
||||
"-D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod " +
|
||||
"--supressDateInformation " +
|
||||
"-L /humgen/gsa-scr1/GATK_Data/thousand_genomes_alpha_redesign.targets.b36.interval_list " +
|
||||
"--outerr %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testEvalRuntimeWithLotsOfIntervals", spec).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCFVariantEvals() {
|
||||
HashMap<String, String> md5 = new HashMap<String, String>();
|
||||
md5.put("", "ee6b096169d6c5e2ce49d394fbec799b");
|
||||
md5.put("-A", "a443193c0810363f85278b1cfaed2fff");
|
||||
md5.put("-A --includeFilteredRecords", "812d7f2ecac28b1be7e7028af17df9c0");
|
||||
md5.put("-A --sampleName NA12878", "a443193c0810363f85278b1cfaed2fff");
|
||||
md5.put("-A -vcfInfoSelector AF=0.50", "afed4bf0c9f11b86f6e5356012f9cf2d");
|
||||
|
||||
for ( Map.Entry<String, String> e : md5.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind eval,VCF," + validationDataLocation + "NA12878.example1.vcf" +
|
||||
" -T VariantEval" +
|
||||
" --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -hc /humgen/gsa-scr1/GATK_Data/1KG_gffs/NA12878.1kg.gff" +
|
||||
" -G" +
|
||||
" -L 1:1-10,000" +
|
||||
" --outerr %s" +
|
||||
" --supressDateInformation " + e.getKey(),
|
||||
1, // just one output file
|
||||
Arrays.asList(e.getValue()));
|
||||
List<File> result = executeTest("testVCFVariantEvals", spec).getFirst();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue