Critical bug fix for VariantEval dbSNP calculations. Moved the system over to the new improved ROD iterators, resulting in dbSNP rates jumping 5% or so, due to masking of true SNPs by preceding indels.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2274 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-07 03:36:38 +00:00
parent 8089aa3c50
commit 8f461d3c40
3 changed files with 60 additions and 115 deletions

View File

@ -1,8 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.BrokenRODSimulator;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.Variation;
@ -21,77 +20,21 @@ import java.util.List;
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
private String dbName;
private int nDBSNPs = 0;
private int nDBIndels = 0;
private int nDBElse = 0;
private int nEvalObs = 0;
private int nSNPsAtdbSNPs = 0;
private int nConcordant = 0;
private int nSNPsCalledAtIndels = 0;
public VariantDBCoverage(final String name) {
super("db_coverage");
dbName = name;
// THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
// whether masked by overlapping indels/other events or not.
//TODO process correctly all the returned dbSNP rods at each location
BrokenRODSimulator.attach(name);
}
public void inc(Variation dbSNP, Variation eval) {
boolean inDB = dbSNP != null;
boolean inEval = eval != null;
public int nDBSNPs() { return nDBSNPs; }
public int nEvalSites() { return nEvalObs; }
public int nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; }
public int nConcordant() { return nConcordant; }
public int nNovelSites() { return Math.abs(nEvalSites() - nSNPsAtdbSNPs()); }
if (inDB) {
if (dbSNP.isSNP()) nDBSNPs++;
else if (dbSNP.isIndel()) nDBIndels++;
else { nDBElse++; }
}
if (inEval) nEvalObs++;
if (inDB && inEval) {
if (dbSNP.isSNP()) { // changes the calculation a bit
nSNPsAtdbSNPs++;
if (!discordantP(dbSNP, eval))
nConcordant++;
}
if (dbSNP.isIndel() && eval.isSNP())
nSNPsCalledAtIndels++;
}
}
public int callCount = 0;
public int nDBSNPs() {
return nDBSNPs;
}
public int nDBIndels() {
return nDBIndels;
}
public int nEvalSites() {
return nEvalObs;
}
public int nSNPsAtdbSNPs() {
return nSNPsAtdbSNPs;
}
public int nConcordant() {
return nConcordant;
}
public int nNovelSites() {
return Math.abs(nEvalSites() - nSNPsAtdbSNPs());
}
public int nSNPsAtIndels() {
return nSNPsCalledAtIndels;
}
public boolean discordantP(Variation dbSNP, Variation eval) {
if (eval != null) {
char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : Utils.stringToChar(eval.getReference());
@ -107,7 +50,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
*
* @return
*/
public double fractionEvalSitesCoveredByDB() {
public double dbSNPRate() {
return nSNPsAtdbSNPs() / (1.0 * nEvalSites());
}
@ -116,59 +59,47 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
}
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
rodDbSNP dbSNP = rodDbSNP.getFirstRealSNP(tracker.getTrackData( dbName, null ));
String result = null;
// There are four cases here:
//TODO process correctly all the returned dbSNP rods at each location
Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup(dbName,context.getLocation(),tracker);
if (dbSNP != null) nDBSNPs++; // count the number of real dbSNP events
if ( eval != null && eval.isSNP() ) { // ignore indels right now
nEvalObs++; // count the number of eval snps we've seen
inc(dbsnp, eval);
if (dbSNP != null) { // both eval and dbSNP have real snps
nSNPsAtdbSNPs++;
if (dbsnp != null && eval != null) {
if (dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval)) {
return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval);
} else if (dbsnp.isIndel() && eval.isSNP()) {
return String.format("SNP-at-indel DBSNP=%s %s", Utils.join("",dbsnp.getAlleleList()), eval);
} else {
return null;
if (!discordantP(dbSNP, eval)) // count whether we're concordant or not with the dbSNP value
nConcordant++;
else
result = String.format("Discordant [DBSNP %s] [EVAL %s]", dbSNP, eval);
}
} else {
return null;
}
}
/**
* What fraction of the DB sites were discovered in the evalution calls?
*
* @return
*/
public double fractionDBSitesDiscoveredInEval() {
return nSNPsAtdbSNPs() / (1.0 * nDBSNPs());
if ( dbSNP != null && dbSNP.isSNP() ) {
BrokenRODSimulator.attach("dbSNP");
rodDbSNP dbsnp = (rodDbSNP) BrokenRODSimulator.simulate_lookup("dbSNP", context.getLocation(), tracker);
if ( ! dbSNP.getRS_ID().equals(dbsnp.getRS_ID()) && dbsnp.isSNP() ) {
System.out.printf("Discordant site! %n%s%n vs.%n%s%n", dbSNP, dbsnp);
}
}
return result;
}
public List<String> done() {
List<String> s = new ArrayList<String>();
//s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSNPs(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
s.add(String.format("name %s", dbName));
s.add(String.format("n_db_sites %d", nDBSNPs() + nDBIndels() + nDBElse));
s.add(String.format("n_db_snps %d", nDBSNPs()));
s.add(String.format("n_db_indels %d", nDBIndels()));
s.add(String.format("n_db_others %d", nDBElse));
s.add(String.format("n_eval_sites %d", nEvalSites()));
s.add(String.format("n_overlapping_sites %d", nSNPsAtdbSNPs()));
s.add(String.format("n_concordant %d", nConcordant()));
s.add(String.format("n_novel_sites %d", nNovelSites()));
s.add(String.format("dbsnp_rate %.2f # percent eval snps at dbsnp snps", 100 * fractionEvalSitesCoveredByDB()));
s.add(String.format("dbsnp_rate %.2f # percent eval snps at dbsnp snps", 100 * dbSNPRate()));
s.add(String.format("concordance_rate %.2f", 100 * concordanceRate()));
s.add(String.format("percent_db_sites_in_eval %.2f", 100 * fractionDBSitesDiscoveredInEval()));
s.add(String.format("n_snp_calls_at_indels %d", nSNPsAtIndels()));
s.add(String.format("percent_calls_at_indels %.2f", nSNPsAtIndels() / (0.01 * nEvalSites())));
return s;
}
}

View File

@ -113,7 +113,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
// whether masked by overlapping indels/other events or not.
//TODO process correctly all the returned dbSNP rods at each location
BrokenRODSimulator.attach("dbSNP");
//BrokenRODSimulator.attach("dbSNP");
}
@ -259,17 +259,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
// update the known / novel set by checking whether the knownSNPDBName track has an entry here
if (eval != null) {
Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
ANALYSIS_TYPE noveltySet = null;
if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
else { // if ( dbsnp.isIndel() )
noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
}
//else { ; } // lots of wierd things in there }
// // throw new StingException("Unexpected dbSNP record type: " + dbsnp); }
ANALYSIS_TYPE noveltySet = getNovelAnalysisType(tracker);
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
}
@ -286,6 +276,30 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
return 1;
}
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
RODRecordList<ReferenceOrderedDatum> dbsnpList = tracker.getTrackData("dbsnp", null);
if (dbsnpList == null)
return ANALYSIS_TYPE.NOVEL_SNPS;
for (ReferenceOrderedDatum d : dbsnpList) {
if (((rodDbSNP) d).isSNP()) {
return ANALYSIS_TYPE.KNOWN_SNPS;
}
}
return ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
// old and busted way of doing this
// Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
//
// ANALYSIS_TYPE noveltySet = null;
// if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
// else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
// else { // if ( dbsnp.isIndel() )
// noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
}
public boolean includeViolations() { return mIncludeViolations; }
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,

View File

@ -18,8 +18,8 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
@Test
public void testEvalVariantROD() {
HashMap<String, String> md5 = new HashMap<String, String>();
md5.put("", "b4adb7fed93005b3c28d72c3578ebc33");
md5.put("-A", "d34c1598a5b35098bd5524a52fbbc0fd");
md5.put("", "d6b8c2d6c37d42d1ca2288799a8bd8e4");
md5.put("-A", "b868aac194f6d0bd1fd2c0c63ddfaeab");
/**
* the above MD5 was calculated from running the following command:
@ -52,7 +52,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
@Test
public void testEvalVariantRODConfSix() {
List<String> md5 = new ArrayList<String>();
md5.add("4421a0b379577ea0fcde2c9866f9ca9f");
md5.add("85cfefcac2dfb06545792605a3043a52");
/**
* the above MD5 was calculated from running the following command:
@ -84,7 +84,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
@Test
public void testEvalVariantRODOutputViolations() {
List<String> md5 = new ArrayList<String>();
md5.add("e1ce229698725aa06c9b08b21fe628ef");
md5.add("e24732ffd95a78385a2c6986d1d3a359");
/**
* the above MD5 was calculated from running the following command:
@ -116,7 +116,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
@Test
public void testEvalGenotypeROD() {
List<String> md5 = new ArrayList<String>();
md5.add("fd95465965116e05e2e0356e2d105321");
md5.add("8b85148ec20f31481a30ceddd8645ba1");
/**
* the above MD5 was calculated after running the following command:
*
@ -150,7 +150,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
@Test
public void testEvalMarksGenotypingExample() {
List<String> md5 = new ArrayList<String>();
md5.add("58ef0487c9231aec7f356919e8cbd305");
md5.add("7d5a98c01051f96a684a383786da3d76");
/**
* Run with the following commands:
*