Critical bug fix for VariantEval dbSNP calculations. Moved the system over to the new improved ROD iterators, resulting in dbSNP rates jumping 5% or so, due to masking of true SNPs by preceding indels.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2274 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8089aa3c50
commit
8f461d3c40
|
|
@ -1,8 +1,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.BrokenRODSimulator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
|
|
@ -21,77 +20,21 @@ import java.util.List;
|
|||
public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
private String dbName;
|
||||
private int nDBSNPs = 0;
|
||||
private int nDBIndels = 0;
|
||||
private int nDBElse = 0;
|
||||
private int nEvalObs = 0;
|
||||
private int nSNPsAtdbSNPs = 0;
|
||||
private int nConcordant = 0;
|
||||
private int nSNPsCalledAtIndels = 0;
|
||||
|
||||
|
||||
public VariantDBCoverage(final String name) {
|
||||
super("db_coverage");
|
||||
dbName = name;
|
||||
// THIS IS A HACK required in order to reproduce the behavior of old (and imperfect) RODIterator and
|
||||
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
|
||||
// whether masked by overlapping indels/other events or not.
|
||||
//TODO process correctly all the returned dbSNP rods at each location
|
||||
BrokenRODSimulator.attach(name);
|
||||
}
|
||||
|
||||
public void inc(Variation dbSNP, Variation eval) {
|
||||
boolean inDB = dbSNP != null;
|
||||
boolean inEval = eval != null;
|
||||
public int nDBSNPs() { return nDBSNPs; }
|
||||
public int nEvalSites() { return nEvalObs; }
|
||||
public int nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; }
|
||||
public int nConcordant() { return nConcordant; }
|
||||
public int nNovelSites() { return Math.abs(nEvalSites() - nSNPsAtdbSNPs()); }
|
||||
|
||||
if (inDB) {
|
||||
if (dbSNP.isSNP()) nDBSNPs++;
|
||||
else if (dbSNP.isIndel()) nDBIndels++;
|
||||
else { nDBElse++; }
|
||||
}
|
||||
|
||||
if (inEval) nEvalObs++;
|
||||
if (inDB && inEval) {
|
||||
if (dbSNP.isSNP()) { // changes the calculation a bit
|
||||
nSNPsAtdbSNPs++;
|
||||
|
||||
if (!discordantP(dbSNP, eval))
|
||||
nConcordant++;
|
||||
}
|
||||
|
||||
if (dbSNP.isIndel() && eval.isSNP())
|
||||
nSNPsCalledAtIndels++;
|
||||
}
|
||||
}
|
||||
|
||||
public int callCount = 0;
|
||||
|
||||
public int nDBSNPs() {
|
||||
return nDBSNPs;
|
||||
}
|
||||
|
||||
public int nDBIndels() {
|
||||
return nDBIndels;
|
||||
}
|
||||
|
||||
public int nEvalSites() {
|
||||
return nEvalObs;
|
||||
}
|
||||
|
||||
public int nSNPsAtdbSNPs() {
|
||||
return nSNPsAtdbSNPs;
|
||||
}
|
||||
|
||||
public int nConcordant() {
|
||||
return nConcordant;
|
||||
}
|
||||
|
||||
public int nNovelSites() {
|
||||
return Math.abs(nEvalSites() - nSNPsAtdbSNPs());
|
||||
}
|
||||
|
||||
public int nSNPsAtIndels() {
|
||||
return nSNPsCalledAtIndels;
|
||||
}
|
||||
public boolean discordantP(Variation dbSNP, Variation eval) {
|
||||
if (eval != null) {
|
||||
char alt = (eval.isSNP()) ? eval.getAlternativeBaseForSNP() : Utils.stringToChar(eval.getReference());
|
||||
|
|
@ -107,7 +50,7 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
public double fractionEvalSitesCoveredByDB() {
|
||||
public double dbSNPRate() {
|
||||
return nSNPsAtdbSNPs() / (1.0 * nEvalSites());
|
||||
}
|
||||
|
||||
|
|
@ -116,59 +59,47 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
|
|||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
rodDbSNP dbSNP = rodDbSNP.getFirstRealSNP(tracker.getTrackData( dbName, null ));
|
||||
String result = null;
|
||||
|
||||
// There are four cases here:
|
||||
//TODO process correctly all the returned dbSNP rods at each location
|
||||
Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup(dbName,context.getLocation(),tracker);
|
||||
if (dbSNP != null) nDBSNPs++; // count the number of real dbSNP events
|
||||
if ( eval != null && eval.isSNP() ) { // ignore indels right now
|
||||
nEvalObs++; // count the number of eval snps we've seen
|
||||
|
||||
inc(dbsnp, eval);
|
||||
if (dbSNP != null) { // both eval and dbSNP have real snps
|
||||
nSNPsAtdbSNPs++;
|
||||
|
||||
if (dbsnp != null && eval != null) {
|
||||
|
||||
if (dbsnp.isSNP() && eval.isSNP() && discordantP(dbsnp, eval)) {
|
||||
return String.format("Discordant [DBSNP %s] [EVAL %s]", dbsnp, eval);
|
||||
} else if (dbsnp.isIndel() && eval.isSNP()) {
|
||||
return String.format("SNP-at-indel DBSNP=%s %s", Utils.join("",dbsnp.getAlleleList()), eval);
|
||||
} else {
|
||||
return null;
|
||||
if (!discordantP(dbSNP, eval)) // count whether we're concordant or not with the dbSNP value
|
||||
nConcordant++;
|
||||
else
|
||||
result = String.format("Discordant [DBSNP %s] [EVAL %s]", dbSNP, eval);
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* What fraction of the DB sites were discovered in the evalution calls?
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double fractionDBSitesDiscoveredInEval() {
|
||||
return nSNPsAtdbSNPs() / (1.0 * nDBSNPs());
|
||||
if ( dbSNP != null && dbSNP.isSNP() ) {
|
||||
BrokenRODSimulator.attach("dbSNP");
|
||||
rodDbSNP dbsnp = (rodDbSNP) BrokenRODSimulator.simulate_lookup("dbSNP", context.getLocation(), tracker);
|
||||
if ( ! dbSNP.getRS_ID().equals(dbsnp.getRS_ID()) && dbsnp.isSNP() ) {
|
||||
System.out.printf("Discordant site! %n%s%n vs.%n%s%n", dbSNP, dbsnp);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
//s.add(String.format("%d\t%d\t%d\t%.2f\t%.2f", nDBSNPs(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval()));
|
||||
s.add(String.format("name %s", dbName));
|
||||
|
||||
s.add(String.format("n_db_sites %d", nDBSNPs() + nDBIndels() + nDBElse));
|
||||
s.add(String.format("n_db_snps %d", nDBSNPs()));
|
||||
s.add(String.format("n_db_indels %d", nDBIndels()));
|
||||
s.add(String.format("n_db_others %d", nDBElse));
|
||||
s.add(String.format("n_eval_sites %d", nEvalSites()));
|
||||
s.add(String.format("n_overlapping_sites %d", nSNPsAtdbSNPs()));
|
||||
s.add(String.format("n_concordant %d", nConcordant()));
|
||||
s.add(String.format("n_novel_sites %d", nNovelSites()));
|
||||
|
||||
|
||||
s.add(String.format("dbsnp_rate %.2f # percent eval snps at dbsnp snps", 100 * fractionEvalSitesCoveredByDB()));
|
||||
s.add(String.format("dbsnp_rate %.2f # percent eval snps at dbsnp snps", 100 * dbSNPRate()));
|
||||
s.add(String.format("concordance_rate %.2f", 100 * concordanceRate()));
|
||||
|
||||
s.add(String.format("percent_db_sites_in_eval %.2f", 100 * fractionDBSitesDiscoveredInEval()));
|
||||
|
||||
s.add(String.format("n_snp_calls_at_indels %d", nSNPsAtIndels()));
|
||||
s.add(String.format("percent_calls_at_indels %.2f", nSNPsAtIndels() / (0.01 * nEvalSites())));
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -113,7 +113,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
// hence to pass the integration test. The new iterator this code is now using does see ALL the SNPs,
|
||||
// whether masked by overlapping indels/other events or not.
|
||||
//TODO process correctly all the returned dbSNP rods at each location
|
||||
BrokenRODSimulator.attach("dbSNP");
|
||||
//BrokenRODSimulator.attach("dbSNP");
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -259,17 +259,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
|
||||
// update the known / novel set by checking whether the knownSNPDBName track has an entry here
|
||||
if (eval != null) {
|
||||
Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
|
||||
|
||||
ANALYSIS_TYPE noveltySet = null;
|
||||
if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
|
||||
else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
|
||||
else { // if ( dbsnp.isIndel() )
|
||||
noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
|
||||
}
|
||||
//else { ; } // lots of wierd things in there }
|
||||
// // throw new StingException("Unexpected dbSNP record type: " + dbsnp); }
|
||||
|
||||
ANALYSIS_TYPE noveltySet = getNovelAnalysisType(tracker);
|
||||
updateAnalysisSet(noveltySet, eval, tracker, ref.getBase(), context);
|
||||
}
|
||||
|
||||
|
|
@ -286,6 +276,30 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
return 1;
|
||||
}
|
||||
|
||||
private ANALYSIS_TYPE getNovelAnalysisType(RefMetaDataTracker tracker) {
|
||||
RODRecordList<ReferenceOrderedDatum> dbsnpList = tracker.getTrackData("dbsnp", null);
|
||||
|
||||
if (dbsnpList == null)
|
||||
return ANALYSIS_TYPE.NOVEL_SNPS;
|
||||
|
||||
for (ReferenceOrderedDatum d : dbsnpList) {
|
||||
if (((rodDbSNP) d).isSNP()) {
|
||||
return ANALYSIS_TYPE.KNOWN_SNPS;
|
||||
}
|
||||
}
|
||||
|
||||
return ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
|
||||
|
||||
// old and busted way of doing this
|
||||
// Variation dbsnp = (Variation) BrokenRODSimulator.simulate_lookup("dbSNP", ref.getLocus(), tracker);
|
||||
//
|
||||
// ANALYSIS_TYPE noveltySet = null;
|
||||
// if ( dbsnp == null ) noveltySet = ANALYSIS_TYPE.NOVEL_SNPS;
|
||||
// else if ( dbsnp.isSNP() ) noveltySet = ANALYSIS_TYPE.KNOWN_SNPS;
|
||||
// else { // if ( dbsnp.isIndel() )
|
||||
// noveltySet = ANALYSIS_TYPE.SNPS_AT_NON_SNP_SITES;
|
||||
}
|
||||
|
||||
public boolean includeViolations() { return mIncludeViolations; }
|
||||
|
||||
public void updateAnalysisSet(final ANALYSIS_TYPE analysisSetName, Variation eval,
|
||||
|
|
|
|||
|
|
@ -18,8 +18,8 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testEvalVariantROD() {
|
||||
HashMap<String, String> md5 = new HashMap<String, String>();
|
||||
md5.put("", "b4adb7fed93005b3c28d72c3578ebc33");
|
||||
md5.put("-A", "d34c1598a5b35098bd5524a52fbbc0fd");
|
||||
md5.put("", "d6b8c2d6c37d42d1ca2288799a8bd8e4");
|
||||
md5.put("-A", "b868aac194f6d0bd1fd2c0c63ddfaeab");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
|
|
@ -52,7 +52,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testEvalVariantRODConfSix() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("4421a0b379577ea0fcde2c9866f9ca9f");
|
||||
md5.add("85cfefcac2dfb06545792605a3043a52");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
|
|
@ -84,7 +84,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testEvalVariantRODOutputViolations() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("e1ce229698725aa06c9b08b21fe628ef");
|
||||
md5.add("e24732ffd95a78385a2c6986d1d3a359");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
|
|
@ -116,7 +116,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testEvalGenotypeROD() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("fd95465965116e05e2e0356e2d105321");
|
||||
md5.add("8b85148ec20f31481a30ceddd8645ba1");
|
||||
/**
|
||||
* the above MD5 was calculated after running the following command:
|
||||
*
|
||||
|
|
@ -150,7 +150,7 @@ public class VariantEvalWalkerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testEvalMarksGenotypingExample() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("58ef0487c9231aec7f356919e8cbd305");
|
||||
md5.add("7d5a98c01051f96a684a383786da3d76");
|
||||
/**
|
||||
* Run with the following commands:
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue