First incarnation of the population-based SNP analysis tool. Also includes bug fixes throughout the GATK.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@845 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-27 22:02:24 +00:00
parent a11bf0f43e
commit ce6a0f522b
10 changed files with 290 additions and 58 deletions

View File

@ -34,10 +34,10 @@ public class LocusContext {
* @param offsets
*/
public LocusContext(GenomeLoc loc, List<SAMRecord> reads, List<Integer> offsets) {
assert loc != null;
assert loc.getContig() != null;
assert reads != null;
assert offsets != null;
//assert loc != null;
//assert loc.getContig() != null;
//assert reads != null;
//assert offsets != null;
this.loc = loc;
this.reads = reads;

View File

@ -95,6 +95,12 @@ public interface AllelicVariant extends Comparable<ReferenceOrderedDatum> {
*/
double getMAF() ;
/** Returns heterozygosity, a more accessible general feature of a variant.
*
* @return
*/
double getHeterozygosity() ;
/** Is this variant an actual genotype (such as individual call from sequencing, HapMap chip etc), or
* population allelic variant (call from pooled sequencing, dbSNP site etc). Only if variant is a genotype, there
* is a meaningful question of, e.g., whether it is a het, or homogeneous non-ref.

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.refdata;
import java.util.*;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.apache.log4j.Logger;
/**
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
* chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0
* chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
*
*/
/**
 * ROD adapter for the pooled EM SNP caller's tabular output (column layout is
 * shown in the file-level comment above).  Each record is one locus; columns
 * are accessed by name through the TabularROD map interface.
 */
public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
    /** Below this confidence a site is treated as reference. */
    private static final double MIN_VARIANT_CONFIDENCE = 0.01;

    public PooledEMSNPROD(final String name) {
        super(name);
    }

    /** @return the reference base(s) at this site (the "ref" column). */
    public String getRefBasesFWD() { return this.get("ref"); }

    /** @return the first reference base as a char. */
    public char getRefSnpFWD() throws IllegalStateException { return getRefBasesFWD().charAt(0); }

    /** @return the alternate base(s) at this site (the "alt" column). */
    public String getAltBasesFWD() { return this.get("alt"); }

    /** @return the first alternate base as a char. */
    public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); }

    /** A site is "reference" when its variation confidence is essentially zero. */
    public boolean isReference() { return getVariationConfidence() < MIN_VARIANT_CONFIDENCE; }

    public boolean isSNP() { return ! isReference(); }

    // This caller emits only point substitutions, never indels.
    public boolean isInsertion() { return false; }
    public boolean isDeletion() { return false; }
    public boolean isIndel() { return false; }

    /** @return the EM-estimated alternate allele frequency ("EM_alt_freq" column). */
    public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); }

    /** @return expected heterozygosity 2p(1-p) under Hardy-Weinberg, computed from the MAF. */
    public double getHeterozygosity() {
        final double p = getMAF();  // hoisted so the column is parsed only once
        return 2 * p * (1 - p);
    }

    /** Pooled calls are population-level sites, not individual genotypes. */
    public boolean isGenotype() { return false; }

    /**
     * @return the discovery LOD scaled by 10 — converts log10 odds to a
     *         phred-like scale.  NOTE(review): confirm the 10x factor is intended.
     */
    public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; }

    /** No per-genotype consensus confidence exists for pooled calls. */
    public double getConsensusConfidence() { return -1; }

    /** Pooled data carries no individual genotypes. */
    public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }

    public int getPloidy() throws IllegalStateException { return 2; }

    /** Sites are modeled as biallelic: one ref and one alt allele. */
    public boolean isBiallelic() { return true; }
}

View File

@ -63,6 +63,7 @@ public class RefMetaDataTracker {
*/
/**
 * Maps a user-supplied ROD binding name to the canonical (case-insensitive)
 * form used as the key in {@code map}.
 *
 * @param name the binding name as given on the command line
 * @return the lower-cased canonical name
 */
private final String canonicalName(final String name)
{
    // Locale.ROOT keeps lookups stable regardless of the JVM default locale
    // (avoids the Turkish dotless-i problem with the bare toLowerCase()).
    return name.toLowerCase(java.util.Locale.ROOT);
}
@ -93,7 +94,7 @@ public class RefMetaDataTracker {
* @param rod
*/
/**
 * Binds {@code rod} under the canonical form of {@code name} at this locus.
 *
 * @param name binding name (matched case-insensitively)
 * @param rod  the reference-ordered datum to associate with the name
 */
public void bind(final String name, ReferenceOrderedDatum rod) {
    // Guard the eager String.format so we only pay for it when debug is on.
    if ( logger.isDebugEnabled() )
        logger.debug(String.format("Binding %s to %s%n", name, rod));
    map.put(canonicalName(name), rod);
}
}

View File

@ -64,6 +64,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
addModule("SAMPileup", rodSAMPileup.class);
addModule("RefSeq", rodRefSeq.class);
addModule("Table", TabularROD.class);
addModule("PooledEM", PooledEMSNPROD.class);
}
@ -80,24 +81,28 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
{
// Loop over triplets
for( String binding: bindings ) {
String[] bindingTokens = binding.split(",");
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));
if( bindingTokens.length != 3 )
System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings));
for( String bindingSets: bindings ) {
String[] bindingTokens = bindingSets.split(",");
if( bindingTokens.length % 3 != 0 )
Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings)));
final String name = bindingTokens[0];
final String typeName = bindingTokens[1];
final String fileName = bindingTokens[2];
for ( int bindingSet = 0; bindingSet < bindingTokens.length; bindingSet += 3 ) {
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
final String name = bindingTokens[bindingSet];
final String typeName = bindingTokens[bindingSet + 1];
final String fileName = bindingTokens[bindingSet + 2];
// check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
rods.add(rod);
// check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
rods.add(rod);
}
}
}

View File

@ -52,7 +52,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*((HEADER)|(loc)).*");
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
/**

View File

@ -175,6 +175,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
func = parts[15];
locType = parts[16];
weight = Integer.parseInt(parts[17]);
//System.out.printf("Parsed %s%n", toString());
return true;
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
@ -196,7 +197,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
@Override
public double getConsensusConfidence() {
// TODO Auto-generated method stub
return 0;
return Double.MAX_VALUE;
}
@Override
@ -205,12 +206,16 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
return null;
}
@Override
public double getMAF() {
// Fixme: update to actually get MAF
return avHet;
//return avHet;
return -1;
}
/** @return the dbSNP-reported average heterozygosity (avHet) for this site. */
@Override
public double getHeterozygosity() {
    // avHet is parsed directly from the dbSNP table row in parseLine.
    return avHet;
}
@Override
public int getPloidy() throws IllegalStateException {
// TODO Auto-generated method stub
@ -220,7 +225,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
@Override
public double getVariationConfidence() {
// TODO Auto-generated method stub
return 100;
return Double.MAX_VALUE;
}
@Override

View File

@ -0,0 +1,55 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import java.util.ArrayList;
/**
 * A fixed-width histogram over the range [minValue, maxValue], divided into
 * nBins equal bins, each holding an arbitrary value of type T.
 *
 * Created by IntelliJ IDEA.  User: depristo  Date: May 27, 2009
 */
public class Histogram<T> {
    ArrayList<T> data;            // one entry per bin
    int nBins = 100;              // number of bins
    double minValue, maxValue;    // inclusive range covered by the histogram

    /**
     * @param nBins        number of equal-width bins
     * @param minValue     low edge of the histogrammed range
     * @param maxValue     high edge of the histogrammed range
     * @param initialValue value every bin starts with
     */
    public Histogram(int nBins, double minValue, double maxValue, T initialValue) {
        data = new ArrayList<T>(nBins);
        this.nBins = nBins;
        this.minValue = minValue;
        this.maxValue = maxValue;
        initialize(initialValue);
    }

    /** Resets every bin to initialValue. */
    public void initialize(T initialValue) {
        data.clear();
        for ( int i = 0; i < nBins; i++ ) {
            data.add(i, initialValue);
        }
    }

    /** Sets bin i to value. */
    public void setBin(int i, T value) {
        data.set(i, value);
    }

    /** Sets the bin containing x to value. */
    public void setBin(double x, T value) {
        setBin(x2bin(x), value);
    }

    /** @return the contents of bin binI. */
    public T getBin(int binI) {
        return data.get(binI);
    }

    /** @return the contents of the bin containing x. */
    public T getBin(double x) {
        return getBin(x2bin(x));
    }

    /**
     * Maps a value x in [minValue, maxValue] to its bin index.
     *
     * Bug fix: the original divided by ((maxValue - minValue) * nBins), which
     * sent essentially every in-range value to bin 0.  The correct mapping
     * scales the normalized position by nBins.  The result is clamped to
     * [0, nBins - 1] so x == maxValue (or small numeric overshoot) lands in
     * the last bin instead of indexing out of bounds.
     *
     * @param x a value in [minValue, maxValue]
     * @return the bin index, in [0, nBins - 1]
     */
    public int x2bin(double x) {
        int bin = (int)Math.floor((x - minValue) / (maxValue - minValue) * nBins);
        return Math.max(0, Math.min(nBins - 1, bin));
    }

    /** @return the midpoint x-value of the given bin (inverse of x2bin). */
    public double bin2x(int bin) {
        return (maxValue - minValue) * ((bin + 0.5) / nBins) + minValue;
    }
}

View File

@ -0,0 +1,63 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
/**
 * Tracks, site by site, how often an evaluation call set and a variant
 * database (e.g. dbSNP) each cover a locus, and how often they overlap.
 */
public class VariantDBCoverage {
    private String dbName;        // display name of the database (e.g. "dbSNP")
    private int nDBObs = 0;       // sites where the DB had a record
    private int nEvalObs = 0;     // sites where the eval set had a record
    private int nOverlapping = 0; // sites present in both

    public VariantDBCoverage(final String name) {
        dbName = name;
    }

    /**
     * Records one locus.
     *
     * @param inDB   true if the database covers this site
     * @param inEval true if the evaluation calls cover this site
     */
    public void inc(boolean inDB, boolean inEval) {
        if (inDB) nDBObs++;
        if (inEval) nEvalObs++;
        if (inDB && inEval) nOverlapping++;
    }

    /** @return number of sites covered by the database. */
    public int nDBSites() {
        return nDBObs;
    }

    /** @return number of sites covered by the evaluation calls. */
    public int nEvalSites() {
        return nEvalObs;
    }

    /** @return number of sites covered by both. */
    public int nOverlappingSites() {
        return nOverlapping;
    }

    /**
     * What fraction of the evaluated site variants were also found in the db?
     *
     * @return the fraction, or 0.0 when no eval sites were seen (avoids NaN)
     */
    public double fractionEvalSitesCoveredByDB() {
        return nEvalSites() == 0 ? 0.0 : nOverlappingSites() / (1.0 * nEvalSites());
    }

    /**
     * What fraction of the DB sites were discovered in the evaluation calls?
     *
     * @return the fraction, or 0.0 when no db sites were seen (avoids NaN)
     */
    public double fractionDBSitesDiscoveredInEval() {
        return nDBSites() == 0 ? 0.0 : nOverlappingSites() / (1.0 * nDBSites());
    }

    /** One-line tab-separated summary, prefixed with prefix. */
    public String toSingleLineString(final String prefix) {
        return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval());
    }

    /** Multi-line summary; percentages are the fractions scaled by 100. */
    public String toMultiLineString(final String prefix) {
        StringBuilder s = new StringBuilder();
        s.append(String.format("%s name %s%n", prefix, dbName));
        s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites()));
        s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites()));
        s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites()));
        s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB()));
        s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval()));
        return s.toString();
    }
}

View File

@ -2,10 +2,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumMap;
@ -20,49 +20,75 @@ import java.util.EnumMap;
@Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE)
public class VariantEvalWalker extends RefWalker<Integer, Integer> {
int N_TRANSITION_TRANVERSION_BINS = 100;
EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
@Argument(shortName="minDiscoveryQ", doc="Phred-scaled minimum LOD to consider an evaluation SNP a variant", required=false)
public int minDiscoveryQ = -1;
int nBasesCovered = 0;
VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
int N_TRANSITION_TRANVERSION_BINS = 20;
Histogram<Integer> transitions;
Histogram<Integer> transversions;
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
public void initialize() {
transition_transversion_counts = new EnumMap[N_TRANSITION_TRANVERSION_BINS];
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
transition_transversion_counts[i] = new EnumMap<BaseUtils.BaseSubstitutionType, Integer>(BaseUtils.BaseSubstitutionType.class);
for ( BaseUtils.BaseSubstitutionType t : BaseUtils.BaseSubstitutionType.values() ) {
transition_transversion_counts[i].put(t, 0);
}
}
//transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
//transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
}
private int transition_transversion_bin(double MAF) {
return (int)Math.floor(MAF * N_TRANSITION_TRANVERSION_BINS);
}
//private int transition_transversion_bin(double het) {
// return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS);
//}
private double transition_transversion_bin2MAF(int bin) {
return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
}
//private double transition_transversion_bin2het(int bin) {
// return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
//}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
nBasesCovered++;
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
if ( dbsnp != null && dbsnp.isSNP() ) {
char refBase = dbsnp.getRefSnpFWD();
char altBase = dbsnp.getAltSnpFWD();
//System.out.printf("%c %c%n", refBase, altBase);
int i = transition_transversion_bin(dbsnp.getMAF());
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
//if ( eval != null || dbsnp != null )
// System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
bin.put(subType, bin.get(subType) + 1);
int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
out.printf("%s %d %d %.2f %s%n", subType, sit, ver, (float)sit/ver, dbsnp.toString());
if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
//System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
updateTransitionTransversion(eval, ref, context);
//updateHapMapRate(dbsnp, eval, ref, context);
}
updateVariantDBCoverage(dbsnp, eval, ref, context);
return 1;
}
/**
 * Increments the transition or transversion histogram bin indexed by the
 * variant's heterozygosity.
 *
 * @param variant the SNP being tallied (the caller passes the eval call;
 *                the former parameter name "dbsnp" was misleading)
 * @param ref     the reference base at this locus (currently unused)
 * @param context the locus context (currently unused)
 */
private void updateTransitionTransversion(AllelicVariant variant, char ref, LocusContext context) {
    char refBase = variant.getRefSnpFWD();
    char altBase = variant.getAltSnpFWD();
    BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
    // Pick the histogram for this substitution class, then bump the count
    // in the bin selected by the site's heterozygosity.
    Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
    double het = variant.getHeterozygosity();
    h.setBin(het, h.getBin(het) + 1);
}
/**
 * Tallies this locus into the dbSNP-vs-eval coverage statistics.  A side is
 * counted as "covered" when the tracker returned any record at all (null
 * check only — isSNP() is deliberately not consulted here).
 *
 * @param dbsnp   the dbSNP record at this locus, or null if none
 * @param eval    the evaluation call at this locus, or null if none
 * @param ref     the reference base (unused)
 * @param context the locus context (unused)
 */
private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) {
    // There are four cases here:
    // (both), (db only), (eval only), (neither) — inc() bumps the
    // appropriate counters for each combination.
    dbSNPStats.inc(dbsnp != null, eval != null);
}
// Given result of map function
public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) {
@ -73,13 +99,36 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
public void onTraversalDone(Integer result) {
int nTransitions = 0;
int nTransversions = 0;
StringBuilder s = new StringBuilder();
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
double averageMAF = transition_transversion_bin2MAF(i);
EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
//double avHet = Math.pow(10, transitions.bin2x(i));
double avHet = transitions.bin2x(i);
if ( avHet > 0.5 ) break;
int sit = transitions.getBin(i);
int ver = transversions.getBin(i);
nTransitions += sit;
nTransversions += ver;
double ratio = (float)sit/ver;
out.printf("%.2f %d %d %f%n", averageMAF, sit, ver, ratio);
s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio));
}
long nSites = nTransitions + nTransversions;
out.printf("# n bases covered: %d%n", nBasesCovered);
out.printf("# sites: %d%n", nSites);
out.printf("# variant rate: %.5f confident variants per base%n", nSites / (1.0 * nBasesCovered));
out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSites);
out.printf("# transitions: %d%n", nTransitions);
out.printf("# transversions: %d%n", nTransversions);
out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions));
// dbSNP stats
out.println(dbSNPStats.toSingleLineString("#"));
out.print(dbSNPStats.toMultiLineString("#"));
out.print(s);
}
}
}