First incarnation of the population-based SNP analysis tool. Also bug fixes throughout the GATK

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@845 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-27 22:02:24 +00:00
parent a11bf0f43e
commit ce6a0f522b
10 changed files with 290 additions and 58 deletions

View File

@ -34,10 +34,10 @@ public class LocusContext {
* @param offsets * @param offsets
*/ */
public LocusContext(GenomeLoc loc, List<SAMRecord> reads, List<Integer> offsets) { public LocusContext(GenomeLoc loc, List<SAMRecord> reads, List<Integer> offsets) {
assert loc != null; //assert loc != null;
assert loc.getContig() != null; //assert loc.getContig() != null;
assert reads != null; //assert reads != null;
assert offsets != null; //assert offsets != null;
this.loc = loc; this.loc = loc;
this.reads = reads; this.reads = reads;

View File

@ -95,6 +95,12 @@ public interface AllelicVariant extends Comparable<ReferenceOrderedDatum> {
*/ */
double getMAF() ; double getMAF() ;
/** Returns heterozygosity, a more accessible general feature of a variant.
*
* @return
*/
double getHeterozygosity() ;
/** Is this variant an actual genotype (such as individual call from sequencing, HapMap chip etc), or /** Is this variant an actual genotype (such as individual call from sequencing, HapMap chip etc), or
* population allelic variant (call from pooled sequencing, dbSNP site etc). Only if variant is a genotype, there * population allelic variant (call from pooled sequencing, dbSNP site etc). Only if variant is a genotype, there
* is a meaningful question of, e.g., whether it is a het, or homogeneous non-ref. * is a meaningful question of, e.g., whether it is a het, or homogeneous non-ref.

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.refdata;
import java.util.*;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.apache.log4j.Logger;
/**
* loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom
* chr1:1104840 A N 0.000000 -85.341265 -85.341265 0.000000 0.000000 324.000000 162 0 0
* chr1:1104841 C N 0.000000 -69.937928 -69.937928 0.000000 0.000000 324.000000 162 0 0
* chr1:1104842 A N 0.000000 -84.816002 -84.816002 0.000000 0.000000 324.000000 162 0 0
*
*/
public class PooledEMSNPROD extends TabularROD implements AllelicVariant {
public PooledEMSNPROD(final String name) {
super(name);
}
//GenomeLoc getLocation();
public String getRefBasesFWD() { return this.get("ref"); }
public char getRefSnpFWD() throws IllegalStateException { return getRefBasesFWD().charAt(0); }
public String getAltBasesFWD() { return this.get("alt"); }
public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); }
public boolean isReference() { return getVariationConfidence() < 0.01; }
public boolean isSNP() { return ! isReference(); }
public boolean isInsertion() { return false; }
public boolean isDeletion() { return false; }
public boolean isIndel() { return false; }
public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); }
public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); }
public boolean isGenotype() { return false; }
public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; }
public double getConsensusConfidence() { return -1; }
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
public int getPloidy() throws IllegalStateException { return 2; }
public boolean isBiallelic() { return true; }
}

View File

@ -63,6 +63,7 @@ public class RefMetaDataTracker {
*/ */
private final String canonicalName(final String name) private final String canonicalName(final String name)
{ {
//return name; // .toLowerCase();
return name.toLowerCase(); return name.toLowerCase();
} }
@ -93,7 +94,7 @@ public class RefMetaDataTracker {
* @param rod * @param rod
*/ */
public void bind(final String name, ReferenceOrderedDatum rod) { public void bind(final String name, ReferenceOrderedDatum rod) {
//logger.debug(String.format("Binding %s to %s%n", name, rod)); logger.debug(String.format("Binding %s to %s%n", name, rod));
map.put(canonicalName(name), rod); map.put(canonicalName(name), rod);
} }
} }

View File

@ -64,6 +64,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
addModule("SAMPileup", rodSAMPileup.class); addModule("SAMPileup", rodSAMPileup.class);
addModule("RefSeq", rodRefSeq.class); addModule("RefSeq", rodRefSeq.class);
addModule("Table", TabularROD.class); addModule("Table", TabularROD.class);
addModule("PooledEM", PooledEMSNPROD.class);
} }
@ -80,24 +81,28 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods) public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
{ {
// Loop over triplets // Loop over triplets
for( String binding: bindings ) { System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings));
String[] bindingTokens = binding.split(","); for( String bindingSets: bindings ) {
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens)); String[] bindingTokens = bindingSets.split(",");
if( bindingTokens.length != 3 ) if( bindingTokens.length % 3 != 0 )
Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings))); Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings)));
final String name = bindingTokens[0]; for ( int bindingSet = 0; bindingSet < bindingTokens.length; bindingSet += 3 ) {
final String typeName = bindingTokens[1]; logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));
final String fileName = bindingTokens[2];
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName); final String name = bindingTokens[bindingSet];
final String typeName = bindingTokens[bindingSet + 1];
final String fileName = bindingTokens[bindingSet + 2];
// check that we're not generating duplicate bindings ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
rods.add(rod); // check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
rods.add(rod);
}
} }
} }

View File

@ -52,7 +52,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX; public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000; private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*"); private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*((HEADER)|(loc)).*");
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
/** /**

View File

@ -175,6 +175,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
func = parts[15]; func = parts[15];
locType = parts[16]; locType = parts[16];
weight = Integer.parseInt(parts[17]); weight = Integer.parseInt(parts[17]);
//System.out.printf("Parsed %s%n", toString());
return true; return true;
} catch ( RuntimeException e ) { } catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
@ -196,7 +197,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
@Override @Override
public double getConsensusConfidence() { public double getConsensusConfidence() {
// TODO Auto-generated method stub // TODO Auto-generated method stub
return 0; return Double.MAX_VALUE;
} }
@Override @Override
@ -205,12 +206,16 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
return null; return null;
} }
@Override
public double getMAF() { public double getMAF() {
// Fixme: update to actually get MAF // Fixme: update to actually get MAF
return avHet; //return avHet;
return -1;
} }
public double getHeterozygosity() {
return avHet;
}
@Override @Override
public int getPloidy() throws IllegalStateException { public int getPloidy() throws IllegalStateException {
// TODO Auto-generated method stub // TODO Auto-generated method stub
@ -220,7 +225,7 @@ public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVaria
@Override @Override
public double getVariationConfidence() { public double getVariationConfidence() {
// TODO Auto-generated method stub // TODO Auto-generated method stub
return 100; return Double.MAX_VALUE;
} }
@Override @Override

View File

@ -0,0 +1,55 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: May 27, 2009
* Time: 2:37:56 PM
* To change this template use File | Settings | File Templates.
*/
public class Histogram<T> {
ArrayList<T> data;
int nBins = 100;
double minValue, maxValue;
public Histogram(int nBins, double minValue, double maxValue, T initialValue) {
data = new ArrayList<T>(nBins);
this.nBins = nBins;
this.minValue = minValue;
this.maxValue = maxValue;
initialize(initialValue);
}
public void initialize(T initialValue) {
data.clear();
for ( int i = 0; i < nBins; i++ ) {
data.add(i, initialValue);
}
}
public void setBin(int i, T value) {
data.set(i, value);
}
public void setBin(double x, T value) {
setBin(x2bin(x), value);
}
public T getBin(int binI) {
return data.get(binI);
}
public T getBin(double x) {
return getBin(x2bin(x));
}
public int x2bin(double x) {
return (int)Math.floor((x - minValue) / ((maxValue - minValue) * nBins));
}
public double bin2x(int bin) {
return (maxValue - minValue) * ((bin + 0.5) / nBins) + minValue;
}
}

View File

@ -0,0 +1,63 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
public class VariantDBCoverage {
private String dbName;
private int nDBObs = 0;
private int nEvalObs = 0;
private int nOverlapping = 0;
public VariantDBCoverage(final String name) {
dbName = name;
}
public void inc(boolean inDB, boolean inEval) {
if (inDB) nDBObs++;
if (inEval) nEvalObs++;
if (inDB && inEval) nOverlapping++;
}
public int nDBSites() {
return nDBObs;
}
public int nEvalSites() {
return nEvalObs;
}
public int nOverlappingSites() {
return nOverlapping;
}
/**
* What fraction of the evaluated site variants were also found in the db?
*
* @return
*/
public double fractionEvalSitesCoveredByDB() {
return nOverlappingSites() / (1.0 * nEvalSites());
}
/**
* What fraction of the DB sites were discovered in the evalution calls?
*
* @return
*/
public double fractionDBSitesDiscoveredInEval() {
return nOverlappingSites() / (1.0 * nDBSites());
}
public String toSingleLineString(final String prefix) {
return String.format("%s %d\t%d\t%d\t%.2f\t%.2f", prefix, nDBSites(), nEvalSites(), nOverlappingSites(), fractionEvalSitesCoveredByDB(), fractionDBSitesDiscoveredInEval());
}
public String toMultiLineString(final String prefix) {
StringBuilder s = new StringBuilder();
s.append(String.format("%s name %s%n", prefix, dbName));
s.append(String.format("%s n_db_sites %d%n", prefix, nDBSites()));
s.append(String.format("%s n_eval_sites %d%n", prefix, nEvalSites()));
s.append(String.format("%s n_overlapping_sites %d%n", prefix, nOverlappingSites()));
s.append(String.format("%s per_eval_sites_in_db %.2f%n", prefix, 100*fractionEvalSitesCoveredByDB()));
s.append(String.format("%s per_db_sites_in_eval %.2f%n", prefix, 100*fractionDBSitesDiscoveredInEval()));
return s.toString();
}
}

View File

@ -2,10 +2,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant; import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumMap; import java.util.EnumMap;
@ -20,49 +20,75 @@ import java.util.EnumMap;
@Requires(DataSource.REFERENCE) @Requires(DataSource.REFERENCE)
@Allows(DataSource.REFERENCE) @Allows(DataSource.REFERENCE)
public class VariantEvalWalker extends RefWalker<Integer, Integer> { public class VariantEvalWalker extends RefWalker<Integer, Integer> {
int N_TRANSITION_TRANVERSION_BINS = 100; @Argument(shortName="minDiscoveryQ", doc="Phred-scaled minimum LOD to consider an evaluation SNP a variant", required=false)
EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[]; public int minDiscoveryQ = -1;
int nBasesCovered = 0;
VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
int N_TRANSITION_TRANVERSION_BINS = 20;
Histogram<Integer> transitions;
Histogram<Integer> transversions;
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> transition_transversion_counts[];
public void initialize() { public void initialize() {
transition_transversion_counts = new EnumMap[N_TRANSITION_TRANVERSION_BINS]; //transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
//transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, Math.log10(1e-10), 0.0, 0);
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
transition_transversion_counts[i] = new EnumMap<BaseUtils.BaseSubstitutionType, Integer>(BaseUtils.BaseSubstitutionType.class); transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
for ( BaseUtils.BaseSubstitutionType t : BaseUtils.BaseSubstitutionType.values() ) {
transition_transversion_counts[i].put(t, 0);
}
}
} }
private int transition_transversion_bin(double MAF) { //private int transition_transversion_bin(double het) {
return (int)Math.floor(MAF * N_TRANSITION_TRANVERSION_BINS); // return (int)Math.floor(het * N_TRANSITION_TRANVERSION_BINS);
} //}
private double transition_transversion_bin2MAF(int bin) { //private double transition_transversion_bin2het(int bin) {
return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS; // return (bin + 0.5) / N_TRANSITION_TRANVERSION_BINS;
} //}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
nBasesCovered++;
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null); AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
if ( dbsnp != null && dbsnp.isSNP() ) { //if ( eval != null || dbsnp != null )
char refBase = dbsnp.getRefSnpFWD(); // System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
char altBase = dbsnp.getAltSnpFWD();
//System.out.printf("%c %c%n", refBase, altBase);
int i = transition_transversion_bin(dbsnp.getMAF());
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
bin.put(subType, bin.get(subType) + 1); //System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); updateTransitionTransversion(eval, ref, context);
out.printf("%s %d %d %.2f %s%n", subType, sit, ver, (float)sit/ver, dbsnp.toString());
//updateHapMapRate(dbsnp, eval, ref, context);
} }
updateVariantDBCoverage(dbsnp, eval, ref, context);
return 1; return 1;
} }
private void updateTransitionTransversion(AllelicVariant dbsnp, char ref, LocusContext context) {
char refBase = dbsnp.getRefSnpFWD();
char altBase = dbsnp.getAltSnpFWD();
//System.out.printf("%c %c%n", refBase, altBase);
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
double het = dbsnp.getHeterozygosity();
h.setBin(het, h.getBin(het) + 1);
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
}
private void updateVariantDBCoverage(AllelicVariant dbsnp, AllelicVariant eval, char ref, LocusContext context) {
// There are four cases here:
dbSNPStats.inc(dbsnp != null, eval != null);
}
// Given result of map function // Given result of map function
public Integer reduceInit() { return 0; } public Integer reduceInit() { return 0; }
public Integer reduce(Integer value, Integer sum) { public Integer reduce(Integer value, Integer sum) {
@ -73,13 +99,36 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
} }
public void onTraversalDone(Integer result) { public void onTraversalDone(Integer result) {
int nTransitions = 0;
int nTransversions = 0;
StringBuilder s = new StringBuilder();
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
double averageMAF = transition_transversion_bin2MAF(i); //double avHet = Math.pow(10, transitions.bin2x(i));
EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i]; double avHet = transitions.bin2x(i);
int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); if ( avHet > 0.5 ) break;
int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
int sit = transitions.getBin(i);
int ver = transversions.getBin(i);
nTransitions += sit;
nTransversions += ver;
double ratio = (float)sit/ver; double ratio = (float)sit/ver;
out.printf("%.2f %d %d %f%n", averageMAF, sit, ver, ratio); s.append(String.format("%.2f %d %d %f%n", avHet, sit, ver, ratio));
} }
long nSites = nTransitions + nTransversions;
out.printf("# n bases covered: %d%n", nBasesCovered);
out.printf("# sites: %d%n", nSites);
out.printf("# variant rate: %.5f confident variants per base%n", nSites / (1.0 * nBasesCovered));
out.printf("# variant rate: 1 / %d confident variants per base%n", nBasesCovered / nSites);
out.printf("# transitions: %d%n", nTransitions);
out.printf("# transversions: %d%n", nTransversions);
out.printf("# ratio: %.2f%n", nTransitions / (1.0 * nTransversions));
// dbSNP stats
out.println(dbSNPStats.toSingleLineString("#"));
out.print(dbSNPStats.toMultiLineString("#"));
out.print(s);
} }
} }