AlleleBalance and on/off primary base filters -- version 0.0.1 -- for experimental use only

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1294 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-07-22 17:54:44 +00:00
parent 00f9bcd6d1
commit 9c12c02768
9 changed files with 480 additions and 64 deletions

View File

@ -110,6 +110,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
attributes = new HashMap<String, String>();
}
public TabularROD(final String name, ArrayList<String> header) {
this(name);
this.header = header;
}
/**
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
* must be bound to each corresponding header[i] field before the object is really usable.
@ -155,7 +160,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
* @return
*/
public Object initialize(final File source) throws FileNotFoundException {
List<String> header = null;
return readHeader(source);
}
public static ArrayList<String> readHeader(final File source) throws FileNotFoundException {
ArrayList<String> header = null;
int linesLookedAt = 0;
xReadLines reader = new xReadLines(source);
@ -186,9 +195,16 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
header.add(Integer.toString(i));
}
try {
reader.close();
} catch ( IOException e ) {
throw new RuntimeException(e);
}
return header;
}
// ----------------------------------------------------------------------
//
// ROD accessors

View File

@ -50,6 +50,8 @@ public class rodVariants extends BasicReferenceOrderedDatum implements AllelicVa
public String delimiterRegex() { return "\\s+"; }
public boolean parseLine(Object header, String[] parts) throws IOException {
if ( parts.length < 18 )
throw new IOException("Invalid rodVariant row found -- too few elements. Expected 18+, got " + parts.length);
if (!parts[0].startsWith("#")) {
loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
refBase = parts[2].charAt(0);

View File

@ -1,27 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: andrewk
* Date: Apr 2, 2009
* Time: 9:01:44 PM
* To change this template use File | Settings | File Templates.
*/
public class AlleleMetricsWalker {
// Class that will walk over various metrics in a reference ordered way
// This class walks over the genome in reference order and calls AlleleMetrics on each class
// Hapmap and dbSNP tracks are taken from the command line
// At first pass, this will at least be able to walk over a GFF file and compare to the hapmap and dbsnp
// tracks specified on the command line and handed in via the LocusContext
public void map(List<AllelicVariant> avdata) {
}
}

View File

@ -20,55 +20,29 @@ import java.util.List;
*
*/
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
int N_TRANSITION_TRANVERSION_BINS = 100;
Histogram<Integer> transitions;
Histogram<Integer> transversions;
long nTransitions = 0, nTransversions = 0;
public TransitionTranversionAnalysis() {
super("transitions_transversions");
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
}
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
if ( eval != null && eval.isSNP() ) {
char refBase = eval.getRefSnpFWD();
char altBase = eval.getAltSnpFWD();
//System.out.printf("%c %c%n", refBase, altBase);
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
double het = eval.getHeterozygosity();
h.setBin(het, h.getBin(het) + 1);
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
if ( subType == BaseUtils.BaseSubstitutionType.TRANSITION )
nTransitions++;
else
nTransversions++;
}
return null;
}
public List<String> done() {
int nTransitions = 0;
int nTransversions = 0;
List<String> s = new ArrayList<String>();
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
//double avHet = Math.pow(10, transitions.bin2x(i));
double avHet = transitions.bin2x(i);
if ( avHet > 0.5 ) break;
int sit = transitions.getBin(i);
int ver = transversions.getBin(i);
nTransitions += sit;
nTransversions += ver;
double ratio = (float)sit/ver;
//s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
}
s.add(String.format("transitions %d", nTransitions));
s.add(String.format("transversions %d", nTransversions));
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));

View File

@ -0,0 +1,317 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.gatk.refdata.TabularROD;
import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import edu.mit.broad.picard.util.MathUtil;
class GenotypeFeatureData extends ArrayList<Double> {
public enum Tail { LeftTailed, RightTailed, TwoTailed }
private String genotype = null;
private Integer[] permutation = null;
private Logger logger = null;
private Tail tail = null;
double lowThreshold = -1;
double highThreshold = -1;
public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) {
super();
this.genotype = genotype;
this.logger = logger;
this.tail = tail;
}
public Tail getTail() {
return tail;
}
public double getLowThreshold() {
return lowThreshold;
}
public void setLowThreshold(double lowThreshold) {
this.lowThreshold = lowThreshold;
}
public double getHighThreshold() {
return highThreshold;
}
public void setHighThreshold(double highThreshold) {
this.highThreshold = highThreshold;
}
public void finalizeData() {
permutation = Utils.SortPermutation(this);
}
public void determineThresholds(final double pvalueLimit) {
if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 )
throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit));
switch ( tail ) {
case LeftTailed:
lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit);
break;
case RightTailed:
lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit);
break;
case TwoTailed:
lowThreshold = determineOneTailThreshold(pvalueLimit/2);
highThreshold = determineOneTailThreshold(1-pvalueLimit/2);
break;
}
//logger.info(String.format(" %d of %d elements are >= threshold = %f (%f percent)",
// nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements())));
//logger.info(String.format(" %d of %d elements are <= threshold = %f (%f percent)",
// nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements())));
}
private double determineOneTailThreshold(final double pvalueLimit) {
double trueSortedIndexLimit = pvalueLimit * nElements();
int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit));
double threshold = get(itemIndex(sortedIndexLimit));
logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold));
return threshold;
}
public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); }
public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); }
public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) {
int count = 0;
for ( double stat : this ) {
if ( passesThreshold(stat, keepBelow, keepAbove) ) count++;
}
return count;
}
public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) {
//System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove);
return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove;
}
public boolean passesThreshold(double value) {
switch ( tail ) {
case LeftTailed:
return value >= getLowThreshold();
case RightTailed:
return value <= getHighThreshold();
case TwoTailed:
return value >= getLowThreshold() && value < getHighThreshold();
default:
return true;
}
}
public double pValue(double value) {
return 1.0;
}
public int itemIndex(int sortedIndex) {
return permutation[sortedIndex];
}
public int nElements() { return size(); }
public String toString() {
return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold);
}
}
class ObservationTable extends HashMap<String, GenotypeFeatureData> {
String statField = null;
File observationsFile = null;
Logger logger = null;
GenotypeFeatureData.Tail tail;
public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, FileNotFoundException, IOException {
this.observationsFile = observationsFile;
this.statField = statField;
this.logger = logger;
this.tail = tail;
readAllData(observationsFile, statField);
}
public Set<String> genotypes() {
return keySet();
}
public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException {
ArrayList<String> header = TabularROD.readHeader(observationsFile);
//logger.info(String.format("Starting to read table %s", observationsFile));
for ( String line : new xReadLines(observationsFile) ) {
TabularROD d = new TabularROD("ignoreMe", header);
String[] parts = line.split("\\s+");
if ( d.parseLine(header, parts) ) {
if (! d.containsKey(statField)) {
throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line));
}
double stat = Double.valueOf(d.get(statField));
final String genotype = d.get("genotype");
GenotypeFeatureData gfd = getData(genotype);
gfd.add(stat);
}
}
for ( GenotypeFeatureData gfd : this.values() ) {
gfd.finalizeData();
}
//logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field",
// observationsFile, size(), statField));
}
public GenotypeFeatureData getData(final String genotype) {
if ( ! containsKey(genotype) ) {
GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail);
put(genotype, gfd);
}
return get(genotype);
}
public String toString() {
return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size());
}
}
public abstract class RatioFilter implements VariantExclusionCriterion {
protected double pvalueLimit = -1;
protected File observationsFile = null;
protected String statField = null; // "AlleleRatio";
protected Logger logger = null; // Logger.getLogger(RatioFilter.class);
protected ObservationTable dataTable = null;
protected String name = null;
protected GenotypeFeatureData.Tail tail = null;
/**
* A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption
* that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be
* replaced by the more robust sampling approach outlined below
*/
final private static int MIN_COUNTS_TO_APPLY_TEST = 20;
final static boolean integrateOverSamplingProbabilities = true;
public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) {
this.name = name;
this.statField = statField;
this.tail = tail;
logger = Logger.getLogger(myClass);
}
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
try {
pvalueLimit = Double.valueOf(argPieces[0]);
observationsFile = new File(argPieces[1]);
dataTable = new ObservationTable(observationsFile, statField, logger, tail);
logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable));
for ( String genotype : dataTable.genotypes() ) {
GenotypeFeatureData gfd = dataTable.get(genotype);
gfd.determineThresholds(pvalueLimit);
logger.info(gfd.toString());
}
} catch ( NumberFormatException e ) {
throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e);
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e);
} catch ( NoSuchFieldException e ) {
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e);
} catch ( IOException e ) {
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e );
}
}
}
protected abstract boolean applyToVariant(rodVariants variant);
protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant);
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
boolean exclude = false;
//
// todo -- need to calculate a significance value for the chance that the
// todo -- observed first and second counts are reliably not passing the threshold
// todo -- basically, we need to sample with replacement from all first / second pairs
// todo -- and only conclude that the variant fails the test if the probability of failing
// todo -- across all samples is greater than some P value, like 0.05
//
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
if ( applyToVariant(variant) ) {
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
GenotypeFeatureData gfd = dataTable.get(variant.getBestGenotype());
if (integrateOverSamplingProbabilities)
exclude = integralExclude(gfd, counts);
else
exclude = pointEstimateExclude(gfd, counts);
//
// for printing only
//
int n = counts.first + counts.second;
double value = counts.first / (1.0 * counts.first + counts.second);
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s",
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
value, gfd, exclude, pileup.getBases()));
}
return exclude;
}
private final static double SEARCH_INCREMENT = 0.01;
private final static double integralPValueThreshold = 0.05;
private boolean integralExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
double sumExclude = 0.0, sumP = 0.0;
int n = counts.first + counts.second;
for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {
double p = MathUtils.binomialProbability(counts.first, n, r);
sumP += p;
boolean exclude = ! gfd.passesThreshold(r);
sumExclude += p * (exclude ? 1.0 : 0.0);
//System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n",
// counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP);
}
double percentExcluded = sumExclude / sumP;
return 1 - percentExcluded <= integralPValueThreshold ;
}
private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
int n = counts.first + counts.second;
if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false;
double value = counts.first / (1.0 * counts.first + counts.second);
//double value = counts.first / (1.0 * Math.max(counts.second, 1.0));
return ! gfd.passesThreshold(value);
}
}

View File

@ -0,0 +1,54 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
public class VECAlleleBalance extends RatioFilter {
final private static String statField = "AlleleRatio";
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.TwoTailed;
public VECAlleleBalance() {
super("AlleleBalance", statField, VECAlleleBalance.class, tail);
}
/**
* We can only filter out het calls with the AlleleBalance filter
*
* @param variant
* @return
*/
protected boolean applyToVariant(rodVariants variant) {
String genotype = variant.getBestGenotype();
return genotype.charAt(0) != genotype.charAt(1);
}
/**
* Return the count of bases matching the major (first) and minor (second) alleles as a pair.
*
* @param ref
* @param pileup
* @param variant
* @return
*/
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
final String genotype = variant.getBestGenotype();
final String bases = pileup.getBases();
if ( genotype.length() > 2 )
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
char a = genotype.toUpperCase().charAt(0);
char b = genotype.toUpperCase().charAt(1);
int aCount = Utils.countOccurrences(a, bases.toUpperCase());
int bCount = Utils.countOccurrences(b, bases.toUpperCase());
int major = Math.max(aCount, bCount);
int minor = Math.min(aCount, bCount);
return new Pair(major, minor);
}
}

View File

@ -0,0 +1,54 @@
package org.broadinstitute.sting.playground.gatk.walkers.variants;
import org.broadinstitute.sting.gatk.refdata.rodVariants;
import org.broadinstitute.sting.utils.*;
public class VECOnOffGenotypeRatio extends RatioFilter {
final private static String statField = "onOffRatio";
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed;
public VECOnOffGenotypeRatio() {
super("On/Off Genotype Ratio", statField, VECOnOffGenotypeRatio.class, tail);
}
/**
* On/off genotype filters can be applied to any genotype
*
* @param variant
* @return
*/
protected boolean applyToVariant(rodVariants variant) {
return true;
}
/**
* Return the counts of bases that are on (matching the bestGenotype) and off (not matching the
* best genotype). On are in the first field, off in the second.
*
* @param ref
* @param pileup
* @param variant
* @return
*/
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
final String genotype = variant.getBestGenotype().toUpperCase();
final String bases = pileup.getBases();
if ( genotype.length() > 2 )
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
int on = 0, off = 0;
for ( char base : BaseUtils.BASES ) {
int count = BasicPileup.countBase(base, bases);
if ( Utils.countOccurrences(base, genotype) > 0 )
on += count;
else
off += count;
//System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype);
}
return new Pair<Integer, Integer>(on, off);
}
}

View File

@ -6,6 +6,7 @@ import java.util.Random;
* BaseUtils contains some basic utilities for manipulating nucleotides.
*/
public class BaseUtils {
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
// a pyrimidine to another pyrimidine nucleotide (C <-> T).

View File

@ -347,12 +347,35 @@ abstract public class BasicPileup implements Pileup {
return null;
}
public static byte consensusBase(String bases) {
public static class BaseCounts {
int a, c, t, g;
public BaseCounts(int a, int c, int t, int g) {
this.a = a;
this.c = c;
this.t = t;
this.g = g;
}
}
public static int countBase(final char base, final String bases) {
return Utils.countOccurrences(base, bases);
}
public static BaseCounts countBases(final String bases) {
String canon = bases.toUpperCase();
int ACount = Utils.countOccurrences('A', bases);
int CCount = Utils.countOccurrences('C', bases);
int TCount = Utils.countOccurrences('T', bases);
int GCount = Utils.countOccurrences('G', bases);
return new BaseCounts(Utils.countOccurrences('A', canon),
Utils.countOccurrences('C', canon),
Utils.countOccurrences('T', canon),
Utils.countOccurrences('G', canon));
}
public static byte consensusBase(String bases) {
BaseCounts counts = countBases( bases );
int ACount = counts.a;
int CCount = counts.c;
int TCount = counts.t;
int GCount = counts.g;
int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount)));
if ( ACount == m ) return 'A';
@ -362,3 +385,5 @@ abstract public class BasicPileup implements Pileup {
return 0;
}
}