AlleleBalance and on/off primary base filters -- version 0.0.1 -- for experimental use only
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1294 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
00f9bcd6d1
commit
9c12c02768
|
|
@ -110,6 +110,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
||||||
attributes = new HashMap<String, String>();
|
attributes = new HashMap<String, String>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TabularROD(final String name, ArrayList<String> header) {
|
||||||
|
this(name);
|
||||||
|
this.header = header;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
|
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
|
||||||
* must be bound to each corresponding header[i] field before the object is really usable.
|
* must be bound to each corresponding header[i] field before the object is really usable.
|
||||||
|
|
@ -155,7 +160,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Object initialize(final File source) throws FileNotFoundException {
|
public Object initialize(final File source) throws FileNotFoundException {
|
||||||
List<String> header = null;
|
return readHeader(source);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ArrayList<String> readHeader(final File source) throws FileNotFoundException {
|
||||||
|
ArrayList<String> header = null;
|
||||||
int linesLookedAt = 0;
|
int linesLookedAt = 0;
|
||||||
xReadLines reader = new xReadLines(source);
|
xReadLines reader = new xReadLines(source);
|
||||||
|
|
||||||
|
|
@ -186,9 +195,16 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
||||||
header.add(Integer.toString(i));
|
header.add(Integer.toString(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
reader.close();
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
return header;
|
return header;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// ROD accessors
|
// ROD accessors
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,8 @@ public class rodVariants extends BasicReferenceOrderedDatum implements AllelicVa
|
||||||
public String delimiterRegex() { return "\\s+"; }
|
public String delimiterRegex() { return "\\s+"; }
|
||||||
|
|
||||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||||
|
if ( parts.length < 18 )
|
||||||
|
throw new IOException("Invalid rodVariant row found -- too few elements. Expected 18+, got " + parts.length);
|
||||||
if (!parts[0].startsWith("#")) {
|
if (!parts[0].startsWith("#")) {
|
||||||
loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
|
loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
|
||||||
refBase = parts[2].charAt(0);
|
refBase = parts[2].charAt(0);
|
||||||
|
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created by IntelliJ IDEA.
|
|
||||||
* User: andrewk
|
|
||||||
* Date: Apr 2, 2009
|
|
||||||
* Time: 9:01:44 PM
|
|
||||||
* To change this template use File | Settings | File Templates.
|
|
||||||
*/
|
|
||||||
public class AlleleMetricsWalker {
|
|
||||||
// Class that will walk over various metrics in a reference ordered way
|
|
||||||
// This class walks over the genome in reference order and calls AlleleMetrics on each class
|
|
||||||
// Hapmap and dbSNP tracks are taken from the command line
|
|
||||||
// At first pass, this will at least be able to walk over a GFF file and compare to the hapmap and dbsnp
|
|
||||||
// tracks specified on the command line and handed in via the LocusContext
|
|
||||||
|
|
||||||
public void map(List<AllelicVariant> avdata) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -20,55 +20,29 @@ import java.util.List;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
long nTransitions = 0, nTransversions = 0;
|
||||||
Histogram<Integer> transitions;
|
|
||||||
Histogram<Integer> transversions;
|
|
||||||
|
|
||||||
public TransitionTranversionAnalysis() {
|
public TransitionTranversionAnalysis() {
|
||||||
super("transitions_transversions");
|
super("transitions_transversions");
|
||||||
transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
|
||||||
transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
if ( eval != null && eval.isSNP() ) {
|
if ( eval != null && eval.isSNP() ) {
|
||||||
char refBase = eval.getRefSnpFWD();
|
char refBase = eval.getRefSnpFWD();
|
||||||
char altBase = eval.getAltSnpFWD();
|
char altBase = eval.getAltSnpFWD();
|
||||||
//System.out.printf("%c %c%n", refBase, altBase);
|
|
||||||
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
|
||||||
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
|
||||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
|
||||||
|
|
||||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
||||||
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
if ( subType == BaseUtils.BaseSubstitutionType.TRANSITION )
|
||||||
double het = eval.getHeterozygosity();
|
nTransitions++;
|
||||||
h.setBin(het, h.getBin(het) + 1);
|
else
|
||||||
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
nTransversions++;
|
||||||
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
|
||||||
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> done() {
|
public List<String> done() {
|
||||||
int nTransitions = 0;
|
|
||||||
int nTransversions = 0;
|
|
||||||
|
|
||||||
List<String> s = new ArrayList<String>();
|
List<String> s = new ArrayList<String>();
|
||||||
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
|
||||||
//double avHet = Math.pow(10, transitions.bin2x(i));
|
|
||||||
double avHet = transitions.bin2x(i);
|
|
||||||
if ( avHet > 0.5 ) break;
|
|
||||||
|
|
||||||
int sit = transitions.getBin(i);
|
|
||||||
int ver = transversions.getBin(i);
|
|
||||||
nTransitions += sit;
|
|
||||||
nTransversions += ver;
|
|
||||||
double ratio = (float)sit/ver;
|
|
||||||
//s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
|
|
||||||
}
|
|
||||||
|
|
||||||
s.add(String.format("transitions %d", nTransitions));
|
s.add(String.format("transitions %d", nTransitions));
|
||||||
s.add(String.format("transversions %d", nTransversions));
|
s.add(String.format("transversions %d", nTransversions));
|
||||||
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,317 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import edu.mit.broad.picard.util.MathUtil;
|
||||||
|
|
||||||
|
class GenotypeFeatureData extends ArrayList<Double> {
|
||||||
|
public enum Tail { LeftTailed, RightTailed, TwoTailed }
|
||||||
|
|
||||||
|
private String genotype = null;
|
||||||
|
private Integer[] permutation = null;
|
||||||
|
private Logger logger = null;
|
||||||
|
|
||||||
|
private Tail tail = null;
|
||||||
|
|
||||||
|
double lowThreshold = -1;
|
||||||
|
double highThreshold = -1;
|
||||||
|
|
||||||
|
public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) {
|
||||||
|
super();
|
||||||
|
this.genotype = genotype;
|
||||||
|
this.logger = logger;
|
||||||
|
this.tail = tail;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Tail getTail() {
|
||||||
|
return tail;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getLowThreshold() {
|
||||||
|
return lowThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLowThreshold(double lowThreshold) {
|
||||||
|
this.lowThreshold = lowThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getHighThreshold() {
|
||||||
|
return highThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHighThreshold(double highThreshold) {
|
||||||
|
this.highThreshold = highThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void finalizeData() {
|
||||||
|
permutation = Utils.SortPermutation(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void determineThresholds(final double pvalueLimit) {
|
||||||
|
if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 )
|
||||||
|
throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit));
|
||||||
|
|
||||||
|
switch ( tail ) {
|
||||||
|
case LeftTailed:
|
||||||
|
lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit);
|
||||||
|
break;
|
||||||
|
case RightTailed:
|
||||||
|
lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit);
|
||||||
|
break;
|
||||||
|
case TwoTailed:
|
||||||
|
lowThreshold = determineOneTailThreshold(pvalueLimit/2);
|
||||||
|
highThreshold = determineOneTailThreshold(1-pvalueLimit/2);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
//logger.info(String.format(" %d of %d elements are >= threshold = %f (%f percent)",
|
||||||
|
// nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements())));
|
||||||
|
//logger.info(String.format(" %d of %d elements are <= threshold = %f (%f percent)",
|
||||||
|
// nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements())));
|
||||||
|
}
|
||||||
|
|
||||||
|
private double determineOneTailThreshold(final double pvalueLimit) {
|
||||||
|
double trueSortedIndexLimit = pvalueLimit * nElements();
|
||||||
|
int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit));
|
||||||
|
double threshold = get(itemIndex(sortedIndexLimit));
|
||||||
|
|
||||||
|
logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold));
|
||||||
|
|
||||||
|
return threshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); }
|
||||||
|
public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); }
|
||||||
|
|
||||||
|
public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) {
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
for ( double stat : this ) {
|
||||||
|
if ( passesThreshold(stat, keepBelow, keepAbove) ) count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) {
|
||||||
|
//System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove);
|
||||||
|
return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean passesThreshold(double value) {
|
||||||
|
switch ( tail ) {
|
||||||
|
case LeftTailed:
|
||||||
|
return value >= getLowThreshold();
|
||||||
|
case RightTailed:
|
||||||
|
return value <= getHighThreshold();
|
||||||
|
case TwoTailed:
|
||||||
|
return value >= getLowThreshold() && value < getHighThreshold();
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public double pValue(double value) {
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int itemIndex(int sortedIndex) {
|
||||||
|
return permutation[sortedIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
public int nElements() { return size(); }
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ObservationTable extends HashMap<String, GenotypeFeatureData> {
|
||||||
|
String statField = null;
|
||||||
|
File observationsFile = null;
|
||||||
|
Logger logger = null;
|
||||||
|
GenotypeFeatureData.Tail tail;
|
||||||
|
|
||||||
|
public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, FileNotFoundException, IOException {
|
||||||
|
this.observationsFile = observationsFile;
|
||||||
|
this.statField = statField;
|
||||||
|
this.logger = logger;
|
||||||
|
this.tail = tail;
|
||||||
|
readAllData(observationsFile, statField);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Set<String> genotypes() {
|
||||||
|
return keySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException {
|
||||||
|
ArrayList<String> header = TabularROD.readHeader(observationsFile);
|
||||||
|
|
||||||
|
//logger.info(String.format("Starting to read table %s", observationsFile));
|
||||||
|
|
||||||
|
for ( String line : new xReadLines(observationsFile) ) {
|
||||||
|
TabularROD d = new TabularROD("ignoreMe", header);
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if ( d.parseLine(header, parts) ) {
|
||||||
|
if (! d.containsKey(statField)) {
|
||||||
|
throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line));
|
||||||
|
}
|
||||||
|
double stat = Double.valueOf(d.get(statField));
|
||||||
|
|
||||||
|
final String genotype = d.get("genotype");
|
||||||
|
GenotypeFeatureData gfd = getData(genotype);
|
||||||
|
gfd.add(stat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( GenotypeFeatureData gfd : this.values() ) {
|
||||||
|
gfd.finalizeData();
|
||||||
|
}
|
||||||
|
|
||||||
|
//logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field",
|
||||||
|
// observationsFile, size(), statField));
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenotypeFeatureData getData(final String genotype) {
|
||||||
|
if ( ! containsKey(genotype) ) {
|
||||||
|
GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail);
|
||||||
|
put(genotype, gfd);
|
||||||
|
}
|
||||||
|
|
||||||
|
return get(genotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract class RatioFilter implements VariantExclusionCriterion {
|
||||||
|
protected double pvalueLimit = -1;
|
||||||
|
protected File observationsFile = null;
|
||||||
|
protected String statField = null; // "AlleleRatio";
|
||||||
|
protected Logger logger = null; // Logger.getLogger(RatioFilter.class);
|
||||||
|
protected ObservationTable dataTable = null;
|
||||||
|
protected String name = null;
|
||||||
|
protected GenotypeFeatureData.Tail tail = null;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption
|
||||||
|
* that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be
|
||||||
|
* replaced by the more robust sampling approach outlined below
|
||||||
|
*/
|
||||||
|
final private static int MIN_COUNTS_TO_APPLY_TEST = 20;
|
||||||
|
|
||||||
|
final static boolean integrateOverSamplingProbabilities = true;
|
||||||
|
|
||||||
|
public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) {
|
||||||
|
this.name = name;
|
||||||
|
this.statField = statField;
|
||||||
|
this.tail = tail;
|
||||||
|
logger = Logger.getLogger(myClass);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void initialize(String arguments) {
|
||||||
|
if (arguments != null && !arguments.isEmpty()) {
|
||||||
|
String[] argPieces = arguments.split(",");
|
||||||
|
try {
|
||||||
|
pvalueLimit = Double.valueOf(argPieces[0]);
|
||||||
|
observationsFile = new File(argPieces[1]);
|
||||||
|
|
||||||
|
dataTable = new ObservationTable(observationsFile, statField, logger, tail);
|
||||||
|
|
||||||
|
logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable));
|
||||||
|
for ( String genotype : dataTable.genotypes() ) {
|
||||||
|
GenotypeFeatureData gfd = dataTable.get(genotype);
|
||||||
|
gfd.determineThresholds(pvalueLimit);
|
||||||
|
logger.info(gfd.toString());
|
||||||
|
}
|
||||||
|
} catch ( NumberFormatException e ) {
|
||||||
|
throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e);
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e);
|
||||||
|
} catch ( NoSuchFieldException e ) {
|
||||||
|
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e);
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract boolean applyToVariant(rodVariants variant);
|
||||||
|
protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant);
|
||||||
|
|
||||||
|
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
|
||||||
|
boolean exclude = false;
|
||||||
|
|
||||||
|
//
|
||||||
|
// todo -- need to calculate a significance value for the chance that the
|
||||||
|
// todo -- observed first and second counts are reliably not passing the threshold
|
||||||
|
// todo -- basically, we need to sample with replacement from all first / second pairs
|
||||||
|
// todo -- and only conclude that the variant fails the test if the probability of failing
|
||||||
|
// todo -- across all samples is greater than some P value, like 0.05
|
||||||
|
//
|
||||||
|
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||||
|
if ( applyToVariant(variant) ) {
|
||||||
|
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
|
||||||
|
GenotypeFeatureData gfd = dataTable.get(variant.getBestGenotype());
|
||||||
|
|
||||||
|
if (integrateOverSamplingProbabilities)
|
||||||
|
exclude = integralExclude(gfd, counts);
|
||||||
|
else
|
||||||
|
exclude = pointEstimateExclude(gfd, counts);
|
||||||
|
|
||||||
|
//
|
||||||
|
// for printing only
|
||||||
|
//
|
||||||
|
int n = counts.first + counts.second;
|
||||||
|
double value = counts.first / (1.0 * counts.first + counts.second);
|
||||||
|
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s",
|
||||||
|
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
|
||||||
|
value, gfd, exclude, pileup.getBases()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return exclude;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static double SEARCH_INCREMENT = 0.01;
|
||||||
|
private final static double integralPValueThreshold = 0.05;
|
||||||
|
|
||||||
|
private boolean integralExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
|
||||||
|
double sumExclude = 0.0, sumP = 0.0;
|
||||||
|
int n = counts.first + counts.second;
|
||||||
|
for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {
|
||||||
|
double p = MathUtils.binomialProbability(counts.first, n, r);
|
||||||
|
sumP += p;
|
||||||
|
boolean exclude = ! gfd.passesThreshold(r);
|
||||||
|
sumExclude += p * (exclude ? 1.0 : 0.0);
|
||||||
|
|
||||||
|
//System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n",
|
||||||
|
// counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP);
|
||||||
|
}
|
||||||
|
|
||||||
|
double percentExcluded = sumExclude / sumP;
|
||||||
|
return 1 - percentExcluded <= integralPValueThreshold ;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
|
||||||
|
int n = counts.first + counts.second;
|
||||||
|
if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false;
|
||||||
|
double value = counts.first / (1.0 * counts.first + counts.second);
|
||||||
|
//double value = counts.first / (1.0 * Math.max(counts.second, 1.0));
|
||||||
|
return ! gfd.passesThreshold(value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||||
|
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
|
||||||
|
public class VECAlleleBalance extends RatioFilter {
|
||||||
|
final private static String statField = "AlleleRatio";
|
||||||
|
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.TwoTailed;
|
||||||
|
|
||||||
|
|
||||||
|
public VECAlleleBalance() {
|
||||||
|
super("AlleleBalance", statField, VECAlleleBalance.class, tail);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We can only filter out het calls with the AlleleBalance filter
|
||||||
|
*
|
||||||
|
* @param variant
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected boolean applyToVariant(rodVariants variant) {
|
||||||
|
String genotype = variant.getBestGenotype();
|
||||||
|
return genotype.charAt(0) != genotype.charAt(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the count of bases matching the major (first) and minor (second) alleles as a pair.
|
||||||
|
*
|
||||||
|
* @param ref
|
||||||
|
* @param pileup
|
||||||
|
* @param variant
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
|
||||||
|
final String genotype = variant.getBestGenotype();
|
||||||
|
final String bases = pileup.getBases();
|
||||||
|
|
||||||
|
if ( genotype.length() > 2 )
|
||||||
|
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
|
||||||
|
|
||||||
|
|
||||||
|
char a = genotype.toUpperCase().charAt(0);
|
||||||
|
char b = genotype.toUpperCase().charAt(1);
|
||||||
|
int aCount = Utils.countOccurrences(a, bases.toUpperCase());
|
||||||
|
int bCount = Utils.countOccurrences(b, bases.toUpperCase());
|
||||||
|
|
||||||
|
int major = Math.max(aCount, bCount);
|
||||||
|
int minor = Math.min(aCount, bCount);
|
||||||
|
|
||||||
|
return new Pair(major, minor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
|
||||||
|
public class VECOnOffGenotypeRatio extends RatioFilter {
|
||||||
|
final private static String statField = "onOffRatio";
|
||||||
|
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed;
|
||||||
|
|
||||||
|
public VECOnOffGenotypeRatio() {
|
||||||
|
super("On/Off Genotype Ratio", statField, VECOnOffGenotypeRatio.class, tail);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* On/off genotype filters can be applied to any genotype
|
||||||
|
*
|
||||||
|
* @param variant
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected boolean applyToVariant(rodVariants variant) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the counts of bases that are on (matching the bestGenotype) and off (not matching the
|
||||||
|
* best genotype). On are in the first field, off in the second.
|
||||||
|
*
|
||||||
|
* @param ref
|
||||||
|
* @param pileup
|
||||||
|
* @param variant
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
|
||||||
|
final String genotype = variant.getBestGenotype().toUpperCase();
|
||||||
|
final String bases = pileup.getBases();
|
||||||
|
|
||||||
|
if ( genotype.length() > 2 )
|
||||||
|
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
|
||||||
|
|
||||||
|
|
||||||
|
int on = 0, off = 0;
|
||||||
|
|
||||||
|
for ( char base : BaseUtils.BASES ) {
|
||||||
|
int count = BasicPileup.countBase(base, bases);
|
||||||
|
if ( Utils.countOccurrences(base, genotype) > 0 )
|
||||||
|
on += count;
|
||||||
|
else
|
||||||
|
off += count;
|
||||||
|
//System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Pair<Integer, Integer>(on, off);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -6,6 +6,7 @@ import java.util.Random;
|
||||||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||||
*/
|
*/
|
||||||
public class BaseUtils {
|
public class BaseUtils {
|
||||||
|
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
|
||||||
|
|
||||||
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
||||||
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
||||||
|
|
|
||||||
|
|
@ -347,12 +347,35 @@ abstract public class BasicPileup implements Pileup {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static byte consensusBase(String bases) {
|
public static class BaseCounts {
|
||||||
|
int a, c, t, g;
|
||||||
|
|
||||||
|
public BaseCounts(int a, int c, int t, int g) {
|
||||||
|
this.a = a;
|
||||||
|
this.c = c;
|
||||||
|
this.t = t;
|
||||||
|
this.g = g;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int countBase(final char base, final String bases) {
|
||||||
|
return Utils.countOccurrences(base, bases);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static BaseCounts countBases(final String bases) {
|
||||||
String canon = bases.toUpperCase();
|
String canon = bases.toUpperCase();
|
||||||
int ACount = Utils.countOccurrences('A', bases);
|
return new BaseCounts(Utils.countOccurrences('A', canon),
|
||||||
int CCount = Utils.countOccurrences('C', bases);
|
Utils.countOccurrences('C', canon),
|
||||||
int TCount = Utils.countOccurrences('T', bases);
|
Utils.countOccurrences('T', canon),
|
||||||
int GCount = Utils.countOccurrences('G', bases);
|
Utils.countOccurrences('G', canon));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte consensusBase(String bases) {
|
||||||
|
BaseCounts counts = countBases( bases );
|
||||||
|
int ACount = counts.a;
|
||||||
|
int CCount = counts.c;
|
||||||
|
int TCount = counts.t;
|
||||||
|
int GCount = counts.g;
|
||||||
|
|
||||||
int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount)));
|
int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount)));
|
||||||
if ( ACount == m ) return 'A';
|
if ( ACount == m ) return 'A';
|
||||||
|
|
@ -362,3 +385,5 @@ abstract public class BasicPileup implements Pileup {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue