AlleleBalance and on/off primary base filters -- version 0.0.1 -- for experimental use only
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1294 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
00f9bcd6d1
commit
9c12c02768
|
|
@ -110,6 +110,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
attributes = new HashMap<String, String>();
|
||||
}
|
||||
|
||||
/**
 * Construct a TabularROD with the given track name and header columns.
 *
 * NOTE(review): the header list is stored by reference, not copied — instances
 * built from the same ArrayList share mutations; confirm this aliasing is
 * intended (ObservationTable.readAllData relies on reusing one header).
 *
 * @param name   track name, forwarded to the single-argument constructor
 * @param header column names defining this row's fields
 */
public TabularROD(final String name, ArrayList<String> header) {
    this(name);
    this.header = header;
}
|
||||
|
||||
/**
|
||||
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
|
||||
* must be bound to each corresponding header[i] field before the object is really usable.
|
||||
|
|
@ -155,7 +160,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
* @return
|
||||
*/
|
||||
public Object initialize(final File source) throws FileNotFoundException {
|
||||
List<String> header = null;
|
||||
return readHeader(source);
|
||||
}
|
||||
|
||||
public static ArrayList<String> readHeader(final File source) throws FileNotFoundException {
|
||||
ArrayList<String> header = null;
|
||||
int linesLookedAt = 0;
|
||||
xReadLines reader = new xReadLines(source);
|
||||
|
||||
|
|
@ -186,9 +195,16 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
header.add(Integer.toString(i));
|
||||
}
|
||||
|
||||
try {
|
||||
reader.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// ROD accessors
|
||||
|
|
|
|||
|
|
@ -50,6 +50,8 @@ public class rodVariants extends BasicReferenceOrderedDatum implements AllelicVa
|
|||
public String delimiterRegex() { return "\\s+"; }
|
||||
|
||||
public boolean parseLine(Object header, String[] parts) throws IOException {
|
||||
if ( parts.length < 18 )
|
||||
throw new IOException("Invalid rodVariant row found -- too few elements. Expected 18+, got " + parts.length);
|
||||
if (!parts[0].startsWith("#")) {
|
||||
loc = GenomeLocParser.createGenomeLoc(parts[0], Long.valueOf(parts[1]));
|
||||
refBase = parts[2].charAt(0);
|
||||
|
|
|
|||
|
|
@ -1,27 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: andrewk
|
||||
* Date: Apr 2, 2009
|
||||
* Time: 9:01:44 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
/**
 * Stub walker intended to compute allele metrics in reference order.
 *
 * Intended design (from the original notes): walk the genome in reference
 * order, calling AlleleMetrics on each locus; HapMap and dbSNP tracks are
 * taken from the command line and handed in via the LocusContext. A first
 * pass would walk a GFF file and compare against those two tracks.
 *
 * NOTE(review): map() is an empty stub and the class extends no walker base
 * type, so this class is currently dead code (and is deleted in this commit).
 */
public class AlleleMetricsWalker {

    // Placeholder map operation: receives the allelic variant data for one
    // locus but performs no work yet.
    public void map(List<AllelicVariant> avdata) {

    }

}
|
||||
|
|
@ -20,55 +20,29 @@ import java.util.List;
|
|||
*
|
||||
*/
|
||||
public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis {
|
||||
int N_TRANSITION_TRANVERSION_BINS = 100;
|
||||
Histogram<Integer> transitions;
|
||||
Histogram<Integer> transversions;
|
||||
long nTransitions = 0, nTransversions = 0;
|
||||
|
||||
/**
 * Set up the analysis: registers under the name "transitions_transversions"
 * and allocates one histogram each for transitions and transversions, binned
 * over heterozygosity in [0.0, 1.0] with N_TRANSITION_TRANVERSION_BINS bins
 * and every bin initialized to 0.
 *
 * NOTE(review): "Tranversion" in the class/constant name is a typo for
 * "Transversion", but renaming would break callers, so it is left as-is.
 */
public TransitionTranversionAnalysis() {
    super("transitions_transversions");
    transitions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
    transversions = new Histogram<Integer>(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0);
}
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
char refBase = eval.getRefSnpFWD();
|
||||
char altBase = eval.getAltSnpFWD();
|
||||
//System.out.printf("%c %c%n", refBase, altBase);
|
||||
//int i = transition_transversion_bin(dbsnp.getHeterozygosity());
|
||||
//System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i);
|
||||
//EnumMap<BaseUtils.BaseSubstitutionType, Integer> bin = transition_transversion_counts[i];
|
||||
|
||||
BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase);
|
||||
Histogram<Integer> h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions;
|
||||
double het = eval.getHeterozygosity();
|
||||
h.setBin(het, h.getBin(het) + 1);
|
||||
//int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION);
|
||||
//int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION);
|
||||
//System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString());
|
||||
if ( subType == BaseUtils.BaseSubstitutionType.TRANSITION )
|
||||
nTransitions++;
|
||||
else
|
||||
nTransversions++;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
int nTransitions = 0;
|
||||
int nTransversions = 0;
|
||||
|
||||
List<String> s = new ArrayList<String>();
|
||||
for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) {
|
||||
//double avHet = Math.pow(10, transitions.bin2x(i));
|
||||
double avHet = transitions.bin2x(i);
|
||||
if ( avHet > 0.5 ) break;
|
||||
|
||||
int sit = transitions.getBin(i);
|
||||
int ver = transversions.getBin(i);
|
||||
nTransitions += sit;
|
||||
nTransversions += ver;
|
||||
double ratio = (float)sit/ver;
|
||||
//s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio));
|
||||
}
|
||||
|
||||
s.add(String.format("transitions %d", nTransitions));
|
||||
s.add(String.format("transversions %d", nTransversions));
|
||||
s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions)));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,317 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||
import org.broadinstitute.sting.gatk.refdata.TabularROD;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Set;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import edu.mit.broad.picard.util.MathUtil;
|
||||
|
||||
class GenotypeFeatureData extends ArrayList<Double> {
|
||||
public enum Tail { LeftTailed, RightTailed, TwoTailed }
|
||||
|
||||
private String genotype = null;
|
||||
private Integer[] permutation = null;
|
||||
private Logger logger = null;
|
||||
|
||||
private Tail tail = null;
|
||||
|
||||
double lowThreshold = -1;
|
||||
double highThreshold = -1;
|
||||
|
||||
public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) {
|
||||
super();
|
||||
this.genotype = genotype;
|
||||
this.logger = logger;
|
||||
this.tail = tail;
|
||||
}
|
||||
|
||||
public Tail getTail() {
|
||||
return tail;
|
||||
}
|
||||
|
||||
public double getLowThreshold() {
|
||||
return lowThreshold;
|
||||
}
|
||||
|
||||
public void setLowThreshold(double lowThreshold) {
|
||||
this.lowThreshold = lowThreshold;
|
||||
}
|
||||
|
||||
public double getHighThreshold() {
|
||||
return highThreshold;
|
||||
}
|
||||
|
||||
public void setHighThreshold(double highThreshold) {
|
||||
this.highThreshold = highThreshold;
|
||||
}
|
||||
|
||||
public void finalizeData() {
|
||||
permutation = Utils.SortPermutation(this);
|
||||
}
|
||||
|
||||
public void determineThresholds(final double pvalueLimit) {
|
||||
if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 )
|
||||
throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit));
|
||||
|
||||
switch ( tail ) {
|
||||
case LeftTailed:
|
||||
lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit);
|
||||
break;
|
||||
case RightTailed:
|
||||
lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit);
|
||||
break;
|
||||
case TwoTailed:
|
||||
lowThreshold = determineOneTailThreshold(pvalueLimit/2);
|
||||
highThreshold = determineOneTailThreshold(1-pvalueLimit/2);
|
||||
break;
|
||||
}
|
||||
|
||||
//logger.info(String.format(" %d of %d elements are >= threshold = %f (%f percent)",
|
||||
// nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements())));
|
||||
//logger.info(String.format(" %d of %d elements are <= threshold = %f (%f percent)",
|
||||
// nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements())));
|
||||
}
|
||||
|
||||
private double determineOneTailThreshold(final double pvalueLimit) {
|
||||
double trueSortedIndexLimit = pvalueLimit * nElements();
|
||||
int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit));
|
||||
double threshold = get(itemIndex(sortedIndexLimit));
|
||||
|
||||
logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold));
|
||||
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); }
|
||||
public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); }
|
||||
|
||||
public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) {
|
||||
int count = 0;
|
||||
|
||||
for ( double stat : this ) {
|
||||
if ( passesThreshold(stat, keepBelow, keepAbove) ) count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) {
|
||||
//System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove);
|
||||
return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove;
|
||||
}
|
||||
|
||||
public boolean passesThreshold(double value) {
|
||||
switch ( tail ) {
|
||||
case LeftTailed:
|
||||
return value >= getLowThreshold();
|
||||
case RightTailed:
|
||||
return value <= getHighThreshold();
|
||||
case TwoTailed:
|
||||
return value >= getLowThreshold() && value < getHighThreshold();
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public double pValue(double value) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
public int itemIndex(int sortedIndex) {
|
||||
return permutation[sortedIndex];
|
||||
}
|
||||
|
||||
public int nElements() { return size(); }
|
||||
|
||||
public String toString() {
|
||||
return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Map from genotype string to the GenotypeFeatureData observations read for
 * that genotype out of a tabular observations file.
 *
 * The file is parsed with TabularROD: the header is read once, then each line
 * is parsed and the value of statField is recorded under the line's
 * "genotype" column.
 */
class ObservationTable extends HashMap<String, GenotypeFeatureData> {
    String statField = null;          // column whose values are collected
    File observationsFile = null;     // source of the observations
    Logger logger = null;
    GenotypeFeatureData.Tail tail;    // tail mode passed to each GenotypeFeatureData

    /**
     * Build the table and eagerly read all data from observationsFile.
     *
     * @throws NoSuchFieldException  if a parsed line lacks statField
     * @throws FileNotFoundException if observationsFile cannot be opened
     * @throws IOException           if a line cannot be parsed
     */
    public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, FileNotFoundException, IOException {
        this.observationsFile = observationsFile;
        this.statField = statField;
        this.logger = logger;
        this.tail = tail;
        readAllData(observationsFile, statField);
    }

    /** The set of genotype keys seen so far (live view of this map's keys). */
    public Set<String> genotypes() {
        return keySet();
    }

    /**
     * Parse every line of observationsFile, accumulating statField values per
     * genotype, then finalize each genotype's data (builds sort permutations).
     *
     * NOTE(review): assumes the table has a "genotype" column — if
     * d.get("genotype") returns null this silently creates a null-keyed
     * entry; confirm against the observations file format.
     */
    public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException {
        ArrayList<String> header = TabularROD.readHeader(observationsFile);

        //logger.info(String.format("Starting to read table %s", observationsFile));

        for ( String line : new xReadLines(observationsFile) ) {
            TabularROD d = new TabularROD("ignoreMe", header);
            String[] parts = line.split("\\s+");
            // parseLine returns false for lines that carry no data (e.g. the
            // header/comment lines); those are skipped.
            if ( d.parseLine(header, parts) ) {
                if (! d.containsKey(statField)) {
                    throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line));
                }
                double stat = Double.valueOf(d.get(statField));

                final String genotype = d.get("genotype");
                GenotypeFeatureData gfd = getData(genotype);
                gfd.add(stat);
            }
        }

        // Freeze each genotype's observations so thresholds can be computed.
        for ( GenotypeFeatureData gfd : this.values() ) {
            gfd.finalizeData();
        }

        //logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field",
        //        observationsFile, size(), statField));
    }

    /** Get (creating on first use) the observation list for a genotype. */
    public GenotypeFeatureData getData(final String genotype) {
        if ( ! containsKey(genotype) ) {
            GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail);
            put(genotype, gfd);
        }

        return get(genotype);
    }

    public String toString() {
        return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size());
    }
}
|
||||
|
||||
public abstract class RatioFilter implements VariantExclusionCriterion {
|
||||
protected double pvalueLimit = -1;
|
||||
protected File observationsFile = null;
|
||||
protected String statField = null; // "AlleleRatio";
|
||||
protected Logger logger = null; // Logger.getLogger(RatioFilter.class);
|
||||
protected ObservationTable dataTable = null;
|
||||
protected String name = null;
|
||||
protected GenotypeFeatureData.Tail tail = null;
|
||||
|
||||
|
||||
/**
|
||||
* A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption
|
||||
* that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be
|
||||
* replaced by the more robust sampling approach outlined below
|
||||
*/
|
||||
final private static int MIN_COUNTS_TO_APPLY_TEST = 20;
|
||||
|
||||
final static boolean integrateOverSamplingProbabilities = true;
|
||||
|
||||
public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) {
|
||||
this.name = name;
|
||||
this.statField = statField;
|
||||
this.tail = tail;
|
||||
logger = Logger.getLogger(myClass);
|
||||
}
|
||||
|
||||
public void initialize(String arguments) {
|
||||
if (arguments != null && !arguments.isEmpty()) {
|
||||
String[] argPieces = arguments.split(",");
|
||||
try {
|
||||
pvalueLimit = Double.valueOf(argPieces[0]);
|
||||
observationsFile = new File(argPieces[1]);
|
||||
|
||||
dataTable = new ObservationTable(observationsFile, statField, logger, tail);
|
||||
|
||||
logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable));
|
||||
for ( String genotype : dataTable.genotypes() ) {
|
||||
GenotypeFeatureData gfd = dataTable.get(genotype);
|
||||
gfd.determineThresholds(pvalueLimit);
|
||||
logger.info(gfd.toString());
|
||||
}
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e);
|
||||
} catch ( NoSuchFieldException e ) {
|
||||
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e);
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract boolean applyToVariant(rodVariants variant);
|
||||
protected abstract Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant);
|
||||
|
||||
public boolean exclude(char ref, LocusContext context, rodVariants variant) {
|
||||
boolean exclude = false;
|
||||
|
||||
//
|
||||
// todo -- need to calculate a significance value for the chance that the
|
||||
// todo -- observed first and second counts are reliably not passing the threshold
|
||||
// todo -- basically, we need to sample with replacement from all first / second pairs
|
||||
// todo -- and only conclude that the variant fails the test if the probability of failing
|
||||
// todo -- across all samples is greater than some P value, like 0.05
|
||||
//
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
if ( applyToVariant(variant) ) {
|
||||
Pair<Integer, Integer> counts = scoreVariant(ref, pileup, variant);
|
||||
GenotypeFeatureData gfd = dataTable.get(variant.getBestGenotype());
|
||||
|
||||
if (integrateOverSamplingProbabilities)
|
||||
exclude = integralExclude(gfd, counts);
|
||||
else
|
||||
exclude = pointEstimateExclude(gfd, counts);
|
||||
|
||||
//
|
||||
// for printing only
|
||||
//
|
||||
int n = counts.first + counts.second;
|
||||
double value = counts.first / (1.0 * counts.first + counts.second);
|
||||
logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s",
|
||||
name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n,
|
||||
value, gfd, exclude, pileup.getBases()));
|
||||
}
|
||||
|
||||
return exclude;
|
||||
}
|
||||
|
||||
private final static double SEARCH_INCREMENT = 0.01;
|
||||
private final static double integralPValueThreshold = 0.05;
|
||||
|
||||
private boolean integralExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
|
||||
double sumExclude = 0.0, sumP = 0.0;
|
||||
int n = counts.first + counts.second;
|
||||
for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) {
|
||||
double p = MathUtils.binomialProbability(counts.first, n, r);
|
||||
sumP += p;
|
||||
boolean exclude = ! gfd.passesThreshold(r);
|
||||
sumExclude += p * (exclude ? 1.0 : 0.0);
|
||||
|
||||
//System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n",
|
||||
// counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP);
|
||||
}
|
||||
|
||||
double percentExcluded = sumExclude / sumP;
|
||||
return 1 - percentExcluded <= integralPValueThreshold ;
|
||||
}
|
||||
|
||||
private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair<Integer, Integer> counts ) {
|
||||
int n = counts.first + counts.second;
|
||||
if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false;
|
||||
double value = counts.first / (1.0 * counts.first + counts.second);
|
||||
//double value = counts.first / (1.0 * Math.max(counts.second, 1.0));
|
||||
return ! gfd.passesThreshold(value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
public class VECAlleleBalance extends RatioFilter {
|
||||
final private static String statField = "AlleleRatio";
|
||||
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.TwoTailed;
|
||||
|
||||
|
||||
public VECAlleleBalance() {
|
||||
super("AlleleBalance", statField, VECAlleleBalance.class, tail);
|
||||
}
|
||||
|
||||
/**
|
||||
* We can only filter out het calls with the AlleleBalance filter
|
||||
*
|
||||
* @param variant
|
||||
* @return
|
||||
*/
|
||||
protected boolean applyToVariant(rodVariants variant) {
|
||||
String genotype = variant.getBestGenotype();
|
||||
return genotype.charAt(0) != genotype.charAt(1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the count of bases matching the major (first) and minor (second) alleles as a pair.
|
||||
*
|
||||
* @param ref
|
||||
* @param pileup
|
||||
* @param variant
|
||||
* @return
|
||||
*/
|
||||
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
|
||||
final String genotype = variant.getBestGenotype();
|
||||
final String bases = pileup.getBases();
|
||||
|
||||
if ( genotype.length() > 2 )
|
||||
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
|
||||
|
||||
|
||||
char a = genotype.toUpperCase().charAt(0);
|
||||
char b = genotype.toUpperCase().charAt(1);
|
||||
int aCount = Utils.countOccurrences(a, bases.toUpperCase());
|
||||
int bCount = Utils.countOccurrences(b, bases.toUpperCase());
|
||||
|
||||
int major = Math.max(aCount, bCount);
|
||||
int minor = Math.min(aCount, bCount);
|
||||
|
||||
return new Pair(major, minor);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variants;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.rodVariants;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
public class VECOnOffGenotypeRatio extends RatioFilter {
|
||||
final private static String statField = "onOffRatio";
|
||||
final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed;
|
||||
|
||||
public VECOnOffGenotypeRatio() {
|
||||
super("On/Off Genotype Ratio", statField, VECOnOffGenotypeRatio.class, tail);
|
||||
}
|
||||
|
||||
/**
|
||||
* On/off genotype filters can be applied to any genotype
|
||||
*
|
||||
* @param variant
|
||||
* @return
|
||||
*/
|
||||
protected boolean applyToVariant(rodVariants variant) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the counts of bases that are on (matching the bestGenotype) and off (not matching the
|
||||
* best genotype). On are in the first field, off in the second.
|
||||
*
|
||||
* @param ref
|
||||
* @param pileup
|
||||
* @param variant
|
||||
* @return
|
||||
*/
|
||||
protected Pair<Integer, Integer> scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) {
|
||||
final String genotype = variant.getBestGenotype().toUpperCase();
|
||||
final String bases = pileup.getBases();
|
||||
|
||||
if ( genotype.length() > 2 )
|
||||
throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype));
|
||||
|
||||
|
||||
int on = 0, off = 0;
|
||||
|
||||
for ( char base : BaseUtils.BASES ) {
|
||||
int count = BasicPileup.countBase(base, bases);
|
||||
if ( Utils.countOccurrences(base, genotype) > 0 )
|
||||
on += count;
|
||||
else
|
||||
off += count;
|
||||
//System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype);
|
||||
}
|
||||
|
||||
return new Pair<Integer, Integer>(on, off);
|
||||
}
|
||||
}
|
||||
|
|
@ -6,6 +6,7 @@ import java.util.Random;
|
|||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||
*/
|
||||
public class BaseUtils {
|
||||
public final static char[] BASES = { 'A', 'C', 'G', 'T' };
|
||||
|
||||
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
||||
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
||||
|
|
|
|||
|
|
@ -347,12 +347,35 @@ abstract public class BasicPileup implements Pileup {
|
|||
return null;
|
||||
}
|
||||
|
||||
public static byte consensusBase(String bases) {
|
||||
/**
 * Simple value holder for per-base occurrence counts of A, C, T and G in a
 * pileup string. Fields are package-private and mutable (read directly by
 * consensusBase()).
 */
public static class BaseCounts {
    // Occurrence counts for each canonical base.
    int a, c, t, g;

    public BaseCounts(int a, int c, int t, int g) {
        this.a = a;
        this.c = c;
        this.t = t;
        this.g = g;
    }
}
|
||||
|
||||
/**
 * Count the occurrences of base in bases, delegating to
 * Utils.countOccurrences.
 *
 * NOTE(review): matching is presumably an exact (case-sensitive) character
 * match — callers canonicalize case themselves; confirm against
 * Utils.countOccurrences.
 */
public static int countBase(final char base, final String bases) {
    return Utils.countOccurrences(base, bases);
}
|
||||
|
||||
public static BaseCounts countBases(final String bases) {
|
||||
String canon = bases.toUpperCase();
|
||||
int ACount = Utils.countOccurrences('A', bases);
|
||||
int CCount = Utils.countOccurrences('C', bases);
|
||||
int TCount = Utils.countOccurrences('T', bases);
|
||||
int GCount = Utils.countOccurrences('G', bases);
|
||||
return new BaseCounts(Utils.countOccurrences('A', canon),
|
||||
Utils.countOccurrences('C', canon),
|
||||
Utils.countOccurrences('T', canon),
|
||||
Utils.countOccurrences('G', canon));
|
||||
}
|
||||
|
||||
public static byte consensusBase(String bases) {
|
||||
BaseCounts counts = countBases( bases );
|
||||
int ACount = counts.a;
|
||||
int CCount = counts.c;
|
||||
int TCount = counts.t;
|
||||
int GCount = counts.g;
|
||||
|
||||
int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount)));
|
||||
if ( ACount == m ) return 'A';
|
||||
|
|
@ -362,3 +385,5 @@ abstract public class BasicPileup implements Pileup {
|
|||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue