diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java index ecc921a63..8bb8da6d3 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -110,6 +110,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map(); } + public TabularROD(final String name, ArrayList header) { + this(name); + this.header = header; + } + /** * Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data * must be bound to each corresponding header[i] field before the object is really usable. @@ -155,7 +160,11 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map header = null; + return readHeader(source); + } + + public static ArrayList readHeader(final File source) throws FileNotFoundException { + ArrayList header = null; int linesLookedAt = 0; xReadLines reader = new xReadLines(source); @@ -186,9 +195,16 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map avdata) { - - } - - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java index 865f9dad7..4c529caba 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/TransitionTranversionAnalysis.java @@ -20,55 +20,29 @@ import java.util.List; * */ public class TransitionTranversionAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis, PopulationAnalysis { - int N_TRANSITION_TRANVERSION_BINS = 100; - Histogram transitions; - Histogram transversions; + long nTransitions = 0, nTransversions = 0; public TransitionTranversionAnalysis() { super("transitions_transversions"); - transitions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); - transversions = new Histogram(N_TRANSITION_TRANVERSION_BINS, 0.0, 1.0, 0); } public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) { if ( eval != null && eval.isSNP() ) { char refBase = eval.getRefSnpFWD(); char altBase = eval.getAltSnpFWD(); - //System.out.printf("%c %c%n", refBase, altBase); - //int i = transition_transversion_bin(dbsnp.getHeterozygosity()); - //System.out.printf("MAF = %f => %d%n", dbsnp.getMAF(), i); - //EnumMap bin = transition_transversion_counts[i]; BaseUtils.BaseSubstitutionType subType = BaseUtils.SNPSubstitutionType(refBase, altBase); - Histogram h = subType == BaseUtils.BaseSubstitutionType.TRANSITION ? transitions : transversions; - double het = eval.getHeterozygosity(); - h.setBin(het, h.getBin(het) + 1); - //int sit = bin.get(BaseUtils.BaseSubstitutionType.TRANSITION); - //int ver = bin.get(BaseUtils.BaseSubstitutionType.TRANSVERSION); - //System.out.printf("%s %.2f %s%n", subType, dbsnp.getHeterozygosity(), h.x2bin(logHet), dbsnp.toString()); + if ( subType == BaseUtils.BaseSubstitutionType.TRANSITION ) + nTransitions++; + else + nTransversions++; } return null; } public List done() { - int nTransitions = 0; - int nTransversions = 0; - List s = new ArrayList(); - for ( int i = 0; i < N_TRANSITION_TRANVERSION_BINS; i++ ) { - //double avHet = Math.pow(10, transitions.bin2x(i)); - double avHet = transitions.bin2x(i); - if ( avHet > 0.5 ) break; - - int sit = transitions.getBin(i); - int ver = transversions.getBin(i); - nTransitions += sit; - nTransversions += ver; - double ratio = (float)sit/ver; - //s.append(String.format("%s %s %.2f %d %d %f%n", commentLine, prefix, avHet, sit, ver, ratio)); - } - s.add(String.format("transitions %d", nTransitions)); s.add(String.format("transversions %d", nTransversions)); s.add(String.format("ratio %.2f", nTransitions / (1.0 * nTransversions))); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java new file mode 100755 index 000000000..36ea0ae07 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/RatioFilter.java @@ -0,0 +1,317 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variants; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.rodVariants; +import org.broadinstitute.sting.gatk.refdata.TabularROD; +import org.broadinstitute.sting.utils.*; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Set; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import edu.mit.broad.picard.util.MathUtil; + +class GenotypeFeatureData extends ArrayList { + public enum Tail { LeftTailed, RightTailed, TwoTailed } + + private String genotype = null; + private Integer[] permutation = null; + private Logger logger = null; + + private Tail tail = null; + + double lowThreshold = -1; + double highThreshold = -1; + + public GenotypeFeatureData(final String genotype, Logger logger, Tail tail ) { + super(); + this.genotype = genotype; + this.logger = logger; + this.tail = tail; + } + + public Tail getTail() { + return tail; + } + + public double getLowThreshold() { + return lowThreshold; + } + + public void setLowThreshold(double lowThreshold) { + this.lowThreshold = lowThreshold; + } + + public double getHighThreshold() { + return highThreshold; + } + + public void setHighThreshold(double highThreshold) { + this.highThreshold = highThreshold; + } + + public void finalizeData() { + permutation = Utils.SortPermutation(this); + } + + public void determineThresholds(final double pvalueLimit) { + if ( pvalueLimit <= 0.0 || pvalueLimit >= 1.0 ) + throw new RuntimeException(String.format("Invalid pValue limit = %f", pvalueLimit)); + + switch ( tail ) { + case LeftTailed: + lowThreshold = highThreshold = determineOneTailThreshold(pvalueLimit); + break; + case RightTailed: + lowThreshold = highThreshold = determineOneTailThreshold(1 - pvalueLimit); + break; + case TwoTailed: + lowThreshold = determineOneTailThreshold(pvalueLimit/2); + highThreshold = determineOneTailThreshold(1-pvalueLimit/2); + break; + } + + //logger.info(String.format(" %d of %d elements are >= threshold = %f (%f percent)", + // nElementsAboveThreshold(), nElements(), getHighThreshold(), nElementsAboveThreshold() / ( 1.0 * nElements()))); + //logger.info(String.format(" %d of %d elements are <= threshold = %f (%f percent)", + // nElementsBelowThreshold(), nElements(), getLowThreshold(), nElementsBelowThreshold() / ( 1.0 * nElements()))); + } + + private double determineOneTailThreshold(final double pvalueLimit) { + double trueSortedIndexLimit = pvalueLimit * nElements(); + int sortedIndexLimit = (int)(pvalueLimit > 0.5 ? Math.ceil(trueSortedIndexLimit) : Math.floor(trueSortedIndexLimit)); + double threshold = get(itemIndex(sortedIndexLimit)); + + logger.debug(String.format("determineTailThreshold(%s, %f) => %f => %d => %f", genotype, pvalueLimit, trueSortedIndexLimit, sortedIndexLimit, threshold)); + + return threshold; + } + + public int nElementsBelowThreshold() { return nElementsPassingThreshold(true, false); } + public int nElementsAboveThreshold() { return nElementsPassingThreshold(false, true); } + + public int nElementsPassingThreshold(boolean keepBelow, boolean keepAbove) { + int count = 0; + + for ( double stat : this ) { + if ( passesThreshold(stat, keepBelow, keepAbove) ) count++; + } + + return count; + } + + public boolean passesThreshold(double value, boolean keepBelow, boolean keepAbove) { + //System.out.printf("passThreshold %s, %b, %b = %b%n", value, keepBelow, keepAbove, value < threshold && keepBelow || value >= threshold && keepAbove); + return value <= getHighThreshold() && keepBelow || value >= getLowThreshold() && keepAbove; + } + + public boolean passesThreshold(double value) { + switch ( tail ) { + case LeftTailed: + return value >= getLowThreshold(); + case RightTailed: + return value <= getHighThreshold(); + case TwoTailed: + return value >= getLowThreshold() && value < getHighThreshold(); + default: + return true; + } + } + + public double pValue(double value) { + return 1.0; + } + + public int itemIndex(int sortedIndex) { + return permutation[sortedIndex]; + } + + public int nElements() { return size(); } + + public String toString() { + return String.format("[GenotypeFeatureData: genotype=%s, tail=%s, lowThreshold=%f, highThreshold=%f]", genotype, tail, lowThreshold, highThreshold); + } +} + + +class ObservationTable extends HashMap { + String statField = null; + File observationsFile = null; + Logger logger = null; + GenotypeFeatureData.Tail tail; + + public ObservationTable(final File observationsFile, final String statField, Logger logger, GenotypeFeatureData.Tail tail ) throws NoSuchFieldException, FileNotFoundException, IOException { + this.observationsFile = observationsFile; + this.statField = statField; + this.logger = logger; + this.tail = tail; + readAllData(observationsFile, statField); + } + + public Set genotypes() { + return keySet(); + } + + public void readAllData(final File observationsFile, final String statField) throws NoSuchFieldException, FileNotFoundException, IOException { + ArrayList header = TabularROD.readHeader(observationsFile); + + //logger.info(String.format("Starting to read table %s", observationsFile)); + + for ( String line : new xReadLines(observationsFile) ) { + TabularROD d = new TabularROD("ignoreMe", header); + String[] parts = line.split("\\s+"); + if ( d.parseLine(header, parts) ) { + if (! d.containsKey(statField)) { + throw new NoSuchFieldException(String.format("Could not find field %s in line %s", statField, line)); + } + double stat = Double.valueOf(d.get(statField)); + + final String genotype = d.get("genotype"); + GenotypeFeatureData gfd = getData(genotype); + gfd.add(stat); + } + } + + for ( GenotypeFeatureData gfd : this.values() ) { + gfd.finalizeData(); + } + + //logger.info(String.format("Read table %s, found %d genotypes/data pairs in %s field", + // observationsFile, size(), statField)); + } + + public GenotypeFeatureData getData(final String genotype) { + if ( ! containsKey(genotype) ) { + GenotypeFeatureData gfd = new GenotypeFeatureData(genotype, logger, tail); + put(genotype, gfd); + } + + return get(genotype); + } + + public String toString() { + return String.format("[ObservationTable: file=%s, field=%s, nGenotypes=%d]", observationsFile, statField, size()); + } +} + +public abstract class RatioFilter implements VariantExclusionCriterion { + protected double pvalueLimit = -1; + protected File observationsFile = null; + protected String statField = null; // "AlleleRatio"; + protected Logger logger = null; // Logger.getLogger(RatioFilter.class); + protected ObservationTable dataTable = null; + protected String name = null; + protected GenotypeFeatureData.Tail tail = null; + + + /** + * A short-term hack to stop the systme from rejecting poorly covered sites, under the assumption + * that the ratio test is poorly determined without at least MIN_COUNTS_TO_APPLY_TEST. To be + * replaced by the more robust sampling approach outlined below + */ + final private static int MIN_COUNTS_TO_APPLY_TEST = 20; + + final static boolean integrateOverSamplingProbabilities = true; + + public RatioFilter(final String name, final String statField, Class myClass, GenotypeFeatureData.Tail tail ) { + this.name = name; + this.statField = statField; + this.tail = tail; + logger = Logger.getLogger(myClass); + } + + public void initialize(String arguments) { + if (arguments != null && !arguments.isEmpty()) { + String[] argPieces = arguments.split(","); + try { + pvalueLimit = Double.valueOf(argPieces[0]); + observationsFile = new File(argPieces[1]); + + dataTable = new ObservationTable(observationsFile, statField, logger, tail); + + logger.info(String.format("Initialized data for ratio filter %s: %s", name, dataTable)); + for ( String genotype : dataTable.genotypes() ) { + GenotypeFeatureData gfd = dataTable.get(genotype); + gfd.determineThresholds(pvalueLimit); + logger.info(gfd.toString()); + } + } catch ( NumberFormatException e ) { + throw new RuntimeException(String.format("Couldn't parse pValue limit from %s", argPieces[0]), e); + } catch ( FileNotFoundException e ) { + throw new RuntimeException("Couldn't open ObservationTable " + observationsFile, e); + } catch ( NoSuchFieldException e ) { + throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile, e); + } catch ( IOException e ) { + throw new RuntimeException("Couldn't parse ObservationTable " + observationsFile,e ); + } + } + } + + protected abstract boolean applyToVariant(rodVariants variant); + protected abstract Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant); + + public boolean exclude(char ref, LocusContext context, rodVariants variant) { + boolean exclude = false; + + // + // todo -- need to calculate a significance value for the chance that the + // todo -- observed first and second counts are reliably not passing the threshold + // todo -- basically, we need to sample with replacement from all first / second pairs + // todo -- and only conclude that the variant fails the test if the probability of failing + // todo -- across all samples is greater than some P value, like 0.05 + // + ReadBackedPileup pileup = new ReadBackedPileup(ref, context); + if ( applyToVariant(variant) ) { + Pair counts = scoreVariant(ref, pileup, variant); + GenotypeFeatureData gfd = dataTable.get(variant.getBestGenotype()); + + if (integrateOverSamplingProbabilities) + exclude = integralExclude(gfd, counts); + else + exclude = pointEstimateExclude(gfd, counts); + + // + // for printing only + // + int n = counts.first + counts.second; + double value = counts.first / (1.0 * counts.first + counts.second); + logger.info(String.format("%s: counts1=%d (%.2f), counts2=%d (%.2f), n=%d, value=%f, %s, exclude=%b, bases=%s", + name, counts.first, counts.first / (0.01 * n), counts.second, counts.second / (0.01 * n), n, + value, gfd, exclude, pileup.getBases())); + } + + return exclude; + } + + private final static double SEARCH_INCREMENT = 0.01; + private final static double integralPValueThreshold = 0.05; + + private boolean integralExclude(GenotypeFeatureData gfd, Pair counts ) { + double sumExclude = 0.0, sumP = 0.0; + int n = counts.first + counts.second; + for ( double r = 0.0; r <= 1.0; r += SEARCH_INCREMENT ) { + double p = MathUtils.binomialProbability(counts.first, n, r); + sumP += p; + boolean exclude = ! gfd.passesThreshold(r); + sumExclude += p * (exclude ? 1.0 : 0.0); + + //System.out.printf("integral: k=%d, n=%d, r=%f, p=%f, sumP = %f, exclude=%b | sum=%f, percentExcluded=%f%n", + // counts.first, n, r, p, sumP, exclude, sumExclude, sumExclude / sumP); + } + + double percentExcluded = sumExclude / sumP; + return 1 - percentExcluded <= integralPValueThreshold ; + } + + private boolean pointEstimateExclude(GenotypeFeatureData gfd, Pair counts ) { + int n = counts.first + counts.second; + if ( n < MIN_COUNTS_TO_APPLY_TEST ) return false; + double value = counts.first / (1.0 * counts.first + counts.second); + //double value = counts.first / (1.0 * Math.max(counts.second, 1.0)); + return ! gfd.passesThreshold(value); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java new file mode 100755 index 000000000..651b9b8e2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECAlleleBalance.java @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variants; + +import org.broadinstitute.sting.gatk.refdata.rodVariants; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Pair; + +public class VECAlleleBalance extends RatioFilter { + final private static String statField = "AlleleRatio"; + final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.TwoTailed; + + + public VECAlleleBalance() { + super("AlleleBalance", statField, VECAlleleBalance.class, tail); + } + + /** + * We can only filter out het calls with the AlleleBalance filter + * + * @param variant + * @return + */ + protected boolean applyToVariant(rodVariants variant) { + String genotype = variant.getBestGenotype(); + return genotype.charAt(0) != genotype.charAt(1); + } + + /** + * Return the count of bases matching the major (first) and minor (second) alleles as a pair. + * + * @param ref + * @param pileup + * @param variant + * @return + */ + protected Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { + final String genotype = variant.getBestGenotype(); + final String bases = pileup.getBases(); + + if ( genotype.length() > 2 ) + throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype)); + + + char a = genotype.toUpperCase().charAt(0); + char b = genotype.toUpperCase().charAt(1); + int aCount = Utils.countOccurrences(a, bases.toUpperCase()); + int bCount = Utils.countOccurrences(b, bases.toUpperCase()); + + int major = Math.max(aCount, bCount); + int minor = Math.min(aCount, bCount); + + return new Pair(major, minor); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java new file mode 100755 index 000000000..560decb8d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variants/VECOnOffGenotypeRatio.java @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variants; + +import org.broadinstitute.sting.gatk.refdata.rodVariants; +import org.broadinstitute.sting.utils.*; + +public class VECOnOffGenotypeRatio extends RatioFilter { + final private static String statField = "onOffRatio"; + final private static GenotypeFeatureData.Tail tail = GenotypeFeatureData.Tail.LeftTailed; + + public VECOnOffGenotypeRatio() { + super("On/Off Genotype Ratio", statField, VECOnOffGenotypeRatio.class, tail); + } + + /** + * On/off genotype filters can be applied to any genotype + * + * @param variant + * @return + */ + protected boolean applyToVariant(rodVariants variant) { + return true; + } + + /** + * Return the counts of bases that are on (matching the bestGenotype) and off (not matching the + * best genotype). On are in the first field, off in the second. + * + * @param ref + * @param pileup + * @param variant + * @return + */ + protected Pair scoreVariant(char ref, ReadBackedPileup pileup, rodVariants variant) { + final String genotype = variant.getBestGenotype().toUpperCase(); + final String bases = pileup.getBases(); + + if ( genotype.length() > 2 ) + throw new IllegalArgumentException(String.format("Can only handle diploid genotypes: %s", genotype)); + + + int on = 0, off = 0; + + for ( char base : BaseUtils.BASES ) { + int count = BasicPileup.countBase(base, bases); + if ( Utils.countOccurrences(base, genotype) > 0 ) + on += count; + else + off += count; + //System.out.printf("count = %d, on=%d, off=%d for %c in %s%n", count, on, off, base, genotype); + } + + return new Pair(on, off); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index de3acd999..cfc475031 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -6,6 +6,7 @@ import java.util.Random; * BaseUtils contains some basic utilities for manipulating nucleotides. */ public class BaseUtils { + public final static char[] BASES = { 'A', 'C', 'G', 'T' }; /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). diff --git a/java/src/org/broadinstitute/sting/utils/BasicPileup.java b/java/src/org/broadinstitute/sting/utils/BasicPileup.java index 2d3d92cc0..972214a45 100755 --- a/java/src/org/broadinstitute/sting/utils/BasicPileup.java +++ b/java/src/org/broadinstitute/sting/utils/BasicPileup.java @@ -347,12 +347,35 @@ abstract public class BasicPileup implements Pileup { return null; } - public static byte consensusBase(String bases) { + public static class BaseCounts { + int a, c, t, g; + + public BaseCounts(int a, int c, int t, int g) { + this.a = a; + this.c = c; + this.t = t; + this.g = g; + } + } + + public static int countBase(final char base, final String bases) { + return Utils.countOccurrences(base, bases); + } + + public static BaseCounts countBases(final String bases) { String canon = bases.toUpperCase(); - int ACount = Utils.countOccurrences('A', bases); - int CCount = Utils.countOccurrences('C', bases); - int TCount = Utils.countOccurrences('T', bases); - int GCount = Utils.countOccurrences('G', bases); + return new BaseCounts(Utils.countOccurrences('A', canon), + Utils.countOccurrences('C', canon), + Utils.countOccurrences('T', canon), + Utils.countOccurrences('G', canon)); + } + + public static byte consensusBase(String bases) { + BaseCounts counts = countBases( bases ); + int ACount = counts.a; + int CCount = counts.c; + int TCount = counts.t; + int GCount = counts.g; int m = Math.max(ACount, Math.max(CCount, Math.max(TCount, GCount))); if ( ACount == m ) return 'A'; @@ -362,3 +385,5 @@ abstract public class BasicPileup implements Pileup { return 0; } } + +