diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java index 70a13c52d..5181baea8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java @@ -60,6 +60,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation; import org.broadinstitute.gatk.utils.MannWhitneyU; +import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.exceptions.GATKException; @@ -78,10 +79,11 @@ import java.util.*; */ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation { private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class); - protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe - protected final String printDelim = "|"; - protected final String reducedDelim = ","; - protected AnnotatorCompatible callingWalker; + private final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe + private final String printDelim = "|"; + private final String reducedDelim = ","; //delimiter for list of reduced values + private final String rawDelim = ","; //delimiter for list of raw values + private AnnotatorCompatible callingWalker; @Override public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) { @@ -109,20 +111,21 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn return new HashMap<>(); final Map annotations = new HashMap<>(); - final AlleleSpecificAnnotationData> myData = initializeNewAnnotationData(vc.getAlleles()); - calculateRawData(vc, perReadAlleleLikelihoodMap, myData); - final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap()); + final AlleleSpecificAnnotationData> myRawData = initializeNewRawAnnotationData(vc.getAlleles()); + calculateRawData(vc, perReadAlleleLikelihoodMap, myRawData); + Map> myRankSumStats = calculateRankSum(myRawData.getAttributeMap(), myRawData.getRefAllele()); + final String annotationString = makeRawAnnotationString(vc.getAlleles(), myRankSumStats); annotations.put(getRawKeyName(), annotationString); return annotations; } - protected void parseRawDataString(final ReducibleAnnotationData> myData) { + protected void parseRawDataString(final ReducibleAnnotationData myData) { final String rawDataString = myData.getRawData(); String rawDataNoBrackets; - final Map> perAlleleValues = new HashMap<>(); + final Map perAlleleValues = new HashMap<>(); //Initialize maps for (final Allele current : myData.getAlleles()) { - perAlleleValues.put(current, new CompressedDataList()); + perAlleleValues.put(current, new Histogram()); } //Map gives back list with [] if (rawDataString.charAt(0) == '[') { @@ -131,40 +134,61 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn else { rawDataNoBrackets = rawDataString; } - //rawDataPerAllele is the list of values for each allele (each of variable length) + //rawDataPerAllele is a per-sample list of the rank sum statistic for each allele final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim); for (int i=0; i alleleList = perAlleleValues.get(myData.getAlleles().get(i)); + final Histogram alleleList = perAlleleValues.get(myData.getAlleles().get(i)); final String[] rawListEntriesAsStringVector = alleleData.split(","); - if (rawListEntriesAsStringVector.length %2 != 0) - throw new GATKException("ERROR: rank sum test raw annotation data must occur in pairs"); - for (int j=0; j myData) { + final String rawDataString = myData.getRawData(); + String rawDataNoBrackets; + final Map perAlleleValues = new HashMap<>(); + //Initialize maps + for (final Allele current : myData.getAlleles()) { + perAlleleValues.put(current, new Histogram()); + } + //Map gives back list with [] + if (rawDataString.charAt(0) == '[') { + rawDataNoBrackets = rawDataString.substring(1, rawDataString.length() - 1); + } + else { + rawDataNoBrackets = rawDataString; + } + //rawDataPerAllele is a string representation of the Histogram for each allele in the form value, count, value, count... + final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim); + for (int i=0; i annotations = new HashMap<>(); - final String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap()); + final String annotationString = makeCombinedAnnotationString(vcAlleles, combinedData.getAttributeMap()); annotations.put(getRawKeyName(), annotationString); return annotations; } - protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List vcAlleles) { + protected AlleleSpecificAnnotationData initializeNewRawAnnotationData(final List vcAlleles) { Map> perAlleleValues = new HashMap<>(); for (Allele a : vcAlleles) { perAlleleValues.put(a, new CompressedDataList()); } - AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString()); + final AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString()); ret.setAttributeMap(perAlleleValues); return ret; } - protected void combineAttributeMap(final ReducibleAnnotationData> toAdd, final ReducibleAnnotationData> combined) { + protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List vcAlleles) { + Map perAlleleValues = new HashMap<>(); + for (Allele a : vcAlleles) { + perAlleleValues.put(a, new Histogram()); + } + final AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString()); + ret.setAttributeMap(perAlleleValues); + return ret; + } + + protected void combineAttributeMap(final ReducibleAnnotationData toAdd, final ReducibleAnnotationData combined) { for (final Allele a : combined.getAlleles()) { if (toAdd.hasAttribute(a)) { - final CompressedDataList alleleData = combined.getAttribute(a); - alleleData.add(toAdd.getAttribute(a)); - combined.putAttribute(a, alleleData); + final Histogram alleleData = combined.getAttribute(a); + if (toAdd.getAttribute(a) != null) { + alleleData.add(toAdd.getAttribute(a)); + combined.putAttribute(a, alleleData); + } } } } - protected String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleValues) { + protected String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleValues) { String annotationString = ""; - for (int i =0; i< vcAlleles.size(); i++) { - if (i!=0) - annotationString += printDelim; - CompressedDataList alleleValues = perAlleleValues.get(vcAlleles.get(i)); - annotationString += alleleValues.toString(); + for (int i = 0; i< vcAlleles.size(); i++) { + if (vcAlleles.get(i).isReference()) + continue; + if (i != 0) + annotationString += printDelim; + final List alleleValue = perAlleleValues.get(vcAlleles.get(i)); + //can be null if there are no ref reads + if (alleleValue == null) + continue; + annotationString += formatListAsString(alleleValue); } return annotationString; } - protected String makeReducedAnnotationString(VariantContext vc, Map perAltRankSumResults) { + protected String makeCombinedAnnotationString(final List vcAlleles, final Map perAlleleValues) { + String annotationString = ""; + for (int i = 0; i< vcAlleles.size(); i++) { + if (vcAlleles.get(i).isReference()) + continue; + if (i != 0) + annotationString += printDelim; + final Histogram alleleValue = perAlleleValues.get(vcAlleles.get(i)); + //can be null if there are no ref reads + if (alleleValue == null) + continue; + annotationString += alleleValue.toString(); + } + return annotationString; + } + + private String makeReducedAnnotationString(final VariantContext vc, final Map perAltRankSumResults) { String annotationString = ""; for (final Allele a : vc.getAlternateAlleles()) { if (!annotationString.isEmpty()) @@ -242,8 +299,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn return new HashMap<>(); final Map annotations = new HashMap<>(); - final AlleleSpecificAnnotationData> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData); - parseRawDataString(myData); + final AlleleSpecificAnnotationData myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData); + parseCombinedDataString(myData); final Map perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele()); //shortcut for no ref values @@ -254,27 +311,37 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn return annotations; } - public void calculateRawData(VariantContext vc, Map pralm, ReducibleAnnotationData myData) { + public void calculateRawData(final VariantContext vc, final Map pralm, final ReducibleAnnotationData myData) { + if( vc.getGenotypes().getSampleNames().size() != 1) + throw new IllegalStateException("Calculating raw data for allele-specific rank sums requires variant context input with exactly one sample, as in a gVCF."); + if(pralm == null) return; final Map> perAlleleValues = myData.getAttributeMap(); for ( final PerReadAlleleLikelihoodMap likelihoodMap : pralm.values() ) { if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) { - fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues); + fillQualsFromLikelihoodMap(vc.getGenotype(0).getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues); } } } + /** + * + * @param alleles the alleles that were called in the genotype for this sample + * @param refLoc + * @param likelihoodMap + * @param perAlleleValues + */ private void fillQualsFromLikelihoodMap(final List alleles, final int refLoc, final PerReadAlleleLikelihoodMap likelihoodMap, final Map> perAlleleValues) { for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) { final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); - if ( ! a.isInformative() ) - continue; // read is non-informative + if ( ! a.isInformative() || ! alleles.contains(a.getMostLikelyAllele())) + continue; // read is non-informative or supports an allele that was not called final GATKSAMRecord read = el.getKey(); if ( isUsableRead(read, refLoc) ) { @@ -289,8 +356,8 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn } } - public Map calculateReducedData(final Map> perAlleleValues, final Allele ref) { - final Map perAltRankSumResults = new HashMap<>(); + public Map> calculateRankSum(final Map> perAlleleValues, final Allele ref) { + final Map> perAltRankSumResults = new HashMap<>(); //shortcut to not try to calculate rank sum if there are no reads that unambiguously support the ref if (perAlleleValues.get(ref).isEmpty()) return perAltRankSumResults; @@ -299,12 +366,12 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn continue; final MannWhitneyU mannWhitneyU = new MannWhitneyU(); //load alts (series 1) - List alts = new ArrayList<>(); + final List alts = new ArrayList<>(); for (final Number qual : perAlleleValues.get(alt)) { alts.add((double) qual.intValue()); } //load refs (series 2) - List refs = new ArrayList<>(); + final List refs = new ArrayList<>(); for (final Number qual : perAlleleValues.get(ref)) { refs.add((double) qual.intValue()); } @@ -322,9 +389,31 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn } // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) final MannWhitneyU.Result result = mannWhitneyU.test(convertToArray(alts), convertToArray(refs), MannWhitneyU.TestType.FIRST_DOMINATES); - perAltRankSumResults.put(alt, result.getZ()); + perAltRankSumResults.put(alt, Collections.singletonList(result.getZ())); } return perAltRankSumResults; } + public Map calculateReducedData(final Map perAlleleValues, final Allele ref) { + final Map perAltRankSumResults = new HashMap<>(); + for (final Allele alt : perAlleleValues.keySet()) { + if (alt.equals(ref, false)) + continue; + if (perAlleleValues.get(alt) != null) + perAltRankSumResults.put(alt, perAlleleValues.get(alt).median()); + } + return perAltRankSumResults; + } + + public String formatListAsString(final List rankSumValues) { + String formattedString = ""; + for (int i=0; i dataList = new CompressedDataList<>(); + + public Histogram() { + this.binSize = 0.1; + precisionFormat = "%.1f"; + } + + public Histogram(final Double binSize) { + this.binSize = binSize; + precisionFormat = "%." + Math.round(-Math.log10(binSize)) + "f"; + } + + public void add(final Double d) { + long binKey = getBinnedValue(d); + if (isValidBinKey(binKey)) + dataList.add((int)binKey); + else + throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram."); + } + + public void add(final Double d, final int count) { + if (count < 1) + throw new GATKException("Cannot add non-positive counts to Histogram."); + long binKey = getBinnedValue(d); + if (isValidBinKey(binKey)) + dataList.add((int)binKey, count); + else + throw new GATKException("Histogram values are suspiciously extreme. Failed to add " + d + " to the Histogram."); + } + + public void add(final Histogram h) { + if (!this.binSize.equals(h.binSize)) + throw new GATKException("Histogram bin sizes are mismatched -- cannot add bin size " + this.binSize + " to " + h.binSize); + this.dataList.add(h.dataList); + } + + public Integer get(final Double d) { + long binKey = getBinnedValue(d); + if (isValidBinKey(binKey)) + return dataList.getValueCounts().get((int)binKey); + else + throw new GATKException("Requested value is suspiciously extreme. Failed to retrieve " + d + " from the Histogram."); + } + + /** + * + * @return may be null if Histogram is empty + */ + + public Double median() { + int numItems = 0; + for(final int count : dataList.valueCounts.values()) { + numItems += count; + } + boolean oddNumberValues = true; + if(numItems % 2 == 0) + oddNumberValues = false; + int medianIndex = (numItems+1)/2; + + int counter = 0; + Double firstMedian = null; + for(final Integer key : dataList.valueCounts.keySet()) { + counter += dataList.valueCounts.get(key); + if( counter > medianIndex) { + if (firstMedian == null) + return key*binSize; + else { + return (firstMedian+key)/2.0*binSize; + } + } + if( counter == medianIndex) { + if (oddNumberValues) + return key*binSize; + else { + firstMedian = (double) key; + } + } + } + return null; + } + + private long getBinnedValue(double d) { + return Math.round(Math.floor((d+BIN_EPSILON*binSize)/binSize)); //add a little epsilon before division so values exactly on bin boundaries will stay in the same bin + } + + private boolean isValidBinKey(long binnedValue) { + return binnedValue <= Integer.MAX_VALUE && binnedValue >= Integer.MIN_VALUE; + } + + @Override + public String toString(){ + printDelim = ","; + String str = ""; + Object[] keys = dataList.valueCounts.keySet().toArray(); + Arrays.sort(keys); + for (Object i: keys){ + if(!str.isEmpty()) + str+=printDelim; + str+=(String.format(precisionFormat,(double)(int)i*binSize)+printDelim+dataList.valueCounts.get(i)); //key i needs to be output with specified precision + } + return str; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReducibleAnnotationData.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReducibleAnnotationData.java index bf998078d..b50258bfb 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReducibleAnnotationData.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReducibleAnnotationData.java @@ -26,6 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.exceptions.GATKException; import java.util.*; @@ -102,4 +103,17 @@ public class ReducibleAnnotationData { */ public Map getAttributeMap() {return Collections.unmodifiableMap(attributeMap);} + public void validateAllelesList() { + boolean foundRef = false; + for (final Allele a : this.getAlleles()) { + if (a.isReference()) { + if (foundRef) + throw new GATKException("ERROR: multiple reference alleles found in annotation data\n"); + foundRef = true; + } + } + if (!foundRef) + throw new GATKException("ERROR: no reference alleles found in annotation data\n"); + } + } diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/HistogramUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/HistogramUnitTest.java new file mode 100644 index 000000000..3a9390433 --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/HistogramUnitTest.java @@ -0,0 +1,103 @@ +/* +* Copyright 2012-2016 Broad Institute, Inc. +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Created by gauthier on 11/1/16. + */ +public class HistogramUnitTest { + private final Double EPSILON = 0.001; + + @Test + public void testAdd() throws Exception { + Histogram bimodalHist = new Histogram(); + for(int i=0; i<=100; i++) { + bimodalHist.add(1+i/1000.0); + } + Assert.assertEquals(bimodalHist.get(1.0), new Integer(100), ""); + Assert.assertEquals(bimodalHist.get(1.1), new Integer(1), ""); + } + + @Test + public void testAddingQuantizedValues() throws Exception { + Histogram hist = new Histogram(); + for(int i=0; i<100; i++) { + hist.add(1.2); + } + Assert.assertEquals(hist.get(1.2), new Integer(100)); + Assert.assertEquals(hist.median(), 1.2, EPSILON); + } + + @Test + public void testBulkAdd() throws Exception { + Histogram bimodalHist = new Histogram(); + for(int i=0; i<=100; i++) { + bimodalHist.add(1+i/1000.0, 2); + } + Assert.assertEquals(bimodalHist.get(1.0), new Integer(200), ""); + Assert.assertEquals(bimodalHist.get(1.1), new Integer(2), ""); + } + + @Test + public void testMedianOfEvens() throws Exception { + Histogram bimodalHist = new Histogram(); + for(int i = 0; i<10; i++) { + bimodalHist.add(10.0); + bimodalHist.add(20.0); + } + + Assert.assertEquals(bimodalHist.median(), 15.0, EPSILON, ""); + } + + @Test + public void testMedianOfOdds() throws Exception { + Histogram bimodalHist = new Histogram(); + for(int i = 0; i<10; i++) { + bimodalHist.add(10.0); + bimodalHist.add(20.0); + } + bimodalHist.add(20.0); + + Assert.assertEquals(bimodalHist.median(), 20.0, EPSILON, ""); + } + + @Test + public void testMedianOfEmptyHist() throws Exception { + Histogram empty = new Histogram(); + Assert.assertNull(empty.median()); + } + + @Test + public void testMedianOfSingleItem() throws Exception { + Histogram singleItem = new Histogram(); + singleItem.add(20.0); + Assert.assertEquals(singleItem.median(), 20.0, EPSILON, ""); + } + +} \ No newline at end of file