From 09ac3779d63625f390f42a50be8b5a72e84c7718 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 24 Jul 2014 23:42:53 -0400 Subject: [PATCH 1/5] Added ReadLikelihoods component to substitute Map. It uses a more efficient java array[] based implementation and encapsulates operations perform with such a read-likelihood collection such as marginalization, filtering by position, poor modeling or capping worst likelihoods and so forth. Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 --- .../org/broadinstitute/gatk/utils/Utils.java | 255 +++ .../gatk/utils/genotyper/ReadLikelihoods.java | 1502 +++++++++++++++++ 2 files changed, 1757 insertions(+) create mode 100644 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java index 8fdec27b4..b4c0c75fa 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java @@ -34,6 +34,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.text.TextFormattingUtils; +import java.lang.reflect.Array; import java.math.BigInteger; import java.net.InetAddress; import java.security.MessageDigest; @@ -882,4 +883,258 @@ public class Utils { return false; return true; } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * @param original the original array to splice. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * + * @param the array type. 
+ * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove}'s length is different to {@code original}'s, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T original, final boolean[] remove) { + return skimArray(original,0,null,0,remove,0); + } + + /** + * Skims out positions of an array returning a shorter one with the remaining positions in the same order. + * + *

+ * If the {@code dest} array provided is not long enough a new one will be created and returned with the + * same component type. All elements before {@code destOffset} will be copied from the input to the + * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where + * the positions preceding {@code destOffset} will be left with the default value. The component type + * will match the one of the {@code source} array. + *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, final boolean[] remove, final int removeOffset) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + final int length = Array.getLength(source) - sourceOffset; + if (length < 0) + throw new IllegalArgumentException("the source offset goes beyond the source array length"); + return skimArray(source,sourceOffset,dest,destOffset,remove,removeOffset,length); + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * + *

+ * If the {@code dest} array provided is not long enough a new one will be created and returned with the + * same component type. All elements before {@code destOffset} will be copied from the input to the + * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where + * the positions preceding {@code destOffset} will be left with the default value. The component type + * will match the one of the {@code source} array. + *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * @param length the total number of position in {@code source} to consider. Thus only the {@code sourceOffset} to + * {@code sourceOffset + length - 1} region will be skimmed. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, + final boolean[] remove, final int removeOffset, final int length) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + if (remove == null) + throw new IllegalArgumentException("the remove array cannot be null"); + if (sourceOffset < 0) + throw new IllegalArgumentException("the source array offset cannot be negative"); + if (destOffset < 0) + throw new IllegalArgumentException("the destination array offset cannot be negative"); + if (removeOffset < 0) + throw new IllegalArgumentException("the remove array offset cannot be negative"); + if (length < 0) + throw new IllegalArgumentException("the length provided cannot be negative"); + + final int removeLength = Math.min(remove.length - removeOffset,length); + + if (removeLength < 0) + throw new IllegalArgumentException("the remove offset provided falls beyond the remove array end"); + + + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) 
source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + + final Class destClazz = skimArrayDetermineDestArrayClass(dest, sourceClazz); + + final int sourceLength = Array.getLength(source); + + if (sourceLength < length + sourceOffset) + throw new IllegalArgumentException("the source array is too small considering length and offset"); + + // count how many positions are to be removed. + + int removeCount = 0; + + final int removeEnd = removeLength + removeOffset; + for (int i = removeOffset; i < removeEnd; i++) + if (remove[i]) removeCount++; + + + final int newLength = length - removeCount; + + + @SuppressWarnings("unchecked") + final T result = skimArrayBuildResultArray(dest, destOffset, destClazz, newLength); + // No removals, just copy the whole thing. + + if (removeCount == 0) + System.arraycopy(source,sourceOffset,result,destOffset,length); + else if (length > 0) { // if length == 0 nothing to do. + int nextOriginalIndex = 0; + int nextNewIndex = 0; + int nextRemoveIndex = removeOffset; + while (nextOriginalIndex < length && nextNewIndex < newLength) { + while (nextRemoveIndex < removeEnd && remove[nextRemoveIndex++]) { nextOriginalIndex++; } // skip positions to be spliced. + // Since we make the nextNewIndex < newLength check in the while condition + // there is no need to include the following break, as is guaranteed not to be true: + // if (nextOriginalIndex >= length) break; // we reach the final (last positions are to be spliced. 
+ final int copyStart = nextOriginalIndex; + while (++nextOriginalIndex < length && (nextRemoveIndex >= removeEnd || !remove[nextRemoveIndex])) { nextRemoveIndex++; } + final int copyEnd = nextOriginalIndex; + final int copyLength = copyEnd - copyStart; + System.arraycopy(source, sourceOffset + copyStart, result, destOffset + nextNewIndex, copyLength); + nextNewIndex += copyLength; + } + } + return result; + } + + private static T skimArrayBuildResultArray(final T dest, final int destOffset, final Class destClazz, final int newLength) { + @SuppressWarnings("unchecked") + final T result; + + if (dest == null) + result = (T) Array.newInstance(destClazz.getComponentType(), newLength + destOffset); + else if (Array.getLength(dest) < newLength + destOffset) { + result = (T) Array.newInstance(destClazz.getComponentType(),newLength + destOffset); + if (destOffset > 0) System.arraycopy(dest,0,result,0,destOffset); + } else + result = dest; + return result; + } + + private static Class skimArrayDetermineDestArrayClass(final T dest, Class sourceClazz) { + final Class destClazz; + if (dest == null) + destClazz = sourceClazz; + else { + destClazz = (Class) dest.getClass(); + if (destClazz != sourceClazz) { + if (!destClazz.isArray()) + throw new IllegalArgumentException("the destination array class must be an array"); + if (sourceClazz.getComponentType().isAssignableFrom(destClazz.getComponentType())) + throw new IllegalArgumentException("the provided destination array class cannot contain values from the source due to type incompatibility"); + } + } + return destClazz; + } + + /** + * Makes a deep clone of the array provided. + * + *

+ * Unlike {@link Arrays#copyOf} or an array {@link Object#clone()}, which create a shallow copy, + * if the array is multi-dimensional each sub array or matrix will be cloned. + *

+ * + *

+ * Notice however that if the base type is an Object type, the base elements themselves won't be cloned. + *

+ * + * @param array the array to deep-clone. + * @param type of the array. + * + * @throws IllegalArgumentException if {@code array} is {@code null} or is not an array. + */ + public static T deepCloneArray(final T array) { + + if (array == null) + throw new IllegalArgumentException(""); + + @SuppressWarnings("unchecked") + final Class clazz = (Class) array.getClass(); + + + if (!clazz.isArray()) + throw new IllegalArgumentException("the input is not an array"); + + final int dimension = calculateArrayDimensions(clazz); + + return deepCloneArrayUnchecked(array,clazz, dimension); + } + + private static int calculateArrayDimensions(final Class clazz) { + if (clazz.isArray()) + return calculateArrayDimensions(clazz.getComponentType()) + 1; + else + return 0; + } + + private static T deepCloneArrayUnchecked(final T array, final Class clazz, final int dimension) { + + + final int length = Array.getLength(array); + + final Class componentClass = clazz.getComponentType(); + + final T result = (T) Array.newInstance(componentClass,length); + + if (dimension <= 1) { + System.arraycopy(array, 0, result, 0, length); + return result; + } + + + final int dimensionMinus1 = dimension - 1; + + for (int i = 0; i < length; i++) + Array.set(result,i,deepCloneArrayUnchecked(Array.get(array,i),componentClass,dimensionMinus1)); + + return result; + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java new file mode 100644 index 000000000..f03dfbe78 --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -0,0 +1,1502 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* 
restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +/** + * Read-likelihoods container implementation based on integer indexed arrays. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoods implements Cloneable { + + /** + * Reads by sample index. Each sub array contains reference to the reads of the ith sample. + */ + private GATKSAMRecord[][] readsBySampleIndex; + + /** + * Indexed per sample, allele and finally read (within sample). + *

+ * valuesBySampleIndex[s][a][r] == lnLk(R_r | A_a) where R_r comes from Sample s. + *

+ */ + private double[][][] valuesBySampleIndex; + + /** + * Sorted list samples. + */ + private final String[] samples; + + /** + * Unmodifiable list version of the samples + */ + private List sampleList; + + /** + * Unmodifiable list version of the alleles + */ + private List
alleleList; + + /** + * Sorted array of alleles. + */ + private A[] alleles; + + /** + * Maps from sample name to its index. + */ + private final Object2IntMap sampleIndex; + + /** + * Maps from allele to its index. + */ + private final Object2IntMap alleleIndex; + + /** + * Maps from each read to its index within the sample. + */ + private final Object2IntMap[] readIndexBySampleIndex; + + /** + * Index of the reference allele if any, otherwise -1. + */ + private final int referenceAlleleIndex; + + /** + * Caches the read-list per sample list returned by {@link #sampleReads} + */ + private final List[] readListBySampleIndex; + + /** + * Sample matrices. + */ + private final Matrix[] sampleMatrices; + + /** + * Constructs a new read-likelihood collection. + * + *

+ * The initial likelihoods for all allele-read combinations are + * 0. + *

+ * + * @param samples all supported samples in the collection. + * @param alleles all supported alleles in the collection. + * @param reads reads stratified per sample. + * + * @throws IllegalArgumentException if any of {@code allele}, {@code samples} + * or {@code reads} is {@code null}, + * or if they contain null values. + */ + @SuppressWarnings("unchecked") + public ReadLikelihoods(final List samples, final List
alleles, + final Map> reads) { + if (alleles == null) + throw new IllegalArgumentException("allele list cannot be null"); + if (samples == null) + throw new IllegalArgumentException("sample list cannot be null"); + if (reads == null) + throw new IllegalArgumentException("read map cannot be null"); + + final int sampleCount = samples.size(); + final int alleleCount = alleles.size(); + this.alleles = alleles.toArray((A[]) new Allele[alleleCount]); + this.samples = samples.toArray(new String[sampleCount]); + readsBySampleIndex = new GATKSAMRecord[sampleCount][]; + readListBySampleIndex = new List[sampleCount]; + valuesBySampleIndex = new double[sampleCount][][]; + referenceAlleleIndex = findReferenceAllele(this.alleles); + + sampleIndex = new Object2IntOpenHashMap<>(sampleCount); + alleleIndex = new Object2IntOpenHashMap<>(alleleCount); + readIndexBySampleIndex = new Object2IntMap[sampleCount]; + + setupIndexes(reads, sampleCount, alleleCount); + + sampleMatrices = (Matrix[]) new Matrix[sampleCount]; + } + + // Add all the indices to alleles, sample and reads in the look-up maps. + private void setupIndexes(final Map> reads, final int sampleCount, final int alleleCount) { + alleleIndex.clear(); + for (int i = 0; i < alleleCount; i++) { + final A allele = alleles[i]; + if (allele == null) + throw new IllegalArgumentException("no allele can be null"); + alleleIndex.put(allele, i); + } + if (alleleIndex.size() != alleleCount) + throw new IllegalArgumentException("repeated alleles"); + + sampleIndex.clear(); + for (int i = 0; i < sampleCount; i++) { + setupSampleData(i, reads, alleleCount); + } + + if (sampleIndex.size() != sampleCount) + throw new IllegalArgumentException("repeated sample names"); + } + + // Assumes that {@link #samples} has been initialized with the sample names. 
+ private void setupSampleData(final int sampleIndex, final Map> readsBySample, + final int alleleCount) { + final String sample = this.samples[sampleIndex]; + if (sample == null) + throw new IllegalArgumentException("no sample can be null"); + this.sampleIndex.put(sample, sampleIndex); + final List reads = readsBySample.get(sample); + readsBySampleIndex[sampleIndex] = reads == null + ? new GATKSAMRecord[0] + : reads.toArray(new GATKSAMRecord[reads.size()]); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + readIndexBySampleIndex[sampleIndex] = new Object2IntOpenHashMap<>(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = readsBySampleIndex[sampleIndex][r]; + if (read == null) + throw new IllegalArgumentException("read cannot be null"); + if (readIndexBySampleIndex[sampleIndex].containsKey(read)) + throw new IllegalArgumentException("repeated read " + read); + readIndexBySampleIndex[sampleIndex].put(readsBySampleIndex[sampleIndex][r], r); + } + + final double[][] sampleValues = new double[alleleCount][sampleReadCount]; + valuesBySampleIndex[sampleIndex] = sampleValues; + } + + /** + * Create an independent copy of this read-likelihoods collection + */ + public ReadLikelihoods clone() { + + final int sampleCount = samples.length; + final int alleleCount = alleles.length; + + final double[][][] newLikelihoodValues = new double[sampleCount][alleleCount][]; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + for (int a = 0; a < alleleCount; a++) + newLikelihoodValues[s][a] = valuesBySampleIndex[s][a].clone(); + } + + // Finally we create the new read-likelihood + 
return new ReadLikelihoods<>(alleles.clone(), samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + // Internally used constructor. + @SuppressWarnings("unchecked") + private ReadLikelihoods(final A[] alleles, final String[] samples, final Object2IntMap sampleIndex, + final GATKSAMRecord[][] readsBySampleIndex, final Object2IntMap[] readIndex, + final double[][][] values) { + this.samples = samples; + this.alleles = alleles; + this.readsBySampleIndex = readsBySampleIndex; + this.valuesBySampleIndex = values; + this.readIndexBySampleIndex = readIndex; + this.readListBySampleIndex = new List[samples.length]; + + this.sampleIndex = sampleIndex; + + this.alleleIndex = new Object2IntOpenHashMap<>(alleles.length); + for (int i = 0; i < alleles.length; i++) + alleleIndex.put(alleles[i],i); + + if (alleleIndex.size() != alleles.length) + throw new IllegalArgumentException("the allele index is null"); + + referenceAlleleIndex = findReferenceAllele(alleles); + sampleMatrices = (Matrix[]) new Matrix[samples.length]; + } + + + // Search for the reference allele, if not found the index is -1. + private int findReferenceAllele(final A[] alleles) { + for (int i = 0; i < alleles.length; i++) + if (alleles[i].isReference()) + return i; + return -1; + } + + /** + * Checks whether a sample is included likelihood collection. + * + * @param sample the query sample. + * + * @throws IllegalArgumentException if {@code sample} is {@code null}. + * @return {@code true} iff the sample is in the likelihood collection. + */ + @SuppressWarnings("unused") + public boolean containsSample(final String sample) { + return sampleIndex.containsKey(sample); + } + + /** + * Returns the index of a sample within the likelihood collection. + * + * @param sample the query sample. + * + * @throws IllegalArgumentException if {@code sample} is {@code null}. + * @return -1 if the allele is not included, 0 or greater otherwise. 
+ */ + public int sampleIndex(final String sample) { + if (sample == null) + throw new IllegalArgumentException("the input allele cannot be null"); + if (sampleIndex.containsKey(sample)) + return sampleIndex.getInt(sample); + else + return -1; + } + + /** + * Number of samples included in the likelihood collection. + * @return 0 or greater. + */ + public int sampleCount() { + return samples.length; + } + + /** + * Returns sample name given its index. + * + * @param sampleIndex query index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is negative. + * + * @return never {@code null}. + */ + public String sample(final int sampleIndex) { + checkSampleIndex(sampleIndex); + return samples[sampleIndex]; + } + + /** + * Checks whether the allele is within the likelihood collection. + * + * @param allele the query allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return {@code true} iff the allele is in the likelihood collection. + */ + @SuppressWarnings("unused") + public boolean containsAllele(final A allele) { + return alleleIndex.containsKey(allele); + } + + /** + * Returns the index of an allele within the likelihood collection. + * + * @param allele the query allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return -1 if the allele is not included, 0 or greater otherwise. + */ + public int alleleIndex(final A allele) { + if (allele == null) + throw new IllegalArgumentException("the input allele cannot be null"); + if (alleleIndex.containsKey(allele)) + return alleleIndex.getInt(allele); + else + return -1; + } + + /** + * Returns number of alleles in the collection. + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleCount() { + return alleles.length; + } + + /** + * Returns the allele given its index. + * + * @param alleleIndex the allele index. + * + * @throws IllegalArgumentException the allele index is {@code null}. 
+ * + * @return never {@code null}. + */ + public A allele(final int alleleIndex) { + checkAlleleIndex(alleleIndex); + return alleles[alleleIndex]; + } + + /** + * Returns the reads that belong to a sample sorted by their index (within that sample). + * + * @param sampleIndex the requested sample. + * @return never {@code null} but perhaps a zero-length array if there is no reads in sample. No element in + * the array will be null. + */ + public List sampleReads(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final List extantList = readListBySampleIndex[sampleIndex]; + if (extantList == null) + return readListBySampleIndex[sampleIndex] = Collections.unmodifiableList(Arrays.asList(readsBySampleIndex[sampleIndex])); + else + return extantList; + } + + /** + * Returns the likelihood matrix for a sample. + *

+ * The matrix is indexed by allele and then by read index. + + *

+     *          result[a][r] == lnLk(Read_r | Allele_a)
+     *     
+ *

+ * + *

+ * The matrix is live and changes to it update the likelihood in the collection, please use with care. + *

+ * + * @param sampleIndex the sample index. + * + * @return never {@code null}. + */ + /* package */ double[][] sampleValues(final int sampleIndex) { + checkSampleIndex(sampleIndex); + return valuesBySampleIndex[sampleIndex]; + } + + + /** + * Returns a read vs allele likelihood matrix corresponding to a sample. + * + * @param sampleIndex target sample. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not a valid sample index. + * + * @return never {@code null} + */ + public Matrix
sampleMatrix(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final Matrix extantResult = sampleMatrices[sampleIndex]; + if (extantResult != null) + return extantResult; + else + return sampleMatrices[sampleIndex] = new SampleMatrix(sampleIndex); + } + + /** + * Adjusts likelihoods so that for each read, the best allele likelihood is 0 and caps the minimum likelihood + * of any allele for each read based on the maximum alternative allele likelihood. + * + * @param bestToZero set the best likelihood to 0, others will be subtracted the same amount. + * @param maximumLikelihoodDifferenceCap maximum difference between the best alternative allele likelihood + * and any other likelihood. + * + * @throws IllegalArgumentException if {@code maximumDifferenceWithBestAlternative} is not 0 or less. + */ + public void normalizeLikelihoods(final boolean bestToZero, final double maximumLikelihoodDifferenceCap) { + if (maximumLikelihoodDifferenceCap >= 0.0 || Double.isNaN(maximumLikelihoodDifferenceCap)) + throw new IllegalArgumentException("the minimum reference likelihood fall cannot be positive"); + + if (maximumLikelihoodDifferenceCap == Double.NEGATIVE_INFINITY && !bestToZero) + return; + + final int alleleCount = alleles.length; + if (alleleCount == 0) // trivial case there is no alleles. + return; + else if (alleleCount == 1 && !bestToZero) + return; + + for (int s = 0; s < valuesBySampleIndex.length; s++) { + final double[][] sampleValues = valuesBySampleIndex[s]; + final int readCount = readsBySampleIndex[s].length; + for (int r = 0; r < readCount; r++) + normalizeLikelihoodsPerRead(bestToZero, maximumLikelihoodDifferenceCap, sampleValues, s, r); + } + } + + // Does the normalizeLikelihoods job for each read. 
+ private void normalizeLikelihoodsPerRead(final boolean bestToZero, final double maximumBestAltLikelihoodDifference, + final double[][] sampleValues, final int sampleIndex, final int readIndex) { + + final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false); + + final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference; + + final double referenceLikelihood = referenceAlleleIndex == -1 ? Double.NEGATIVE_INFINITY : + sampleValues[referenceAlleleIndex][readIndex]; + + + final double bestAbsoluteLikelihood = Math.max(bestAlternativeAllele.likelihood,referenceLikelihood); + + if (bestToZero) { + if (bestAbsoluteLikelihood == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] = 0; + else if (worstLikelihoodCap != Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] = (sampleValues[a][readIndex] < worstLikelihoodCap ? worstLikelihoodCap : sampleValues[a][readIndex]) - bestAbsoluteLikelihood; + else + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] -= bestAbsoluteLikelihood; + } else // else if (maximumReferenceLikelihoodFall != Double.NEGATIVE_INFINITY ) { // + // Guarantee to be the case by enclosing code. + for (int a = 0; a < alleles.length; a++) + if (sampleValues[a][readIndex] < worstLikelihoodCap) + sampleValues[a][readIndex] = worstLikelihoodCap; + } + + /** + * Returns the samples in this read-likelihood collection. + *

+ * Samples are sorted by their index in the collection. + *

+ * + *

+ * The returned list is an unmodifiable view on the read-likelihoods sample list. + *

+ * + * @return never {@code null}. + */ + public List samples() { + if (sampleList == null) + sampleList = Collections.unmodifiableList(Arrays.asList(samples)); + return sampleList; + } + + /** + * Returns the samples in this read-likelihood collection. + *

+ * Alleles are sorted by their index in the collection. + *

+ * + *

+ * The returned list is unmodifiable. It will not be updated if the collection + * allele list changes. + *

+ * + * @return never {@code null}. + */ + public List
alleles() { + if (alleleList == null) + alleleList = Collections.unmodifiableList(Arrays.asList(alleles)); + return alleleList; + } + + /** + * Search the best allele for a read. + * + * @param sampleIndex including sample index. + * @param readIndex target read index. + * + * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} + * if non-could be found. + */ + private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) { + final int alleleCount = alleles.length; + if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) + return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY); + + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1; + + double bestLikelihood = sampleValues[bestAlleleIndex][readIndex]; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (int a = bestAlleleIndex + 1; a < alleleCount; a++) { + if (!canBeReference && referenceAlleleIndex == a) + continue; + final double candidateLikelihood = sampleValues[a][readIndex]; + if (candidateLikelihood > bestLikelihood) { + bestAlleleIndex = a; + secondBestLikelihood = bestLikelihood; + bestLikelihood = candidateLikelihood; + } else if (candidateLikelihood > secondBestLikelihood) { + secondBestLikelihood = candidateLikelihood; + } + } + return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood); + } + + public void changeReads(final Map readRealignments) { + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final Object2IntMap readIndex = readIndexBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + final 
GATKSAMRecord replacement = readRealignments.get(read); + if (replacement == null) + continue; + readIndex.put(sampleReads[r] = replacement,r); + readIndex.remove(read); + } + } + } + + /** + * Likelihood matrix between a set of alleles and reads. + * @param the allele-type. + */ + public interface Matrix { + + /** + * List of reads in the matrix sorted by their index therein. + * @return never {@code null}. + */ + public List reads(); + + /** + * List of alleles in the matrix sorted by their index in the collection. + * @return never {@code null}. + */ + public List alleles(); + + /** + * Set the likelihood of a read given an allele through their indices. + * + * @param alleleIndex the target allele index. + * @param readIndex the target read index. + * @param value new likelihood value for the target read give the target allele. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} + * are not valid allele and read indices respectively. + */ + public void set(final int alleleIndex, final int readIndex, final double value); + + /** + * Returns the likelihood of a read given a haplotype. + * + * @param alleleIndex the index of the given haplotype. + * @param readIndex the index of the target read. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} is not a + * valid allele or read index respectively. + * + * @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set} + * or 0.0 if none was set. + */ + public double get(final int alleleIndex, final int readIndex); + + /** + * Queries the index of an allele in the matrix. + * + * @param allele the target allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return -1 if such allele does not exist, otherwise its index which 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleIndex(final A allele); + + /** + * Queries the index of a read in the matrix. 
+ * + * @param read the target read. + * + * @throws IllegalArgumentException if {@code read} is {@code null}. + * + * @return -1 if there is not such a read in the matrix, otherwise its index + * which is 0 or greater. + */ + @SuppressWarnings("unused") + public int readIndex(final GATKSAMRecord read); + + /** + * Number of allele in the matrix. + * @return never negative. + */ + public int alleleCount(); + + /** + * Number of reads in the matrix. + * @return never negative. + */ + public int readCount(); + + /** + * Returns the allele given its index. + * + * @param alleleIndex the target allele index. + * + * @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index. + * @return never {@code null}. + */ + public A allele(final int alleleIndex); + + /** + * Returns the allele given its index. + * + * @param readIndex the target allele index. + * + * @throws IllegalArgumentException if {@code readIndex} is not a valid read index. + * @return never {@code null}. + */ + public GATKSAMRecord read(final int readIndex); + } + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. + * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. 
+ */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap) { + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.length; + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + // We calculate the marginal likelihoods. + + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, null); + + final int sampleCount = samples.length; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(newAlleles, samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. + * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. 
The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @param overlap if not {@code null}, only reads that overlap the location (with unclipping) will be present in + * the output read-collection. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. + */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap,final GenomeLoc overlap) { + + if (overlap == null) + return marginalize(newToOldAlleleMap); + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.length; + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + final int[][] readsToKeep = overlappingReadIndicesBySampleIndex(overlap); + // We calculate the marginal likelihoods. 
+ + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, readsToKeep); + + final int sampleCount = samples.length; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + final int[] sampleReadsToKeep = readsToKeep[s]; + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[s]; + final int oldSampleReadCount = oldSampleReads.length; + final int newSampleReadCount = sampleReadsToKeep.length; + if (newSampleReadCount == oldSampleReadCount) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + } else { + final Object2IntMap newSampleReadIndex = newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(newSampleReadCount); + final GATKSAMRecord[] newSampleReads = newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; + for (int r = 0; r < newSampleReadCount; r++) + newSampleReadIndex.put(newSampleReads[r] = oldSampleReads[sampleReadsToKeep[r]],r); + } + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(newAlleles, samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + private int[][] overlappingReadIndicesBySampleIndex(final GenomeLoc overlap) { + if (overlap == null) + return null; + final int sampleCount = samples.length; + final int[][] result = new int[sampleCount][]; + final IntArrayList buffer = new IntArrayList(200); + final int referenceIndex = overlap.getContigIndex(); + final int overlapStart = overlap.getStart(); + final int overlapEnd = overlap.getStop(); + for (int s = 0; s < sampleCount; s++) { + buffer.clear(); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = 
sampleReads.length; + buffer.ensureCapacity(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + if (unclippedReadOverlapsRegion(sampleReads[r], referenceIndex, overlapStart, overlapEnd)) + buffer.add(r); + result[s] = buffer.toIntArray(); + } + return result; + } + + public static boolean unclippedReadOverlapsRegion(final GATKSAMRecord read, final GenomeLoc region) { + return unclippedReadOverlapsRegion(read, region.getContigIndex(), region.getStart(), region.getStop()); + } + + private static boolean unclippedReadOverlapsRegion(final GATKSAMRecord sampleRead, final int referenceIndex, final int start, final int end) { + final int readReference = sampleRead.getReferenceIndex(); + if (readReference != referenceIndex) + return false; + + final int readStart = sampleRead.getUnclippedStart(); + if (readStart > end) + return false; + + final int readEnd = sampleRead.getReadUnmappedFlag() ? sampleRead.getUnclippedEnd() + : Math.max(sampleRead.getUnclippedEnd(), sampleRead.getUnclippedStart()); + return readEnd >= start; + } + + + + // Calculate the marginal likelihoods considering the old -> new allele index mapping. + private double[][][] marginalLikelihoods(final int oldAlleleCount, final int newAlleleCount, final int[] oldToNewAlleleIndexMap, final int[][] readsToKeep) { + + final int sampleCount = samples.length; + final double[][][] result = new double[sampleCount][][]; + + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] oldSampleValues = valuesBySampleIndex[s]; + final int[] sampleReadToKeep = readsToKeep == null || readsToKeep[s].length == sampleReadCount ? null : readsToKeep[s]; + final int newSampleReadCount = sampleReadToKeep == null ? sampleReadCount : sampleReadToKeep.length; + final double[][] newSampleValues = result[s] = new double[newAlleleCount][newSampleReadCount]; + // We initiate all likelihoods to -Inf. 
+ for (int a = 0; a < newAlleleCount; a++) + Arrays.fill(newSampleValues[a], Double.NEGATIVE_INFINITY); + // For each old allele and read we update the new table keeping the maximum likelihood. + for (int r = 0; r < newSampleReadCount; r++) { + for (int a = 0; a < oldAlleleCount; a++) { + final int oldReadIndex = newSampleReadCount == sampleReadCount ? r : sampleReadToKeep[r]; + final int newAlleleIndex = oldToNewAlleleIndexMap[a]; + if (newAlleleIndex == -1) + continue; + final double likelihood = oldSampleValues[a][oldReadIndex]; + if (likelihood > newSampleValues[newAlleleIndex][r]) + newSampleValues[newAlleleIndex][r] = likelihood; + } + } + } + return result; + } + + // calculates an old to new allele index map array. + private int[] oldToNewAlleleIndexMap(final Map> newToOldAlleleMap, final B[] newAlleles, + final int oldAlleleCount, final int newAlleleCount) { + + final int[] oldToNewAlleleIndexMap = new int[oldAlleleCount]; + Arrays.fill(oldToNewAlleleIndexMap, -1); // -1 indicate that there is no new allele that make reference to that old one. 
+ + for (int i = 0; i < newAlleleCount; i++) { + final B newAllele = newAlleles[i]; + if (newAllele == null) + throw new IllegalArgumentException("input alleles cannot be null"); + final List oldAlleles = newToOldAlleleMap.get(newAllele); + if (oldAlleles == null) + throw new IllegalArgumentException("no new allele list can be null"); + for (final A oldAllele : oldAlleles) { + if (oldAllele == null) + throw new IllegalArgumentException("old alleles cannot be null"); + final int oldAlleleIndex = alleleIndex(oldAllele); + if (oldAlleleIndex == -1) + throw new IllegalArgumentException("missing old allele " + oldAllele + " in likelihood collection "); + if (oldToNewAlleleIndexMap[oldAlleleIndex] != -1) + throw new IllegalArgumentException("collision: two new alleles make reference to the same old allele"); + oldToNewAlleleIndexMap[oldAlleleIndex] = i; + } + } + return oldToNewAlleleIndexMap; + } + + /** + * Remove those reads that do not overlap certain genomic location. + * + *

+ * This method modifies the current read-likelihoods collection. + *

+ * + * @param location the target location. + * + * @throws IllegalArgumentException the location cannot be {@code null} nor unmapped. + */ + @SuppressWarnings("unused") + public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) { + if (location == null) + throw new IllegalArgumentException("the location cannot be null"); + if (location.isUnmapped()) + throw new IllegalArgumentException("the location cannot be unmapped"); + + final int sampleCount = samples.length; + + final int locContig = location.getContigIndex(); + final int locStart = location.getStart(); + final int locEnd = location.getStop(); + + final Set readsToRemove = new HashSet<>(100); // can we improve this estimate? + for (int s = 0; s < sampleCount; s++) + for (final GATKSAMRecord read : readsBySampleIndex[s]) + if (!unclippedReadOverlapsRegion(read, locContig, locStart, locEnd)) + readsToRemove.add(read); + removeReads(readsToRemove); + } + + // Compare the read coordinates to the location of interest. + private boolean readOverlapsLocation(final String contig, final int locStart, + final int locEnd, final GATKSAMRecord read) { + final boolean overlaps; + + if (read.getReadUnmappedFlag()) + overlaps = false; + else if (!read.getReferenceName().equals(contig)) + overlaps = false; + else { + int alnStart = read.getAlignmentStart(); + int alnStop = read.getAlignmentEnd(); + if (alnStart > alnStop) { // Paranoia? based on GLP.createGenomeLoc(Read) this can happen?. + final int end = alnStart; + alnStart = alnStop; + alnStop = end; + } + overlaps = !(alnStop < locStart || alnStart > locEnd); + } + return overlaps; + } + + /** + * Removes those read that the best possible likelihood given any allele is just too low. + * + *

+ * This is determined by a maximum error per read-base against the best likelihood possible. + *

+ * + * @param maximumErrorPerBase the minimum acceptable error rate per read base, must be + * a positive number. + * + * @throws IllegalStateException is not supported for read-likelihood that do not contain alleles. + * + * @throws IllegalArgumentException if {@code maximumErrorPerBase} is negative. + */ + public void filterPoorlyModeledReads(final double maximumErrorPerBase) { + if (alleles.length == 0) + throw new IllegalStateException("unsupported for read-likelihood collections with no alleles"); + if (Double.isNaN(maximumErrorPerBase) || maximumErrorPerBase <= 0.0) + throw new IllegalArgumentException("the maximum error per base must be a positive number"); + final int sampleCount = samples.length; + + final Set readsToRemove = new HashSet<>(100); // rough estimate, can be improved? + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + if (readIsPoorlyModelled(s,r,read, maximumErrorPerBase)) + readsToRemove.add(read); + } + } + removeReads(readsToRemove); + } + + // Check whether the read is poorly modelled. + protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); + final double log10QualPerBase = -4.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + final int alleleCount = alleles.length; + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + for (int a = 0; a < alleleCount; a++) + if (sampleValues[a][readIndex] >= log10MaxLikelihoodForTrueAllele) + return false; + return true; + } + + + /** + * Add more reads to the collection. + * + * @param readsBySample reads to add. 
+ * @param initialLikelihood the likelihood for the new entries. + * + * @throws IllegalArgumentException if {@code readsBySample} is {@code null} or {@code readsBySample} contains + * {@code null} reads, or {@code readsBySample} contains read that are already present in the read-likelihood + * collection. + */ + public void addReads(final Map> readsBySample, final double initialLikelihood) { + + for (final Map.Entry> entry : readsBySample.entrySet()) { + + final String sample = entry.getKey(); + final List newSampleReads = entry.getValue(); + final int sampleIndex = this.sampleIndex.getInt(sample); + + if (sampleIndex == -1) + throw new IllegalArgumentException("input sample " + sample + + " is not part of the read-likelihoods collection"); + + if (newSampleReads == null || newSampleReads.size() == 0) + continue; + + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final int newSampleReadCount = sampleReadCount + newSampleReads.size(); + + appendReads(newSampleReads, sampleIndex, sampleReadCount, newSampleReadCount); + extendsLikelihoodArrays(initialLikelihood, sampleIndex, sampleReadCount, newSampleReadCount); + } + } + + // Extends the likelihood arrays-matrices. + private void extendsLikelihoodArrays(double initialLikelihood, int sampleIndex, int sampleReadCount, int newSampleReadCount) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + final int alleleCount = alleles.length; + for (int a = 0; a < alleleCount; a++) + sampleValues[a] = Arrays.copyOf(sampleValues[a], newSampleReadCount); + if (initialLikelihood != 0.0) // the default array new value. + for (int a = 0; a < alleleCount; a++) + Arrays.fill(sampleValues[a],sampleReadCount,newSampleReadCount,initialLikelihood); + } + + // Append the new read reference into the structure per-sample. 
+ private void appendReads(final List newSampleReads, final int sampleIndex, + final int sampleReadCount, final int newSampleReadCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex] = + Arrays.copyOf(readsBySampleIndex[sampleIndex], newSampleReadCount); + + int nextReadIndex = sampleReadCount; + final Object2IntMap sampleReadIndex = readIndexBySampleIndex[sampleIndex]; + for (final GATKSAMRecord newRead : newSampleReads) { + if (sampleReadIndex.containsKey(newRead)) // might be worth handle this without exception (ignore the read?) but in practice should never be the case. + throw new IllegalArgumentException("you cannot add reads that are already in read-likelihood collection"); + sampleReads[nextReadIndex] = newRead; + sampleReadIndex.put(newRead,nextReadIndex++); + } + } + + /** + * Adds the non-reference allele to the read-likelihood collection setting each read likelihood to the second + * best found (or best one if only one allele has likelihood). + * + *

Nothing will happen if the read-likelihoods collection already includes the non-ref allele

+ * + *

+ * Implementation note: even when strictly speaking we do not need to demand the calling code to pass + * the reference the non-ref allele, we still demand it in order to lead the + * the calling code to use the right generic type for this likelihoods + * collection {@link Allele}. + *

+ * + * @param nonRefAllele the non-ref allele. + * + * @throws IllegalArgumentException if {@code nonRefAllele} is anything but the designated <NON_REF> + * symbolic allele {@link GATKVariantContextUtils#NON_REF_SYMBOLIC_ALLELE}. + */ + public void addNonReferenceAllele(final A nonRefAllele) { + + if (nonRefAllele == null) + throw new IllegalArgumentException("non-ref allele cannot be null"); + if (!nonRefAllele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) + throw new IllegalArgumentException("the non-ref allele is not valid"); + // Already present? + if (alleleIndex.containsKey(nonRefAllele)) + return; + + final int alleleCount = alleles.length; + final int newAlleleCount = alleleCount + 1; + alleles = Arrays.copyOf(alleles,newAlleleCount); + alleles[alleleCount] = nonRefAllele; + alleleIndex.put(nonRefAllele,alleleCount); + alleleList = null; // remove the cached alleleList. + + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) + addNonReferenceAlleleLikelihoodsPerSample(alleleCount, newAlleleCount, s); + } + + // Updates per-sample structures according to the addition of the NON_REF allele. + private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount]; + Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY); + for (int r = 0; r < sampleReadCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? 
bestAllele.likelihood + : bestAllele.likelihood - bestAllele.confidence; + nonRefAlleleLikelihoods[r] = secondBestLikelihood; + } + } + + /** + * Downsamples reads based on contamination fractions making sure that all alleles are affected proportionally. + * + * @param perSampleDownsamplingFraction contamination sample map where the sample name are the keys and the + * fractions are the values. + * + * @throws IllegalArgumentException if {@code perSampleDownsamplingFraction} is {@code null}. + */ + public void contaminationDownsampling(final Map perSampleDownsamplingFraction) { + + final int sampleCount = samples.length; + final Set readsToRemove = new HashSet<>(100); // blind estimate, can be improved? + for (int s = 0; s < sampleCount; s++) { + final String sample = samples[s]; + final Double fractionDouble = perSampleDownsamplingFraction.get(sample); + if (fractionDouble == null) + continue; + final double fraction = fractionDouble; + if (Double.isNaN(fraction) || fraction <= 0.0) + continue; + if (fraction >= 1.0) + readsToRemove.addAll(Arrays.asList(readsBySampleIndex[s])); + else { + final Map> readsByBestAllelesMap = readsByBestAlleleMap(s); + readsToRemove.addAll(AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(readsByBestAllelesMap, fraction)); + } + } + removeReads(readsToRemove); + } + + /** + * Returns the collection of best allele estimates for the reads based on the read-likelihoods. + * + * @throws IllegalStateException if there is no alleles. + * + * @return never {@code null}, one element per read in the read-likelihoods collection. + */ + public Collection bestAlleles() { + final List result = new ArrayList<>(100); // blind estimate. 
+ final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int readCount = sampleReads.length; + for (int r = 0; r < readCount; r++) + result.add(searchBestAllele(s,r,true)); + } + return result; + } + + /** + * Returns reads stratified by their best allele. + * @param sampleIndex the target sample. + * @return never {@code null}, perhaps empty. + */ + public Map> readsByBestAlleleMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final int alleleCount = alleles.length; + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final Map> result = new HashMap<>(alleleCount); + for (final A allele : alleles) + result.put(allele,new ArrayList(sampleReadCount)); + readsByBestAlleleMap(sampleIndex,result); + return result; + } + + /** + * Returns reads stratified by their best allele. + * @return never {@code null}, perhaps empty. + */ + @SuppressWarnings("unused") + public Map> readsByBestAlleleMap() { + final int alleleCount = alleles.length; + final Map> result = new HashMap<>(alleleCount); + final int totalReadCount = readCount(); + for (final A allele : alleles) + result.put(allele,new ArrayList(totalReadCount)); + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) + readsByBestAlleleMap(s,result); + return result; + } + + private void readsByBestAlleleMap(final int sampleIndex, final Map> result) { + final GATKSAMRecord[] reads = readsBySampleIndex[sampleIndex]; + final int readCount = reads.length; + + for (int r = 0; r < readCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + if (!bestAllele.isInformative()) + continue; + result.get(bestAllele.allele).add(bestAllele.read); + } + } + + /** + * Returns the index of a read within a sample read-likelihood sub collection. + * @param sampleIndex the sample index. + * @param read the query read. 
+ * @return -1 if there is no such read in that sample, 0 or greater otherwise. + */ + @SuppressWarnings("unused") + public int readIndex(final int sampleIndex, final GATKSAMRecord read) { + final Object2IntMap readIndex = readIndexBySampleIndex[sampleIndex]; + if (readIndex.containsKey(read)) + return readIndexBySampleIndex[sampleIndex].getInt(read); + else + return -1; + } + + /** + * Returns the total number of reads in the read-likelihood collection. + * + * @return never {@code null} + */ + public int readCount() { + int sum = 0; + for (int i = 0; i < samples.length; i++) + sum += readsBySampleIndex[i].length; + return sum; + } + + /** + * Returns the number of reads that belong to a sample in the read-likelihood collection. + * @param sampleIndex the query sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not a valid sample index. + * @return 0 or greater. + */ + public int sampleReadCount(int sampleIndex) { + checkSampleIndex(sampleIndex); + return readsBySampleIndex[sampleIndex].length; + } + + /** + * Contains information about the best allele for a read search result. + */ + public class BestAllele { + public static final double INFORMATIVE_THRESHOLD = 0.2; + + /** + * Null if there is no possible match (no allele?). + */ + public final A allele; + + /** + * The containing sample. + */ + public final String sample; + + /** + * The query read. + */ + public final GATKSAMRecord read; + + /** + * If allele != null, the indicates the likelihood of the read. + */ + public final double likelihood; + + /** + * Confidence that the read actually was generated under that likelihood. + * This is equal to the difference between this and the second best allele match. + */ + public final double confidence; + + private BestAllele(final int sampleIndex, final int readIndex, final int bestAlleleIndex, + final double likelihood, final double secondBestLikelihood) { + allele = bestAlleleIndex == -1 ? 
null : alleles[bestAlleleIndex]; + this.likelihood = likelihood; + sample = samples[sampleIndex]; + read = readsBySampleIndex[sampleIndex][readIndex]; + confidence = likelihood == secondBestLikelihood ? 0 : likelihood - secondBestLikelihood; + } + + public boolean isInformative() { + return confidence > INFORMATIVE_THRESHOLD; + } + } + + + /** + * Remove a set of reads from the read-likelihoods + * @param readsToRemove collection of reads to remove. + */ + @SuppressWarnings("unused") + public void removeReads(final Collection readsToRemove) { + if (readsToRemove == null) + throw new IllegalArgumentException("remove read collection cannot be null"); + if (readsToRemove.isEmpty()) + return; + removeReads(new HashSet<>(readsToRemove)); + } + + // Requires that the collection passed iterator can remove elements, and it can be modified. + private void removeReads(final Set readsToRemove) { + if (readsToRemove.isEmpty()) + return; + final int sampleCount = samples.length; + final int alleleCount = alleles.length; + for (int s = 0; s < sampleCount; s++) { + removeSampleReads(s, readsToRemove, alleleCount); + if (readsToRemove.isEmpty()) break; // each iteration may exhaust the reads to remove. + } + } + + // Requires that the collection passed iterator can remove elements, and it can be modified. + private void removeSampleReads(final int sampleIndex, final Set readsToRemove, final int alleleCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + + final Object2IntMap indexByRead = readIndexBySampleIndex[sampleIndex]; + // Count how many we are going to remove, which ones (indexes) and remove entry from the read-index map. + final boolean[] removeIndex = new boolean[sampleReadCount]; + int removeCount = 0; // captures the number of deletions. + int firstDeleted = sampleReadCount; // captures the first position that was deleted. 
+ + final Iterator readsToRemoveIterator = readsToRemove.iterator(); + while (readsToRemoveIterator.hasNext()) { + final GATKSAMRecord read = readsToRemoveIterator.next(); + if (indexByRead.containsKey(read)) { + final int index = indexByRead.getInt(read); + if (firstDeleted > index) + firstDeleted = index; + removeCount++; + removeIndex[index] = true; + readsToRemoveIterator.remove(); + indexByRead.remove(read); + } + } + + // Nothing to remove we just finish here. + if (removeCount == 0) + return; + + final int newSampleReadCount = sampleReadCount - removeCount; + + // Now we skim out the removed reads from the read array. + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; + final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; + + System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); + Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); + + // Update the indices for the extant reads from the first deletion onwards. + for (int r = firstDeleted; r < newSampleReadCount; r++) { + indexByRead.put(newSampleReads[r], r); + } + + // Then we skim out the likelihoods of the removed reads. + final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; + final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; + for (int a = 0; a < alleleCount; a++) { + System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); + Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); + } + valuesBySampleIndex[sampleIndex] = newSampleValues; + readsBySampleIndex[sampleIndex] = newSampleReads; + readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. + } + + /** + * Transform into a multi-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * @return never {@code null}. 
+ * + */ + @Deprecated + @SuppressWarnings("all") + public Map toPerReadAlleleLikelihoodMap () { + final int sampleCount = samples.length; + final Map result = new HashMap<>(sampleCount); + for (int s = 0; s < sampleCount; s++) + result.put(samples[s],toPerReadAlleleLikelihoodMap(s)); + return result; + } + + /** + * Transform into a single-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * + * @return never {@code null}. + */ + @Deprecated + public PerReadAlleleLikelihoodMap toPerReadAlleleLikelihoodMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); + final int alleleCount = alleles.length; + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + for (int a = 0; a < alleleCount; a++) { + final A allele = alleles[a]; + final double[] readLikelihoods = valuesBySampleIndex[sampleIndex][a]; + for (int r = 0; r < sampleReadCount; r++) + result.add(sampleReads[r], allele, readLikelihoods[r]); + } + return result; + } + + /** + * Implements a likelihood matrix per sample given its index. + */ + private class SampleMatrix implements Matrix
{ + + private final int sampleIndex; + + private SampleMatrix(final int sampleIndex) { + this.sampleIndex = sampleIndex; + } + + @Override + public List reads() { + return sampleReads(sampleIndex); + } + + @Override + public List alleles() { + return ReadLikelihoods.this.alleles(); + } + + @Override + public void set(final int alleleIndex, final int readIndex, final double value) { + valuesBySampleIndex[sampleIndex][alleleIndex][readIndex] = value; + } + + @Override + public double get(final int alleleIndex, final int readIndex) { + return valuesBySampleIndex[sampleIndex][alleleIndex][readIndex]; + } + + @Override + public int alleleIndex(final A allele) { + return ReadLikelihoods.this.alleleIndex(allele); + } + + @Override + public int readIndex(final GATKSAMRecord read) { + return ReadLikelihoods.this.readIndex(sampleIndex,read); + } + + @Override + public int alleleCount() { + return alleles.length; + } + + @Override + public int readCount() { + return readsBySampleIndex[sampleIndex].length; + } + + @Override + public A allele(int alleleIndex) { + return ReadLikelihoods.this.allele(alleleIndex); + } + + @Override + public GATKSAMRecord read(final int readIndex) { + if (readIndex < 0) + throw new IllegalArgumentException("the read-index cannot be negative"); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + if (readIndex >= sampleReads.length) + throw new IllegalArgumentException("the read-index is beyond the read count of the sample"); + return sampleReads[readIndex]; + } + } + + /** + * Checks whether the provide sample index is valid. + *

+ * If not, it throws an exception. + *

+ * @param sampleIndex the target sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is invalid, i.e. outside the range [0,{@link #sampleCount}). + */ + private void checkSampleIndex(final int sampleIndex) { + if (sampleIndex < 0 || sampleIndex >= samples.length) + throw new IllegalArgumentException("invalid sample index: " + sampleIndex); + } + + /** + * Checks whether the provided allele index is valid. + *

+ * If not, it throws an exception. + *

+ * @param alleleIndex the target allele index. + * + * @throws IllegalArgumentException if {@code alleleIndex} is invalid, i.e. outside the range [0,{@link #alleleCount}). + */ + private void checkAlleleIndex(final int alleleIndex) { + if (alleleIndex < 0 || alleleIndex >= alleles.length) + throw new IllegalArgumentException("invalid allele index: " + alleleIndex); + } +} \ No newline at end of file From 2914ecb585e5407128c35a86b391c3fabf490aba Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 7 Aug 2014 17:03:12 -0400 Subject: [PATCH 2/5] Change the Map-of-maps-of-maps for an array based implementation ReadLikelihoods to hold read likelihoods. The array structure should be faster to populate and query (not properly benchmarked) and reduce memory footprint considerably. Nevertheless removing PairHMM factor (using likelihoodEngine Random) it only achieves a speed up of 15% in some example WGS dataset i.e. there are other bigger bottlenecks in the system. Bamboo tests also seem to run significantly faster with this change. Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 Changes: - ReadLikelihoods added to substitute Map - Operations that involve changes in full sets of ReadLikelihoods have been moved into that class. - Simplified a bit the code that handles the downsampling of reads based on contamination Caveats: - Still we keep Map around to pass to annotators..., didn't feel like changing the interface of so many public classes in this pull-request. 
--- .../StandardCallerArgumentCollection.java | 22 ++ ...GraphBasedLikelihoodCalculationEngine.java | 57 ++- ...edLikelihoodCalculationEngineInstance.java | 140 +++---- .../haplotypecaller/HaplotypeCaller.java | 70 ++-- .../HaplotypeCallerGenotypingEngine.java | 226 +++++------- .../PairHMMLikelihoodCalculationEngine.java | 347 ++++-------------- .../RandomLikelihoodCalculationEngine.java | 55 +-- .../ReadLikelihoodCalculationEngine.java | 5 +- .../ReferenceConfidenceModel.java | 30 +- .../haplotype/HaplotypeLDCalculator.java | 24 +- .../gatk/utils/haplotype/LDMerger.java | 19 +- .../MergeVariantsAcrossHaplotypes.java | 7 +- .../AllHaplotypeBAMWriter.java | 19 +- .../CalledHaplotypeBAMWriter.java | 15 +- .../HaplotypeBAMWriter.java | 14 +- .../AlleleBiasedDownsamplingUtils.java | 13 +- .../annotator/VariantAnnotatorEngine.java | 16 +- .../gatk/utils/pairhmm/PairHMM.java | 83 ++++- .../gatk/utils/sam/AlignmentUtils.java | 7 +- 19 files changed, 503 insertions(+), 666 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java index 653b438ec..7c69ab014 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java @@ -99,6 +99,11 @@ public class StandardCallerArgumentCollection implements Cloneable { @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. 
Format should be \"\" (Contamination is double) per line; No header.", required = false) public File CONTAMINATION_FRACTION_FILE = null; + /** + * Indicates whether there is some sample contamination present. + */ + private boolean sampleContaminationWasLoaded = false; + /** * * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION @@ -106,15 +111,32 @@ public class StandardCallerArgumentCollection implements Cloneable { public Map getSampleContamination(){ //make sure that the default value is set up right sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); + if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) + sampleContaminationWasLoaded = true; return Collections.unmodifiableMap(sampleContamination); } public void setSampleContamination(DefaultHashMap sampleContamination) { this.sampleContamination.clear(); + this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0; + if (!sampleContaminationWasLoaded) + for (final Double d : sampleContamination.values()) + if (!Double.isNaN(d) && d > 0.0) { + sampleContaminationWasLoaded = true; + break; + } this.sampleContamination.putAll(sampleContamination); this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); } + /** + * Returns true if there is some sample contamination present, false otherwise. 
+ * @return {@code true} iff there is some sample contamination + */ + public boolean isSampleContaminationPresent() { + return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded; + } + //Needs to be here because it uses CONTAMINATION_FRACTION private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index cfb1b957a..7ba81ca8c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -1,58 +1,58 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. 
DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. 
INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ + package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.File; @@ -112,17 +112,16 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc debugMode = debugHaplotypeGraphAndLikelihoods ? DebugMode.EXTRA_DEBUG : debug ? DebugMode.DEBUG : DebugMode.NONE; } - @Override - public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList) { + public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList) { final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine = new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet, hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution); final List haplotypes = assemblyResultSet.getHaplotypeList(); final List supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList(); - if (supportedHaplotypes.size() != haplotypes.size()) logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size()); - final Map result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes, - perSampleReadList ); + if (supportedHaplotypes.size() != haplotypes.size()) + logger.warn("Some 
haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size()); + final ReadLikelihoods result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,samples,perSampleReadList); if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result); return result; } @@ -131,7 +130,7 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc * A few debug messages associated with the GraphBased likelihoods engine. */ private void graphLikelihoodDebugDumps(final ActiveRegion originalActiveRegion, final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine, - final Map result) { + final ReadLikelihoods result) { if (graphLikelihoodEngine.hasCycles()) logger.debug("Resulting haplotype graph combining several kmer sizes has cycles"); else if (graphLikelihoodEngine.haplotypeGraph.hasNonReferenceEnds()) @@ -144,14 +143,14 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc sq.simplifyGraph(); sq.printGraph(new File(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeSeqGraph.dot"), 10000); try { - FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt")); - PrintWriter pw = new PrintWriter(fw); + final FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt")); + final PrintWriter pw = new PrintWriter(fw); //Note: we only output the first sample likelihoods, perhaps should output all of them but for debugging this is normally what is needed. - pw.println(result.entrySet().iterator().next().getValue().toString()); + pw.println(result.sampleMatrix(0)); // need to actually implement a proper toString for the SampleMatrix. 
pw.close(); fw.close(); - } catch (Exception ex) { - throw new GATKException("", ex); + } catch (final Exception ex) { + throw new IllegalStateException("", ex); } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java index 50e12842a..7d4b3db1a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -57,6 +57,7 @@ import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.CountSet; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -215,6 +216,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * * @return {@code true} iff so. */ + @SuppressWarnings("unused") public boolean hasVariation() { return hasVariation; } @@ -231,27 +233,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. * The value maps can be potentially empty though. 
*/ - public Map computeReadLikelihoods( - final List haplotypes, + public ReadLikelihoods computeReadLikelihoods(final List haplotypes, final List samples, final Map> perSampleReadList) { // General preparation on the input haplotypes: - Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); - final Map alleleVersions = new LinkedHashMap<>(haplotypes.size()); - for (final Haplotype haplotype : haplotypes) - alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference())); + final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); // The actual work: - final HashMap result = new HashMap<>(perSampleReadList.size()); - for (final Map.Entry> e : perSampleReadList.entrySet()) { - final String sample = e.getKey(); - final List reads = e.getValue(); - final Set mayNeedAdjustment = new HashSet<>(reads.size()); + final int sampleCount = result.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final List sampleReads = result.sampleReads(s); + // Get the cost/likelihood of each read at relevant subpaths on the tree: - final Map> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment); + final Map> costsByEndingVertex = calculatePathCostsByRead(sampleReads); // Create the resulting per-read maps: - final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions); - result.put(sample, prallm); + calculatePerReadAlleleLikelihoodMap(costsByEndingVertex,result.sampleMatrix(s) ); } + result.normalizeLikelihoods(true,log10globalReadMismappingRate); logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + ""); return result; } @@ -263,8 +262,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * @param fileName name of the 
output file. */ public void printGraph(final String fileName) { - if (haplotypeGraph != null) - haplotypeGraph.printGraph(fileName); + haplotypeGraph.printGraph(fileName); } /** @@ -281,36 +279,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * * @return {@code true} iff so. */ + @SuppressWarnings("unused") public boolean hasCycles() { - // It is set to null if it contained cycles. - return haplotypeGraph == null; + return haplotypeGraph.hasCycles(); } /** * Builds the result per-read allele likelihood map. * - * @param haplotypes haplotypes to process. - * @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex. - * @param alleleVersions map between haplotypes and the corresponding allele. - * @return never {@code null} although perhaps empty. + * @param costsEndingByVertex Read vs haplotype graph sub-paths cost indexed by ending vertex. + * @param likelihoods matrix where to set the likelihoods where the first index in the haplotype's and the second + * the read. 
*/ - protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap( - final Collection haplotypes, - final Map> costsEndingByVertex, final Map alleleVersions) { - - final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); - if (haplotypeGraph == null) - return result; - final Map maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10); - final Set supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes()); - supportedHaplotypes.retainAll(haplotypes); - for (final Haplotype haplotype : supportedHaplotypes) - calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex); - - makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk); - applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk); - return result; + protected void calculatePerReadAlleleLikelihoodMap(final Map> costsEndingByVertex, + final ReadLikelihoods.Matrix likelihoods) { + final int alleleCount = likelihoods.alleleCount(); + for (int h = 0; h < alleleCount; h++) + calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(h, likelihoods, costsEndingByVertex); } /** @@ -322,25 +308,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * "likelihood". *

* - * @param haplotype the target haplotype - * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output. - * @param result target where to add the read-vs-haplotype likelihoods. - * @param maxAlleleLogLk where to place the maximum likelihood achieve on any haplotype for each read. + * @param haplotypeIndex the target haplotype index in the {@code likelihoods} matrix. + * @param likelihoods matrix of likelihoods. * @param costsEndingByVertex read costs assorted by their end vertex. */ - private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype, - final Map alleleVersions, - final PerReadAlleleLikelihoodMap result, - final Map maxAlleleLogLk, + private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final int haplotypeIndex, + final ReadLikelihoods.Matrix likelihoods, final Map> costsEndingByVertex) { + final Haplotype haplotype = likelihoods.allele(haplotypeIndex); final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype); final Set haplotypeVertices = haplotypeRoute.vertexSet(); final Map readCostByRead = new HashMap<>(); final Set visitedVertices = new HashSet<>(haplotypeVertices.size()); final List edgeList = haplotypeRoute.getEdges(); + MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex(); Route pathSoFar = new Route<>(currentVertex, haplotypeGraph); final Iterator edgeIterator = edgeList.iterator(); + while (true) { visitedVertices.add(currentVertex); final Set finishingAtElementCostSet = costsEndingByVertex.get(currentVertex); @@ -351,15 +336,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance { currentVertex = pathSoFar.getLastVertex(); } - final List readCosts = new ArrayList<>(readCostByRead.values()); - Collections.sort(readCosts, ReadCost.COMPARATOR); - for (final ReadCost rc : readCosts) - result.add(rc.read, alleleVersions.get(haplotype), rc.getCost()); - - for (final ReadCost rc : readCosts) { - final 
Double currentMax = maxAlleleLogLk.get(rc.read); - if (currentMax == null || currentMax < rc.getCost()) - maxAlleleLogLk.put(rc.read, rc.getCost()); + int readIndex = 0; + for (final GATKSAMRecord read : likelihoods.reads()) { + final ReadCost rc = readCostByRead.get(read); + //if (rc != null) + likelihoods.set(haplotypeIndex,readIndex,rc == null ? Double.NEGATIVE_INFINITY : rc.getCost()); + readIndex++; } } @@ -443,33 +425,6 @@ public class GraphBasedLikelihoodCalculationEngineInstance { } } - /** - * Makes sure that the reference allele likelihood is not too much smaller that the best alternative allele. - * The justification of this constraint is explained in - * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}. - * - * @param alleleVersions correspondence between input haplotypes and output alleles. - * @param result the target result map. - * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele. - */ - private void applyGlobalReadMismappingRate(final Map alleleVersions, - final PerReadAlleleLikelihoodMap result, - final Map maxAlleleLogLk) { - if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) { - final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype()); - for (final Map.Entry> entry : result.getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = entry.getKey(); - final Map likelihoods = entry.getValue(); - final Double maxLogLk = maxAlleleLogLk.get(read); - if (maxAlleleLogLk == null) continue; - final Double referenceLogLk = likelihoods.get(referenceAllele); - final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate; - if (referenceLogLk == null || referenceLogLk < minReferenceLogLk) - likelihoods.put(referenceAllele, minReferenceLogLk); - } - } - } - /** * Calculates path costs for a set of reads. *

@@ -479,17 +434,16 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * likelihood (cost) of traversing a possible path across the event block using that read. *

* - * @param reads reads to analyze. - * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment. + * @param reads reads to analyze. * @return never {@code null}. */ protected Map> calculatePathCostsByRead( - final List reads, final Set mayNeedAdjustment) { + final List reads) { final Map> result = new HashMap<>(reads.size()); if (!hasVariation) return Collections.emptyMap(); for (final GATKSAMRecord r : reads) { - calculatePathCostsByRead(r, mayNeedAdjustment, result); + calculatePathCostsByRead(r, result); } return result; } @@ -498,10 +452,9 @@ public class GraphBasedLikelihoodCalculationEngineInstance { * Calculates path cost for a single read. * * @param read target read. - * @param mayNeedAdjustment set where to add read whose likelihood might need adjustment. * @param result map where to add the result. */ - private void calculatePathCostsByRead(final GATKSAMRecord read, final Set mayNeedAdjustment, + private void calculatePathCostsByRead(final GATKSAMRecord read, final Map> result) { final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph); @@ -510,14 +463,11 @@ public class GraphBasedLikelihoodCalculationEngineInstance { if (!anchoring.isAnchoredSomewhere()) { defaultToRegularPairHMM(anchoring, result); nonAnchoredReads++; - return; + } else { + calculateReadSegmentCosts(anchoring, hmm, result); + if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result); + anchoredReads++; } - - calculateReadSegmentCosts(anchoring, hmm, result); - - if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result); - mayNeedAdjustment.add(read); - anchoredReads++; } /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index cfb29d272..1e656facc 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -48,7 +48,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import htsjdk.samtools.SAMFileWriter; -import org.broadinstitute.gatk.utils.commandline.*; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; @@ -75,11 +77,12 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; import org.broadinstitute.gatk.utils.clipping.ReadClipper; +import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.fragments.FragmentCollection; import org.broadinstitute.gatk.utils.fragments.FragmentUtils; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.gvcf.GVCFWriter; import org.broadinstitute.gatk.utils.haplotype.Haplotype; @@ -89,12 +92,10 @@ import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; 
+import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import java.io.FileNotFoundException; @@ -932,12 +933,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); // Calculate the likelihoods: CPU intesive part. - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); + final ReadLikelihoods readLikelihoods = + likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); - // Realign all the reads to the most likely haplotype for use by the annotations - for( final Map.Entry entry : stratifiedReadMap.entrySet() ) { - entry.getValue().realignReadsToMostLikelyHaplotype(haplotypes, assemblyResult.getPaddedReferenceLoc()); - } + // Realign reads to their best haplotype. 
+ final Map readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getPaddedReferenceLoc()); + readLikelihoods.changeReads(readRealignments); // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there // was a bad interaction between that selection and the marginalization that happens over each event when computing @@ -947,7 +948,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( haplotypes, - stratifiedReadMap, + readLikelihoods, perSampleFilteredReadList, assemblyResult.getFullReferenceWithPadding(), assemblyResult.getPaddedReferenceLoc(), @@ -964,7 +965,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In assemblyResult.getPaddedReferenceLoc(), haplotypes, calledHaplotypes.getCalledHaplotypes(), - stratifiedReadMap); + readLikelihoods); } if( SCAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } @@ -982,15 +983,36 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // output variant containing region. result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - stratifiedReadMap, calledHaplotypes.getCalls())); + readLikelihoods, calledHaplotypes.getCalls())); // output right-flanking non-variant section: if (trimmingResult.hasRightFlankingRegion()) result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false)); return result; } - } else { + } else return calledHaplotypes.getCalls(); + } + + /** + * Returns a map with the original read as a key and the realigned read as the value. + *

+ * A read that is missing from the map, or that maps to itself (value equal to key), was not realigned. + *

+ * @return never {@code null} + */ + private Map realignReadsToTheirBestHaplotype(final ReadLikelihoods originalReadLikelihoods, final GenomeLoc paddedReferenceLoc) { + + final Collection.BestAllele> bestAlleles = originalReadLikelihoods.bestAlleles(); + final Map result = new HashMap<>(bestAlleles.size()); + + for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { + final GATKSAMRecord originalRead = bestAllele.read; + final Haplotype bestHaplotype = bestAllele.allele; + final boolean isInformative = bestAllele.isInformative(); + final GATKSAMRecord realignedRead = AlignmentUtils.createReadAlignedToRef(originalRead,bestHaplotype,paddedReferenceLoc.getStart(),isInformative); + result.put(originalRead,realignedRead); } + return result; } private boolean containsCalls(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes) { @@ -1086,23 +1108,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * @param region the active region containing reads * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref */ - public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, - final List samples, - final ActiveRegion region) { - final Allele refAllele = Allele.create(refHaplotype, true); + public static ReadLikelihoods createDummyStratifiedReadMap(final Haplotype refHaplotype, + final List samples, + final ActiveRegion region) { + return new ReadLikelihoods<>(samples, Collections.singletonList(refHaplotype), + splitReadsBySample(samples, region.getReads())); - final Map map = new LinkedHashMap<>(1); - for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - for ( final GATKSAMRecord read : entry.getValue() ) { - likelihoodMap.add(read, refAllele, 0.0); - } - map.put(entry.getKey(), likelihoodMap); - } - - return map; } + 
//--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 90dda170e..32f54436c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -57,8 +57,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; @@ -74,7 +73,7 @@ import java.util.*; public class HaplotypeCallerGenotypingEngine extends GenotypingEngine { private final static List NO_CALL = Collections.singletonList(Allele.NO_CALL); - private final static int ALLELE_EXTENSION = 2; + private static final int ALLELE_EXTENSION = 2; private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; @@ -161,17 +160,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine(haplotypes,likelihoods) + * @param readLikelihoods Map from reads->(haplotypes,likelihoods) * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and 
before calculating per-read likelihoods. * @param ref Reference bytes at active region * @param refLoc Corresponding active region genome location * @param activeRegionWindow Active window * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype - * @param emitReferenceConfidence whether we should add a alternative allele to the result variation contexts. + * @param emitReferenceConfidence whether we should add a <NON_REF> alternative allele to the result variation contexts. * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * @@ -180,7 +179,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine haplotypes, - final Map haplotypeReadMap, + final ReadLikelihoods readLikelihoods, final Map> perSampleFilteredReadList, final byte[] ref, final GenomeLoc refLoc, @@ -191,7 +190,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); + final TreeSet startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype); // Walk along each position in the key set and create each event to be outputted final Set calledHaplotypes = new HashSet<>(); final List returnCalls = new ArrayList<>(); - final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region @@ -221,7 +219,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine alleleReadMap, final VariantContext mergedVC ) { - final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); + private GenotypesContext calculateGLsForThisEvent( final ReadLikelihoods readLikelihoods, final VariantContext mergedVC ) { + final 
GenotypesContext genotypes = GenotypesContext.create(readLikelihoods.sampleCount()); // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - for( final String sample : alleleReadMap.keySet() ) { + for (final String sample : readLikelihoods.samples() ) { final int numHaplotypes = mergedVC.getAlleles().size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); + final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, mergedVC.getAlleles(), true); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC } } + logger.debug(" Likelihoods for sample " + sample + " : " + Arrays.toString(genotypeLikelihoods)); genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make()); } return genotypes; } - private static Map addFilteredReadList(final GenomeLocParser parser, - final Map perSampleReadMap, - final Map> perSampleFilteredReadList, - final VariantContext call, - final boolean requireOverlap) { - - final Map returnMap = new LinkedHashMap<>(); - final GenomeLoc callLoc = ( requireOverlap ? 
parser.createGenomeLoc(call) : null ); - for( final Map.Entry sample : perSampleReadMap.entrySet() ) { - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - - for( final Map.Entry> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( !requireOverlap || callLoc.overlapsP(parser.createGenomeLocUnclipped(mapEntry.getKey())) ) { - for( final Map.Entry alleleDoubleEntry : mapEntry.getValue().entrySet() ) { - likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue()); - } - } - } - - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { - // only count the read if it overlaps the event, otherwise it is not added to the output read list at all - if( !requireOverlap || callLoc.overlapsP(parser.createGenomeLocUnclipped(read)) ) { - for( final Allele allele : call.getAlleles() ) { - likelihoodMap.add(read, allele, 0.0); - } - } - } - - returnMap.put(sample.getKey(), likelihoodMap); - } - return returnMap; - } - /** * Removes symbolic events from list of haplotypes * @param haplotypes Input/output list of haplotypes, before/after removal @@ -490,48 +486,6 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine (haplotypes, likelihoods) - * @param alleleMapper Map from alleles -> list of haplotypes which support that allele - * @param perSampleDownsamplingFraction Map from samples -> downsampling fraction - * @param genomeLocParser a genome loc parser - * @param eventsToGenotype the alleles to genotype in a single VariantContext, will be null if we don't want to require overlap - * @return Map from reads -> (alleles, likelihoods) - */ - protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, - final Map> alleleMapper, - 
final Map perSampleDownsamplingFraction, - final GenomeLocParser genomeLocParser, - final VariantContext eventsToGenotype) { - final GenomeLoc callLoc = ( eventsToGenotype != null ? genomeLocParser.createGenomeLoc(eventsToGenotype) : null ); - - final Map alleleReadMap = new LinkedHashMap<>(); - for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final Map.Entry> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele - final List mappedHaplotypes = alleleMapperEntry.getValue(); - for( final Map.Entry> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read - if( eventsToGenotype == null || callLoc.overlapsP(genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLocUnclipped(readEntry.getKey()), ALLELE_EXTENSION)) ) { // make sure the read overlaps - double maxLikelihood = Double.NEGATIVE_INFINITY; - for( final Map.Entry alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele - if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string - maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() ); - } - } - perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); - } - } - } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling - alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); - } - - return alleleReadMap; - } - protected static Map> createAlleleMapper( final Map mergeMap, final Map> eventMap ) { final Map> alleleMapper = new LinkedHashMap<>(); for( final Map.Entry entry : mergeMap.entrySet() ) { diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 260794497..e662806fe 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -53,7 +53,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pairhmm.*; import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate; @@ -73,6 +73,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual private final byte constantGCP; + private final double log10globalReadMismappingRate; private final PairHMM.HMM_IMPLEMENTATION hmmType; @@ -177,40 +178,6 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula pairHMMThreadLocal.get().close(); } - - private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(processedRead.getReadBases() ), - 
SAMUtils.phredToFastq(processedRead.getBaseQualities() ), - SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), - SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), - SAMUtils.phredToFastq(constantGCP), - log10l); - } - } - - private Map createAlleleMap(List haplotypes){ - final int numHaplotypes = haplotypes.size(); - final Map alleleMap = new LinkedHashMap<>(numHaplotypes); - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleMap.put(allele, haplotype); - } - return alleleMap; - } - - private Map fillGCPArrays(List reads){ - final Map GCPArrayMap = new LinkedHashMap<>(); - for (GATKSAMRecord read: reads){ - byte [] GCPArray = new byte[read.getReadBases().length]; - Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - GCPArrayMap.put(read, GCPArray); - } - return GCPArrayMap; - } - private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { for( int kkk = 0; kkk < readQuals.length; kkk++ ) { readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG @@ -229,9 +196,9 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding */ private List modifyReadQualities(final List reads) { - List processedReads = new LinkedList<>(); - for ( GATKSAMRecord read : reads ) { + final List result = new ArrayList<>(reads.size()); + for (final GATKSAMRecord read : reads) { final byte[] readBases = read.getReadBases(); // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read @@ -244,71 +211,9 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula // Create a new copy of the read and sets its base qualities to the modified versions. // Pack this into a new list for return - final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); - processedReads.add(processedRead); + result.add(GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals)); } - return processedReads; - } - - /** - * Post-processing of the read/allele likelihoods. - * - * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. - * We wish to return a map containing the original, unmodified reads. - * - * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. - * This protects us from the case where the assembly has produced haplotypes - * that are very divergent from reference, but are supported by only one read. In effect - * we capping how badly scoring the reference can be for any read by the chance that the read - * itself just doesn't belong here - * - * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls - * @param reads Our original, unmodified reads - * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls - * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype - * - * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls - */ - private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ - - // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls - final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - - final int numReads = reads.size(); - for (int readIndex = 0; readIndex < numReads; readIndex++) { - - // Get the original and quality-modified read from their respective lists - // Note that this requires both lists to have reads in the same order - final GATKSAMRecord originalRead = reads.get(readIndex); - final GATKSAMRecord processedRead = processedReads.get(readIndex); - - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - for ( final Allele allele : alleleHaplotypeMap.keySet() ) { - final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); - final Haplotype haplotype = alleleHaplotypeMap.get(allele); - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - writeDebugLikelihoods(processedRead, haplotype, log10l); - - // add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l - processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); - } - - // ensure that any haplotype is no worse than 
the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. In effect - // we capping how badly scoring any haplotype can be for any read by the chance that the read - // itself just doesn't belong here - final double worstAllowedLog10l = bestNonReflog10L + log10globalReadMismappingRate; - for ( final Allele allele : alleleHaplotypeMap.keySet() ) { - final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); - if( log10l < worstAllowedLog10l ) { - processedReadAlleleLikelihoodMap.add(originalRead, allele, worstAllowedLog10l); - } - } - } - return processedReadAlleleLikelihoodMap; + return result; } /** @@ -343,85 +248,109 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula pairHMMThreadLocal.get().finalizeRegion(); } - @Override - public Map computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList ) { + public ReadLikelihoods computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final List samples, final Map> perSampleReadList ) { final List haplotypes = assemblyResultSet.getHaplotypeList(); + // configure the HMM initializePairHMM(haplotypes, perSampleReadList); - // Add likelihoods for each sample's reads to our stratifiedReadMap - final Map stratifiedReadMap = new LinkedHashMap<>(); - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - // evaluate the likelihood of the reads given those haplotypes - final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - - map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); - stratifiedReadMap.put(sampleEntry.getKey(), map); + // Add likelihoods for each sample's reads to our result + final ReadLikelihoods result = new ReadLikelihoods<>(samples, haplotypes, 
perSampleReadList); + final int sampleCount = result.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); + computeReadLikelihoods(sampleLikelihoods); } - //Used mostly by the JNI implementation(s) to free arrays - finalizePairHMM(); - return stratifiedReadMap; + result.normalizeLikelihoods(false, log10globalReadMismappingRate); + result.filterPoorlyModeledReads(EXPECTED_ERROR_RATE_PER_BASE); + finalizePairHMM(); + return result; } - - private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { + private void computeReadLikelihoods( final ReadLikelihoods.Matrix likelihoods) { // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities - List processedReads = modifyReadQualities(reads); - - // Get alleles corresponding to our haplotypees - Map alleleHaplotypeMap = createAlleleMap(haplotypes); - - // Get an array containing the constantGCP for each read in our modified read list - Map GCPArrayMap = fillGCPArrays(processedReads); + final List processedReads = modifyReadQualities(likelihoods.reads()); // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype - PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); + pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,constantGCP); - // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate + if (WRITE_LIKELIHOODS_TO_FILE) + writeDebugLikelihoods(likelihoods); + } - return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); + private void writeDebugLikelihoods(final ReadLikelihoods.Matrix likelihoods) { + final List reads = 
likelihoods.reads(); + final List haplotypes = likelihoods.alleles(); + final int haplotypeCount = haplotypes.size(); + final int readCount = reads.size(); + for (int r = 0; r < readCount; r++) + for (int a = 0; a < haplotypeCount; a++) + writeDebugLikelihoods(reads.get(r),haplotypes.get(a),likelihoods.get(a,r)); + likelihoodsStream.flush(); + } + + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities()), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); } @Requires({"alleleOrdering.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) + @Deprecated public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, - final Map stratifiedReadMap, - final List alleleOrdering, + final ReadLikelihoods readLikelihoods, + final List alleleOrdering, final boolean normalize ) { - return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); + return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), readLikelihoods, alleleOrdering, normalize); } @Requires({"alleleOrdering.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, - final Map stratifiedReadMap, - final List alleleOrdering, + @Deprecated + private static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, + final ReadLikelihoods readLikelihoods, + final List alleleOrdering, final boolean normalize) { final int numHaplotypes = alleleOrdering.size(); + final 
int[] alleleIndices = new int[alleleOrdering.size()]; + final ListIterator alleleIterator = alleleOrdering.listIterator(); + int nextAlleleIndex = 0; + while (alleleIterator.hasNext()) + if ((alleleIndices[nextAlleleIndex++] = readLikelihoods.alleleIndex((Allele) alleleIterator.next())) == -1) + throw new IllegalArgumentException("allele " + alleleIterator.previous() + " not found in likelihood collection "); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); - } // compute the diploid haplotype likelihoods - for( int iii = 0; iii < numHaplotypes; iii++ ) { - final Allele iii_allele = alleleOrdering.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Allele jjj_allele = alleleOrdering.get(jjj); - double haplotypeLikelihood = 0.0; - for( final String sample : samples ) { - for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. 
- haplotypeLikelihood += ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); + for(final String sample : samples) { + final int sampleIndex = readLikelihoods.sampleIndex(sample); + if (sampleIndex == -1) + throw new IllegalArgumentException("the sample provided is not in the likelihood collection"); + final ReadLikelihoods.Matrix sampleLikelihoods = readLikelihoods.sampleMatrix(sampleIndex); + final int sampleReadCount = readLikelihoods.sampleReadCount(sampleIndex); + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final int iii_allele = alleleIndices[iii]; + for( int jjj = 0; jjj <= iii; jjj++ ) { + final int jjj_allele = alleleIndices[jjj]; + double haplotypeLikelihood = 0.0; + for (int r = 0; r < sampleReadCount; r++) { + final double value = MathUtils.approximateLog10SumLog10(sampleLikelihoods.get(iii_allele,r), + sampleLikelihoods.get(jjj_allele,r)) + MathUtils.LOG_ONE_HALF; + haplotypeLikelihood += value; } + haplotypeLikelihoodMatrix[iii][jjj] += haplotypeLikelihood; } - haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; } } @@ -431,6 +360,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) + @Deprecated protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { final int numHaplotypes = likelihoodMatrix.length; double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; @@ -450,131 +380,6 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula return likelihoodMatrix; } - // -------------------------------------------------------------------------------- - // - // System to compute the best N haplotypes for genotyping - // - // 
-------------------------------------------------------------------------------- -// -// /** -// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele -// * @param map an annoying map object that moves us between the allele and haplotype representation -// * @param haplotypeAsAllele the allele version of the haplotype -// * @return the haplotype version, with its score incremented by 1 if its non-reference -// */ -// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { -// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic -// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value -// return h; -// } -// -// /** -// * Take the best N haplotypes and return them as a list -// * -// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample -// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing -// * order of score (so higher score haplotypes are preferred). 
The N we take is determined by -// * -// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) -// * -// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is -// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound -// * -// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 -// * @param nSamples the number of samples used to select the haplotypes -// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples -// * @return a list of N or fewer haplotypes, with the reference haplotype first -// */ -// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { -// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); -// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); -// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; -// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); -// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); -// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); -// return bestHaplotypes; -// } -// -// /** -// * Select the best haplotypes for genotyping the samples in stratifiedReadMap -// * -// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely -// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for -// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get -// * one extra count for each haplotype (so hom-var haplotypes get two counts). 
After performing this calculation -// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the -// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. -// * -// * @param haplotypes a list of all haplotypes we're considering -// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype -// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes -// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation -// */ -// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { -// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); -// -// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes -// -// // all of the haplotypes that at least one sample called as one of the most likely -// final Set selectedHaplotypes = new HashSet<>(); -// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected -// -// // our annoying map from allele -> haplotype -// final Map allele2Haplotype = new HashMap<>(); -// for ( final Haplotype h : haplotypes ) { -// h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes -// allele2Haplotype.put(Allele.create(h, h.isReference()), h); -// } -// -// // for each sample, compute the most likely pair of haplotypes -// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { -// // get the two most likely haplotypes under a diploid model for this sample -// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); -// -// if ( mla != null ) { // there was something to evaluate in this sample -// // note that there must be at least 2 haplotypes -// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); -// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); -// -//// if ( DEBUG ) { -//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); -//// } -// -// // add these two haplotypes to the set of haplotypes that have been selected -// selectedHaplotypes.add(best); -// selectedHaplotypes.add(second); -// -// // we've already selected all of our haplotypes, and we don't need to prune them down -// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) -// break; -// } -// } -// -// // take the best N haplotypes forward, in order of the number of samples that choose them -// final int nSamples = stratifiedReadMap.size(); -// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); -// -// if ( DEBUG ) { -// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); -// for ( final Haplotype h : bestHaplotypes ) { -// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? 
" found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); -// } -// } -// return bestHaplotypes; -// } -// -// /** -// * Find the haplotype that isRef(), or @throw ReviewedGATKException if one isn't found -// * @param haplotypes non-null list of haplotypes -// * @return the reference haplotype -// */ -// private static Haplotype findReferenceHaplotype( final List haplotypes ) { -// for( final Haplotype h : haplotypes ) { -// if( h.isReference() ) return h; -// } -// throw new ReviewedGATKException( "No reference haplotype found in the list of haplotypes!" ); -// } - // -------------------------------------------------------------------------------- // // Experimental attempts at PCR error rate modeling diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java index 7edcfde39..bd72a764d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -1,52 +1,53 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ + package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; @@ -62,21 +63,25 @@ import java.util.Random; public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { @Override - public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> reads) { + public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, + final List samples, + final Map> reads) { final List haplotypes = assemblyResultSet.getHaplotypeList(); - final Map result = new HashMap<>(reads.size()); + final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads); final Map alleles = new HashMap<>(haplotypes.size()); for (final Haplotype haplotype : haplotypes) alleles.put(haplotype,Allele.create(haplotype,false)); final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - for (final String sample : reads.keySet()) { - final PerReadAlleleLikelihoodMap pralm = new PerReadAlleleLikelihoodMap(); - for (final GATKSAMRecord read : reads.get(sample)) - for (final Haplotype haplotype : haplotypes ) - 
pralm.add(read,alleles.get(haplotype),-Math.abs(rnd.nextDouble())); - result.put(sample,pralm); + final int sampleCount = samples.size(); + final int alleleCount = alleles.size(); + for (int i = 0; i < sampleCount; i++) { + final List sampleReads = result.sampleReads(i); + final int readCount = sampleReads.size(); + final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(i); + for (int a = 0; a < alleleCount; a++) + for (int r = 0; r < readCount; r++) + sampleLikelihoods.set(a,r,-Math.abs(rnd.nextDouble())); } - return result; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java index 535ca813f..6119cba1c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java @@ -46,7 +46,8 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.util.List; @@ -87,7 +88,7 @@ public interface ReadLikelihoodCalculationEngine { * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. * The value maps can be potentially empty though. 
*/ - public Map computeReadLikelihoods(AssemblyResultSet assemblyResultSet, + public ReadLikelihoods computeReadLikelihoods(AssemblyResultSet assemblyResultSet, List samples, Map> perSampleReadList); public void close(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index fd8769f4e..dc9512eee 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -47,13 +47,16 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.*; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.pileup.PileupElement; @@ -62,9 +65,6 @@ import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import 
htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; import java.io.File; import java.util.*; @@ -86,6 +86,7 @@ public class ReferenceConfidenceModel { private final GenomeLocParser genomeLocParser; private final Set samples; + private final SAMFileHeader header; // TODO -- really shouldn't depend on this private final int indelInformativeDepthIndelSize; private final static boolean WRITE_DEBUGGING_BAM = false; @@ -113,6 +114,7 @@ public class ReferenceConfidenceModel { this.genomeLocParser = genomeLocParser; this.samples = samples; + this.header = header; this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; if ( WRITE_DEBUGGING_BAM ) { @@ -156,10 +158,10 @@ public class ReferenceConfidenceModel { * * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the - * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. + * readLikelihoods, corresponding to each reads best haplotype. Must contain the refHaplotype. * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) * @param activeRegion the active region we want to get the reference confidence over - * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes + * @param readLikelihoods a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes * @param variantCalls calls made in this region. 
The return result will contain any variant call in this list in the * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) @@ -170,22 +172,22 @@ public class ReferenceConfidenceModel { final Collection calledHaplotypes, final GenomeLoc paddedReferenceLoc, final ActiveRegion activeRegion, - final Map stratifiedReadMap, + final ReadLikelihoods readLikelihoods, final List variantCalls) { if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); + if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount()); if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = 
getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, readLikelihoods); final byte[] ref = refHaplotype.getBases(); final List results = new ArrayList<>(refSpan.size()); - final String sampleName = stratifiedReadMap.keySet().iterator().next(); + final String sampleName = readLikelihoods.sample(0); final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); for ( final ReadBackedPileup pileup : refPileups ) { @@ -311,15 +313,15 @@ public class ReferenceConfidenceModel { final GenomeLoc paddedReferenceLoc, final ActiveRegion activeRegion, final GenomeLoc activeRegionSpan, - final Map stratifiedReadMap) { + final ReadLikelihoods readLikelihoods) { if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); - if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); - if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); + if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount()); final List reads = 
activeRegion.getReads(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java index ac33221d5..2c17d4d5c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java @@ -47,11 +47,10 @@ package org.broadinstitute.gatk.utils.haplotype; import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import java.util.*; @@ -66,7 +65,7 @@ import java.util.*; */ public class HaplotypeLDCalculator { private final List haplotypes; - private final Map haplotypeReadMap; + private final ReadLikelihoods readLikelihoods; private List> haplotypeLikelihoodsPerSample = null; // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] @@ -75,14 +74,15 @@ public class HaplotypeLDCalculator { /** * For testing */ + @SuppressWarnings("unchecked") protected HaplotypeLDCalculator() { haplotypes = Collections.emptyList(); - haplotypeReadMap = Collections.emptyMap(); + readLikelihoods = new ReadLikelihoods<>((List)Collections.EMPTY_LIST, (List)Collections.EMPTY_LIST, Collections.EMPTY_MAP); } - public HaplotypeLDCalculator(List haplotypes, Map haplotypeReadMap) { + public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { 
this.haplotypes = haplotypes; - this.haplotypeReadMap = haplotypeReadMap; + this.readLikelihoods = haplotypeReadMap; } /** @@ -94,13 +94,13 @@ public class HaplotypeLDCalculator { private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { if ( haplotypeLikelihoodsPerSample == null ) { // do the lazy computation - final Set samples = haplotypeReadMap.keySet(); - haplotypeLikelihoodsPerSample = new LinkedList>(); + final Set samples = new LinkedHashSet<>(readLikelihoods.samples()); + haplotypeLikelihoodsPerSample = new LinkedList<>(); for( final String sample : samples ) { - final Map map = new HashMap(haplotypes.size()); + final Map map = new HashMap<>(haplotypes.size()); for( final Haplotype h : haplotypes ) { // count up the co-occurrences of the events for the R^2 calculation - final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0]; + final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0]; map.put(h, haplotypeLikelihood); } haplotypeLikelihoodsPerSample.add(map); @@ -162,7 +162,7 @@ public class HaplotypeLDCalculator { * * The probability is just p11_22 / (p11_22 + p hets) * - * @table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + * @param table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] * doesn't have to be normalized as this function does the normalization internally * @return the real space probability that the data is phased */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java index 710fbc32c..f5f6a5bb8 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java @@ -49,12 +49,15 @@ package org.broadinstitute.gatk.utils.haplotype; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; -import java.util.*; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; /** * Merges VariantContexts in a series of haplotypes according to their pairwise LD @@ -94,19 +97,19 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes { * Merge as many events among the haplotypes as possible based on pairwise LD among variants * * @param haplotypes a list of haplotypes whose events we want to merge - * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype + * @param readLikelihoods map from sample name -> read likelihoods for each haplotype * @param startPosKeySet a set of starting positions of all events among the haplotypes * @param ref the reference bases * @param refLoc the span of the reference bases */ @Override public boolean merge( final List haplotypes, - final Map haplotypeReadMap, + final ReadLikelihoods readLikelihoods, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); - if ( haplotypeReadMap == null ) throw new IllegalArgumentException("haplotypeReadMap cannot be null"); + if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be 
null"); if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); @@ -114,8 +117,8 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes { if( startPosKeySet.size() <= 1 ) { return false; } - final int nSamples = haplotypeReadMap.keySet().size(); - final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, haplotypeReadMap); + final int nSamples = readLikelihoods.sampleCount(); + final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods); boolean somethingWasMerged = false; boolean mapWasUpdated = true; while( mapWasUpdated ) { @@ -207,7 +210,7 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes { * @param haplotypes our haplotypes * @param thisStart the starting position of the first event to merge * @param nextStart the starting position of the next event to merge - * @return + * @return never {@code null}. 
*/ private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) { final LDMergeData mergeData = new LDMergeData(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java index f3565e4b8..3add2a41b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java @@ -47,10 +47,9 @@ package org.broadinstitute.gatk.utils.haplotype; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import java.util.List; -import java.util.Map; import java.util.TreeSet; /** @@ -63,14 +62,14 @@ public class MergeVariantsAcrossHaplotypes { * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate * * @param haplotypes a list of haplotypes whose events we want to merge - * @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype + * @param readLikelihoods map from sample name -> read likelihoods for each haplotype * @param startPosKeySet a set of starting positions of all events among the haplotypes * @param ref the reference bases * @param refLoc the span of the reference bases * @return true if anything was merged */ public boolean merge( final List haplotypes, - final Map haplotypeReadMap, + final ReadLikelihoods readLikelihoods, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index ac9706011..ede47f03b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -47,13 +47,13 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; /** * A haplotype bam writer that writes out all haplotypes as reads and then @@ -66,6 +66,7 @@ import java.util.*; * Time: 1:50 PM */ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { + public AllHaplotypeBAMWriter(final ReadDestination destination) { super(destination); } @@ -78,13 +79,11 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { final GenomeLoc paddedReferenceLoc, final Collection bestHaplotypes, final Set calledHaplotypes, - final Map stratifiedReadMap) { + final ReadLikelihoods readLikelihoods) { writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc); - - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { + final int sampleCount = readLikelihoods.sampleCount(); + for (int s = 0; s < sampleCount; s++) + for (final GATKSAMRecord read : readLikelihoods.sampleReads(s)) 
writeReadAgainstHaplotype(read); - } - } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index b1c0acb74..ff1afa0f7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -47,15 +47,11 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import java.util.Collection; -import java.util.HashMap; -import java.util.Map; import java.util.Set; /** @@ -82,16 +78,15 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { final GenomeLoc paddedReferenceLoc, final Collection bestHaplotypes, final Set calledHaplotypes, - final Map stratifiedReadMap) { + final ReadLikelihoods readLikelihoods) { if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes return; writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); - for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { - for ( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { + final int sampleCount = readLikelihoods.sampleCount(); + for (int s = 0; s < sampleCount; s++) + for (final GATKSAMRecord read : readLikelihoods.sampleReads(s)) writeReadAgainstHaplotype(read); 
- } - } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 082cd73b5..8df32f902 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -46,22 +46,18 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; -import htsjdk.samtools.Cigar; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMTag; import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; -import org.broadinstitute.gatk.utils.sam.CigarUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.smithwaterman.SWPairwiseAlignment; import java.util.Collection; import java.util.HashSet; -import java.util.Map; import java.util.Set; /** @@ -155,17 +151,17 @@ public abstract class HaplotypeBAMWriter { * @param paddedReferenceLoc the span of the based reference here * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference - * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes + * @param readLikelihoods a map from sample -> likelihoods for each read for each of the best haplotypes */ public abstract void 
writeReadsAlignedToHaplotypes(final Collection haplotypes, final GenomeLoc paddedReferenceLoc, final Collection bestHaplotypes, final Set calledHaplotypes, - final Map stratifiedReadMap); + final ReadLikelihoods readLikelihoods); public void writeReadsAlignedToHaplotypes(final Collection haplotypes, final GenomeLoc paddedReferenceLoc, - final Map stratifiedReadMap) { + final ReadLikelihoods stratifiedReadMap) { writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap); } @@ -210,7 +206,7 @@ public abstract class HaplotypeBAMWriter { record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); record.setReadName("HC" + uniqueNameCounter++); - record.setAttribute(AlignmentUtils.HAPLOTYPE_TAG, haplotype.hashCode()); + record.setAttribute(AlignmentUtils.HAPLOTYPE_TAG,haplotype.hashCode()); record.setReadUnmappedFlag(false); record.setReferenceIndex(paddedRefLoc.getContigIndex()); record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java index 534facd44..0bcf4ee62 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java @@ -25,13 +25,16 @@ package org.broadinstitute.gatk.engine.downsampling; -import org.broadinstitute.gatk.utils.*; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.collections.DefaultHashMap; import org.broadinstitute.gatk.utils.exceptions.GATKException; import 
org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.*; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.text.XReadLines; import htsjdk.variant.variantcontext.Allele; @@ -39,8 +42,6 @@ import java.io.File; import java.io.IOException; import java.util.*; -import org.apache.log4j.Logger; - public class AlleleBiasedDownsamplingUtils { // define this class so that we can use Java generics below @@ -216,7 +217,7 @@ public class AlleleBiasedDownsamplingUtils { * @param downsamplingFraction the fraction of total reads to remove per allele * @return list of reads TO REMOVE from allele biased down-sampling */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { + public static
List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { int totalReads = 0; for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index d875f15f5..60c882417 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -27,17 +27,18 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.commandline.RodBinding; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import java.util.*; @@ -204,6 +205,15 @@ public class VariantAnnotatorEngine { return annotateDBs(tracker, annotated); } + public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker, + final ReadLikelihoods readLikelihoods, + final VariantContext vc) { + 
//TODO we transform the read-likelihood into the Map^2 previous version for the sake of not changing the annotation interface. + //TODO should we change those interfaces? + final Map annotationLikelihoods = readLikelihoods.toPerReadAlleleLikelihoodMap(); + return annotateContextForActiveRegion(tracker, annotationLikelihoods, vc); + } + public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker, final Map perReadAlleleLikelihoodMap, final VariantContext vc) { diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java index da29e0c42..eb0d408e3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java @@ -29,6 +29,7 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; @@ -125,7 +126,27 @@ public abstract class PairHMM { * @param readMaxLength the max length of reads we want to use with this PairHMM */ public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { - initialize(readMaxLength, haplotypeMaxLength); + initialize(readMaxLength, haplotypeMaxLength); + } + + private int findMaxReadLength(final GATKSAMRecord ... 
reads) { + int max = 0; + for (final GATKSAMRecord read : reads) { + final int readLength = read.getReadLength(); + if (max < readLength) + max = readLength; + } + return max; + } + + private int findMaxAlleleLength(final List alleles) { + int max = 0; + for (final Allele allele : alleles) { + final int alleleLength = allele.length(); + if (max < alleleLength) + max = alleleLength; + } + return max; } protected int findMaxReadLength(final List reads) { @@ -147,6 +168,63 @@ return listMaxHaplotypeLength; } + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base substitution, insertion, and deletion probabilities. + * + * @param processedReads reads to analyze + * @param likelihoods where to store the likelihoods where position [a][r] is reserved for the likelihood of {@code reads[r]} + * conditional to {@code alleles[a]}. + * @param constantGCP constant penalty for gap continuations. + * + * Results are written into {@code likelihoods}; this method has no return value. 
+ */ + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final byte constantGCP) { + if(doProfiling) + startTime = System.nanoTime(); + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(processedReads); + final int haplotypeMaxLength = findMaxAlleleLength(likelihoods.alleles()); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) + initialize(readMaxLength, haplotypeMaxLength); + + final int readCount = processedReads.size(); + final List alleles = likelihoods.alleles(); + final int alleleCount = alleles.size(); + mLikelihoodArray = new double[readCount * alleleCount]; + int idx = 0; + int readIndex = 0; + for(final GATKSAMRecord read : processedReads){ + final int readLength = read.getReadLength(); + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = new byte[readLength]; + Arrays.fill(overallGCP,constantGCP); + + // peek at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + final boolean isFirstHaplotype = true; + for (int a = 0; a < alleleCount; a++) { + final Allele allele = alleles.get(a); + final byte[] alleleBases = allele.getBases(); + final byte[] nextAlleleBases = a == alleles.size() - 1 ? 
null : alleles.get(a + 1).getBases(); + final double lk = computeReadLikelihoodGivenHaplotypeLog10(alleleBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextAlleleBases); + likelihoods.set(a, readIndex, lk); + mLikelihoodArray[idx++] = lk; + } + readIndex++; + } + if(doProfiling) { + threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); + //synchronized(doProfiling) + { + pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff; + } + } + } + /** * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from * each haplotype given base substitution, insertion, and deletion probabilities. @@ -156,6 +234,7 @@ public abstract class PairHMM { * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of * said read coming from the said haplotype under the provided error model + * @deprecated */ public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { if(doProfiling) @@ -178,7 +257,7 @@ public abstract class PairHMM { // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) byte[] currentHaplotypeBases = null; - boolean isFirstHaplotype = true; + final boolean isFirstHaplotype = true; Allele currentAllele = null; double log10l; //for (final Allele allele : alleleHaplotypeMap.keySet()){ diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index 289275abd..4c9a4447b 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -31,7 +31,6 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.haplotype.Haplotype; @@ -75,7 +74,11 @@ public final class AlignmentUtils { * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame. * @param isInformative true if the read is differentially informative for one of the haplotypes - * @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible + * + * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it + * does not have a Cigar or the {@code referenceStart} is invalid (less than 1). + * + * @return a GATKSAMRecord aligned to reference. Never {@code null}. */ public static GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead, final Haplotype haplotype, From 0b472f6bffc8b15ea1433b0dff768b0f21d3e531 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 7 Aug 2014 17:05:08 -0400 Subject: [PATCH 3/5] Added new test to verify the functionality of ReadLikelihoods.java and its use in HC. Updated existing integration test md5s. 
Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 --- ...LikelihoodCalculationEnginesBenchmark.java | 6 +- ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 16 +- ...ngLikelihoodCalculationEngineUnitTest.java | 4 +- .../ReferenceConfidenceModelUnitTest.java | 13 +- .../genotyper/ReadLikelihoodsUnitTest.java | 559 ++++++++++++++++++ .../gatk/utils/UtilsUnitTest.java | 34 +- 7 files changed, 615 insertions(+), 21 deletions(-) create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java index 39171a667..66af674e9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -111,8 +111,8 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { @SuppressWarnings("unused") public void timeGraphBasedLikelihoods(final int reps) { for (int i = 0; i < reps; i++) { - GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX); - rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonMap("anonymous", dataSet.readList())); + final GraphBasedLikelihoodCalculationEngineInstance rtlce = new 
GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX); + rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); } } @@ -121,7 +121,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { for (int i = 0; i < reps; i++) { final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); - engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonMap("anonymous", dataSet.readList())); + engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 573e61df7..2e8fc67db 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a65d432437670d2816a42ea270c06a1"); + "7096aa0ba002757cdcb9e1868d51a85f"); } 
private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "a5bb8f7eb1f7db997be9fd8928a788f6"); + "45be554608485d8d3979284277d00f95"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 72c47b8ba..033fff46a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -49,13 +49,13 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCodec; import org.testng.Assert; import org.testng.annotations.Test; @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", 
"60e2f0c3ce33a05c060035d86bc79543"); + HCTest(CEUTRIO_BAM, "", "321a0b7cb6ff80691f23566670e1d604"); } @Test @@ -99,12 +99,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerGraphBasedSingleSample() { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "07910f50710349eacd2560452fac3e8d"); + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "6cf15ddbfa4a3738e891fd9a09da8d07"); } @Test public void testHaplotypeCallerGraphBasedMultiSample() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "c5ef449a46b80b69dde87aa52041fe50"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "4c2a2dad6379b13fee4c7faca17441f5"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "b61e0bdf0e3180cb4f5abd3491b05aa6"); + "07c5df4cbd91170d0061355c1a8f7fc9"); } @Test @@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("fc2e81d02c6bbf15147a46797402fda7")); + Arrays.asList("7748a90ffd7daa15a8d518fc528bfcc5")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -253,7 +253,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + 
b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("83bf354f0238a362d4b6ef8d14679999")); + Arrays.asList("6bea6d26a3512a7b74d9610d188b6e9a")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java index 5edb1223a..db6923feb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -262,7 +262,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi dataSet = (ActiveRegionTestDataSet) params[0]; if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13)); graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX); - graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonMap("anonymous",dataSet.readList())).get("anonymous"); + graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0); // clip reads at the anchors. 
final Map clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList()); @@ -272,7 +272,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi clippedReadList.add(clippedReads.containsKey(r) ? clippedReads.get(r) : r); } - loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonMap("anonymous",clippedReadList)).get("anonymous"); + loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0); // Change clipped by unclipped in the resulting likelihood map. for (final GATKSAMRecord r : clippedReads.keySet()) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index c807457e5..08edeee98 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -48,9 +48,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.samtools.SAMFileHeader; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; @@ -285,7 +288,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); } - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); @@ -302,7 +305,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { final List calls = Collections.emptyList(); data.getActiveRegion().add(data.makeRead(start, readLen)); - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); @@ -337,7 +340,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); } - final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + final ReadLikelihoods likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final List 
expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java new file mode 100644 index 000000000..c339a2269 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java @@ -0,0 +1,559 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Test code for {@link ReadLikelihoods} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoodsUnitTest +{ + private static final double EPSILON = 1e-6; + private static final int ODD_READ_START = 101; + private static final int EVEN_READ_START = 1; + + @Test(dataProvider = "dataSets") + public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + + Assert.assertEquals(result.sampleCount(), samples.length); + Assert.assertEquals(result.alleleCount(), alleles.length); + + + testSampleQueries(samples, reads, result); + testAlleleQueries(alleles, result); + testLikelihoodMatrixQueries(samples, result, null); + } + + @Test(dataProvider = "dataSets") + public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); + testLikelihoodMatrixQueries(samples, result, likelihoods); + } + + private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods result) 
{ + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final double[][][] likelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < likelihoods.length; s++) { + final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); + for (int a = 0; a < likelihoods[s].length; a++) { + likelihoods[s][a] = new double[result.sampleReadCount(s)]; + for (int r = 0; r < likelihoods[s][a].length; r++) + sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian())); + } + } + return likelihoods; + } + + @Test(dataProvider = "dataSets") + public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + final double[] bestLkArray = new double[sampleReadCount]; + final int[] bestIndexArray = new int[sampleReadCount]; + final double[] confidenceArray = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + bestLkArray[r] = bestAlleleLk; + confidenceArray[r] = bestAlleleLk - secondBestAlleleLk; + bestIndexArray[r] = bestAlleleIndex; + } + final Collection.BestAllele> bestAlleles = original.bestAlleles(); + for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { + final int readIndex = 
original.readIndex(s,bestAllele.read); + if (readIndex == -1) continue; + Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood); + Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]); + Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON); + } + } + } + + @Test(dataProvider = "dataSets") + public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final Map> expected = new HashMap<>(alleles.length); + for (final Allele allele : alleles) + expected.put(allele,new ArrayList()); + + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) + expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.read(r)); + } + } + + final Map> actual = original.readsByBestAlleleMap(); + + Assert.assertEquals(actual.size(),alleles.length); + for (final Allele allele : alleles) { + final List expectedList = expected.get(allele); + final List actualList = actual.get(allele); + final Set expectedSet = new HashSet<>(expectedList); + final Set actualSet = new HashSet<>(actualList); + Assert.assertEquals(actualSet,expectedSet); + } + } + + 
@Test(dataProvider = "dataSets") + public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int r = 0; r < sampleReadCount; r++) { + if ((r & 1) == 0) continue; + for (int a = 0; a < alleles.length; a++) + original.sampleMatrix(s).set(a,r,-10000); + } + } + + final ReadLikelihoods result = original.clone(); + result.filterPoorlyModeledReads(2.0); + + for (int s = 0; s < samples.length; s++) { + final int oldSampleReadCount = original.sampleReadCount(s); + final int newSampleReadCount = result.sampleReadCount(s); + Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2); + final ReadLikelihoods.Matrix newSampleMatrix = result.sampleMatrix(s); + final ReadLikelihoods.Matrix oldSampleMatrix = original.sampleMatrix(s); + for (int r = 0 ; r < newSampleReadCount; r++) { + Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2); + for (int a = 0; a < alleles.length; a++) { + Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2)); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap); + final double[][][] newLikelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < 
samples.length ; s++) + for (int a = 0; a < alleles.length; a++) { + newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < newLikelihoods[s][a].length; r++) { + Assert.assertEquals(result.readIndex(s,sampleMatrix.read(r << 1)),r); + newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = 
Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(oldSampleReadCount,sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= 
original.clone(); + result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = originalLikelihoods[s][0][r]; + for (int a = 1; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + } + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(false, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + 
newLikelihoods[s][a][r] = originalLikelihoods[s][a][r]; + } + else + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001); + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(true, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + double bestLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + else + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + + @Test(dataProvider = "dataSets") + public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { + final 
ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1); + Assert.assertEquals(result.alleleIndex(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1); + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1); + final int sampleReadCount = original.sampleReadCount(s); + final int ordinaryAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = newLikelihoods[s][0][r]; + double secondBestLk = Double.NEGATIVE_INFINITY; + for (int a = 1; a < ordinaryAlleleCount; a++) { + final double lk = originalLikelihoods[s][a][r]; + if (lk > bestLk) { + secondBestLk = bestLk; + bestLk = lk; + } else if (lk > secondBestLk) { + secondBestLk = lk; + } + } + final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? 
bestLk : secondBestLk; + newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + final double[][] likelihoodMatrix = result.sampleValues(sampleIndex); + final int sampleReadCount = result.sampleReadCount(sampleIndex); + Assert.assertEquals(result.alleleCount(), likelihoodMatrix.length); + for (int a = 0; a < likelihoodMatrix.length; a++) { + Assert.assertEquals(likelihoodMatrix[a].length,sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + Assert.assertEquals(likelihoodMatrix[a][r], + likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); + } + } + } + + private void testAlleleQueries(Allele[] alleles, ReadLikelihoods result) { + final Set alleleIndices = new HashSet<>(); + for (final Allele allele : alleles) { + final int alleleIndex = result.alleleIndex(allele); + Assert.assertTrue(alleleIndex >= 0); + Assert.assertFalse(alleleIndices.contains(alleleIndex)); + alleleIndices.add(alleleIndex); + Assert.assertSame(allele,alleles[alleleIndex]); + } + } + + private void testSampleQueries(String[] samples, Map> reads, ReadLikelihoods result) { + final Set sampleIds = new HashSet<>(samples.length); + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + Assert.assertTrue(sampleIndex >= 0); + Assert.assertFalse(sampleIds.contains(sampleIndex)); + sampleIds.add(sampleIndex); + + final List sampleReads = result.sampleReads(sampleIndex); + final Set sampleReadsSet = new HashSet<>(sampleReads); + final List expectedSampleReadArray = reads.get(sample); + final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); + Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); + + final int sampleReadCount = 
sampleReads.size(); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); + final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); + Assert.assertEquals(readIndex,r); + } + } + } + + private String[][] SAMPLE_SETS = new String[][] { + {"A","B","C"}, + {"A"}, + {"C","A","D","E","Salsa","Gazpacho"}, + }; + + private Allele[][] ALLELE_SETS = new Allele[][] { + {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, + {Allele.create("A",true)}, + {Allele.create("ATTTA"), Allele.create("A",true)}, + {Allele.create("A"), Allele.create("AT",true)} + }; + + @DataProvider(name="marginalizationDataSets") + public Object[][] marginalizationDataSets() { + try { + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) + for (int a = 0; a < ALLELE_SETS.length; a++) { + for (int b = 0; b < ALLELE_SETS.length; b++) { + if (ALLELE_SETS[b].length < ALLELE_SETS[a].length) + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b]) + }; + } + } + return Arrays.copyOf(result,nextIndex); + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + private Map> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) { + final Map> result = new HashMap<>(toAlleles.length); + for (final Allele toAllele : toAlleles ) + result.put(toAllele,new ArrayList(fromAlleles.length)); + final ArrayList remaining = new ArrayList<>(Arrays.asList(fromAlleles)); + int nextToIndex = 0; + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + for (int i = 0; i < fromAlleles.length; i++) { + final int fromAlleleIndex = rnd.nextInt(remaining.size()); + 
result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex)); + nextToIndex = (nextToIndex + 1) % toAlleles.length; + } + return result; + } + + + @DataProvider(name="dataSets") + public Object[][] dataSets() { + try { + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) + for (int a = 0; a < ALLELE_SETS.length; a++) { + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd) + }; + } + return result; + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + final SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader(); + final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary()); + + private Map> dataSetReads(final String[] samples, + final Random rnd) { + final Map> result = new HashMap<>(samples.length); + for (final String sample : samples) { + final int readCount = rnd.nextInt(100); + final List reads = new ArrayList<>(readCount); + for (int r = 0; r < readCount; r++) { + final int alignmentStart = (r & 1) == 0 ? 
EVEN_READ_START : ODD_READ_START; + reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER, + "RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M")); + } + result.put(sample,reads); + } + return result; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java index 9c4027995..8fd4c3c3a 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java @@ -28,7 +28,6 @@ package org.broadinstitute.gatk.utils; import org.apache.commons.io.FileUtils; import org.broadinstitute.gatk.utils.io.IOUtils; import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -255,4 +254,37 @@ public class UtilsUnitTest extends BaseTest { }; } + + @Test(dataProvider = "skimArrayData") + public void testSkimArray(final String original, final String remove) { + final StringBuilder resultBuilder = new StringBuilder(); + final boolean[] removeBoolean = new boolean[remove.length()]; + for (int i = 0; i < original.length(); i++) + if (remove.charAt(i) == '1') { + resultBuilder.append(original.charAt(i)); + removeBoolean[i] = false; + } else + removeBoolean[i] = true; + + final String expected = resultBuilder.toString(); + final byte[] resultBytes = Utils.skimArray(original.getBytes(),removeBoolean); + final String resultString = new String(resultBytes); + Assert.assertEquals(resultString,expected); + } + + @DataProvider(name = "skimArrayData") + public Object[][] skimArrayData() { + return new Object[][] { + {"romeo+juliette" , "11111111111111" }, + {"romeo+juliette" , "11111011111111" }, + {"romeo+juliette" , "00000011111111" }, + {"romeo+juliette" , "11111100000000" }, + 
{"romeo+juliette" , "11111011111111" }, + {"romeo+juliette" , "01111010000001" }, + {"romeo+juliette" , "01100110000110" }, + {"romeo+juliette" , "10101010101010" }, + {"romeo+juliette" , "01010101010101" }, + {"romeo+juliette" , "01111010111001" }, + }; + } } From 9a9a68409e279799469038770e14b8a4cca7bee7 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 31 Jul 2014 15:47:46 -0400 Subject: [PATCH 4/5] ReadLikelihoods class introduction final changes before merging Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 Changes: Done some changes that I missed in relation with making sure that all PairHMM implentations use the same interface; as a consequence we were running always the standard PairHMM. Fixed some additional bugs detected when running it on full wgs single sample and exom multi sample data set. Updated some integration test md5s. Fixing GraphBased bugs with new master code Fixed ReadLikelihoods.changeReads difficult to spot bug. Changed PairHMM interface to fix a bug Fixed missing changes for various PairHMM implementations to get them to use the new structure. Fixed various bugs only detectable when running with full sample(s). 
Believe to have fixed the lack of annotations in UG runs Fixed integration test MD5s Updating some md5s Fixed yet another md5 probably left out by mistake --- ...GraphBasedLikelihoodCalculationEngine.java | 7 +- .../HaplotypeCallerGenotypingEngine.java | 5 +- .../PairHMMLikelihoodCalculationEngine.java | 13 ++- .../indels/PairHMMIndelErrorModel.java | 37 ++++--- .../gatk/utils/pairhmm/CnyPairHMM.java | 45 ++++----- .../utils/pairhmm/DebugJNILoglessPairHMM.java | 55 +++++------ .../utils/pairhmm/VectorLoglessPairHMM.java | 50 ++++------ ...perGeneralPloidySuite1IntegrationTest.java | 2 +- ...perGeneralPloidySuite2IntegrationTest.java | 2 +- ...dGenotyperIndelCallingIntegrationTest.java | 14 +-- ...GenotyperNormalCallingIntegrationTest.java | 13 +-- ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../genotyper/ReadLikelihoodsUnitTest.java | 52 +++++++++- .../gatk/utils/genotyper/ReadLikelihoods.java | 62 +++++++++++- .../gatk/utils/pairhmm/PairHMM.java | 97 +++---------------- 16 files changed, 245 insertions(+), 215 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 7ba81ca8c..ace4f22d2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -119,9 +119,12 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution); final List haplotypes = assemblyResultSet.getHaplotypeList(); final List supportedHaplotypes = 
graphLikelihoodEngine.getHaplotypeList(); - if (supportedHaplotypes.size() != haplotypes.size()) + final ReadLikelihoods result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes, samples, perSampleReadList); + if (supportedHaplotypes.size() != haplotypes.size()) { logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size()); - final ReadLikelihoods result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,samples,perSampleReadList); + result.addMissingAlleles(haplotypes,Double.NEGATIVE_INFINITY); + } + if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result); return result; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 32f54436c..a1bf09043 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -338,7 +338,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + protected static Map> createEventMapper( final int loc, final List eventsAtThisLoc, final List haplotypes) { final Map> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); final Event refEvent = new Event(null); diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index e662806fe..4feed71d9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -275,13 +275,24 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities final List processedReads = modifyReadQualities(likelihoods.reads()); + final Map gapContinuationPenalties = buildGapContinuationPenalties(processedReads,constantGCP); // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype - pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,constantGCP); + pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,gapContinuationPenalties); if (WRITE_LIKELIHOODS_TO_FILE) writeDebugLikelihoods(likelihoods); } + private Map buildGapContinuationPenalties(final List processedReads, final byte gcp) { + final Map result = new HashMap<>(processedReads.size()); + for (final GATKSAMRecord read : processedReads) { + final byte[] readGcpArray = new byte[read.getReadLength()]; + Arrays.fill(readGcpArray,gcp); + result.put(read,readGcpArray); + } + return result; + } + private void writeDebugLikelihoods(final ReadLikelihoods.Matrix likelihoods) { final List reads = likelihoods.reads(); final List haplotypes = likelihoods.alleles(); diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index 32e1dcd0e..a2422c544 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -47,11 +47,13 @@ package org.broadinstitute.gatk.tools.walkers.indels; import com.google.java.contract.Ensures; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.Log10PairHMM; @@ -61,12 +63,8 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; -import htsjdk.variant.variantcontext.Allele; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.Map; +import java.util.*; public class PairHMMIndelErrorModel { @@ -297,10 +295,8 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final LinkedList readList = new LinkedList<>(); - final Map 
readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; - for (PileupElement p: pileup) { + for (final PileupElement p: pileup) { // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { @@ -439,28 +435,31 @@ public class PairHMMIndelErrorModel { // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); - readList.add(processedRead); // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM - readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); + final Map readGCPArrayMap = Collections.singletonMap(processedRead,contextLogGapContinuationProbabilities); // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); + // Apparently more than one allele can map to the same haplotype after trimming + final Set distinctHaplotypesSet = new LinkedHashSet<>(trimmedHaplotypeMap.values()); + final List distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()])); // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
- final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); + final ReadLikelihoods rl = new ReadLikelihoods<>( + + Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead))); + final ReadLikelihoods.Matrix dummySampleLikelihoods = rl.sampleMatrix(0); + pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead), readGCPArrayMap); // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array - for (Allele a: trimmedHaplotypeMap.keySet()){ - double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); - perReadAlleleLikelihoodMap.add(p, a, readLikelihood); + for (final Allele a: trimmedHaplotypeMap.keySet()){ + final Haplotype h = trimmedHaplotypeMap.get(a); + final int hIndex = rl.alleleIndex(h); + final double readLikelihood = dummySampleLikelihoods.get(hIndex,0); readLikelihoods[readIdx][j++] = readLikelihood; + perReadAlleleLikelihoodMap.add(read,a,readLikelihood); } - // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually - readList.remove(processedRead); - - // The same is true for the read/GCP-array map - readGCPArrayMap.remove(processedRead); } } readIdx++; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java index 80be36f02..f506b7ba2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/CnyPairHMM.java @@ -46,10 +46,10 @@ package org.broadinstitute.gatk.utils.pairhmm; -import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.File; import java.lang.reflect.Field; @@ -192,47 +192,42 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp){ // initialize the pairHMM if necessary if (! initialized) { - int readMaxLength = findMaxReadLength(reads); - int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + int readMaxLength = findMaxReadLength(processedReads); + int haplotypeMaxLength = findMaxHaplotypeLength(likelihoods.alleles()); initialize(readMaxLength, haplotypeMaxLength); } // Pass the read bases/quals, and the haplotypes as a list into the HMM - performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + performBatchAdditions(processedReads, likelihoods.alleles(), gcp); - // Get the log10-likelihoods for each read/haplotype ant pack into the results map - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); - - return likelihoodMap; + // Get the log10-likelihoods for each read/haplotype ant pack into the read-likelihoods matrix. 
+ collectLikelihoodResults(likelihoods); } - private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { - for(final GATKSAMRecord read : reads){ - final double[] likelihoods = batchGetResult(); - int jjj = 0; - for (Allele allele : alleleHaplotypeMap.keySet()){ - final double log10l = likelihoods[jjj]; - likelihoodMap.add(read, allele, log10l); - jjj++; - } + private void collectLikelihoodResults(ReadLikelihoods.Matrix likelihoods) { + final int numHaplotypes = likelihoods.alleleCount(); + final int numReads = likelihoods.readCount(); + for(int r = 0; r < numReads; r++) { + final double[] likelihoodValues = batchGetResult(); + + for (int h = 0; h < numHaplotypes; h++) + likelihoods.set(h,r, likelihoodValues[h]); } } - private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { - final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + private void performBatchAdditions(final List reads, final List haplotypes, Map gcp) { for(final GATKSAMRecord read : reads){ final byte[] readBases = read.getReadBases(); final byte[] readQuals = read.getBaseQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); + final byte[] overallGCP = gcp.get(read); - batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java index 89c94a3e2..5be872f0e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java @@ -49,23 +49,21 @@ package org.broadinstitute.gatk.utils.pairhmm; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.gatk.utils.QualityUtils; - +import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*; -import java.util.List; -import java.util.Map; -import java.util.HashMap; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; -import java.io.BufferedWriter; -import java.util.Map; -import java.util.HashMap; import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*; /** @@ -156,10 +154,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + public void computeLikelihoods( final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map GCPArrayMap ) { // (re)initialize the pairHMM only if necessary - final int readMaxLength = verify ? findMaxReadLength(reads) : 0; - final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; + final int readMaxLength = verify ? findMaxReadLength(processedReads) : 0; + final int haplotypeMaxLength = verify ? 
findMaxHaplotypeLength(likelihoods.alleles()) : 0; if(verify) { if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) @@ -167,15 +165,15 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); } - int readListSize = reads.size(); - int numHaplotypes = alleleHaplotypeMap.size(); + int readListSize = processedReads.size(); + int numHaplotypes = likelihoods.alleleCount(); int numTestcases = readListSize*numHaplotypes; if(debug0_1) System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); int idx = 0; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { - byte [] overallGCP = GCPArrayMap.get(read); + final byte [] overallGCP = GCPArrayMap.get(read); if(debug0_1) System.out.println("Java read length "+read.getReadBases().length); if(debug3) @@ -195,9 +193,9 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { if(verify) { idx = 0; - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + for (final Haplotype h : likelihoods.alleles()) //order is important - access in same order always { - byte[] haplotypeBases = currEntry.getValue().getBases(); + byte[] haplotypeBases = h.getBases(); if(debug0_1) System.out.println("Java haplotype length "+haplotypeBases.length); if(debug3) @@ -212,10 +210,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { PerReadAlleleLikelihoodMap likelihoodMap = null; if(verify) { - jniPairHMM.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); + jniPairHMM.computeLikelihoods(likelihoods, processedReads, GCPArrayMap); likelihoodArray = jniPairHMM.getLikelihoodArray(); //to compare values - likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); + super.computeLikelihoods(likelihoods, processedReads, 
GCPArrayMap); } else { @@ -234,13 +232,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { idx = 0; int idxInsideHaplotypeList = 0; int readIdx = 0; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { for(int j=0;j currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always + for (final Haplotype haplotype : likelihoods.alleles())//order is important - access in same order always { - idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype); likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList]; ++idx; } @@ -271,13 +269,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { idx = 0; System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); boolean firstLine = true; - for(GATKSAMRecord read : reads) + for(final GATKSAMRecord read : processedReads) { byte [] overallGCP = GCPArrayMap.get(read); byte[] tmpByteArray = new byte[read.getReadBases().length]; - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + for (final Haplotype haplotype : likelihoods.alleles()) //order is important - access in same order always { - byte[] haplotypeBases = currEntry.getValue().getBases(); + byte[] haplotypeBases = haplotype.getBases(); debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true); debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true); for(int k=0;k haplotypeToHaplotypeListIdxMap = new HashMap(); + private HashMap haplotypeToHaplotypeListIdxMap = new HashMap<>(); private JNIHaplotypeDataHolderClass[] mHaplotypeDataArray; @Override public HashMap getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; } @@ -204,22 +194,23 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { * {@inheritDoc} */ @Override - public PerReadAlleleLikelihoodMap 
computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + public void computeLikelihoods( final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp ) { + if (processedReads.isEmpty()) + return; if(doProfiling) startTime = System.nanoTime(); - int readListSize = reads.size(); - int numHaplotypes = alleleHaplotypeMap.size(); - int numTestcases = readListSize*numHaplotypes; + int readListSize = processedReads.size(); + int numHaplotypes = likelihoods.alleleCount(); JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; int idx = 0; - for(GATKSAMRecord read : reads) + for(GATKSAMRecord read : processedReads) { readDataArray[idx] = new JNIReadDataHolderClass(); readDataArray[idx].readBases = read.getReadBases(); readDataArray[idx].readQuals = read.getBaseQualities(); readDataArray[idx].insertionGOP = read.getBaseInsertionQualities(); readDataArray[idx].deletionGOP = read.getBaseDeletionQualities(); - readDataArray[idx].overallGCP = GCPArrayMap.get(read); + readDataArray[idx].overallGCP = gcp.get(read); ++idx; } @@ -230,20 +221,18 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { // for(haplotypes) // compute_full_prob() jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, mHaplotypeDataArray, mLikelihoodArray, 12); - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - idx = 0; - int idxInsideHaplotypeList = 0; + int readIdx = 0; - for(GATKSAMRecord read : reads) + for(int r = 0; r < readListSize; r++) { - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always - { + int hapIdx = 0; + for (final Haplotype haplotype : likelihoods.alleles()) { + //Since the order of haplotypes in the List and alleleHaplotypeMap is different, //get idx of current haplotype in the list and use this idx to get the right likelihoodValue - idxInsideHaplotypeList = 
haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); - likelihoodMap.add(read, currEntry.getKey(), mLikelihoodArray[readIdx + idxInsideHaplotypeList]); - ++idx; + final int idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype); + likelihoods.set(hapIdx,r,mLikelihoodArray[readIdx + idxInsideHaplotypeList]); + ++hapIdx; } readIdx += numHaplotypes; } @@ -256,7 +245,6 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { pairHMMSetupTime += threadLocalSetupTimeDiff; } } - return likelihoodMap; } /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 7ac0c86df..3d6103966 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "ae90d6be28c19e455083a47bc95c3b1b"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "a9730f883a2e01e93d56b11321c9fd52"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 6d95098fe..98a1586f9 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9fc4f04105bde31bafc93548745cb67e"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","93cf7ed2645599d0ddd319c6989eadcc"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index e8971f820..286096111 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("8a4de9e1f59cffe80a4372cf02fe809e")); + Arrays.asList("24d5f65860507ec7f8d9441bfee8653a")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("2b92df91a9337b9d9f03db5699bb41f2")); + Arrays.asList("188dc4d70dc5f4de09f645270cc3f4e1")); executeTest(String.format("test 
indel calling, multiple technologies"), spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("591c9a7bd7d64974b4a2dd932fdef138")); + Arrays.asList("31aaad9adbf12bb7fb12d53080e5c2f5")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("e784b6d1b3725f72a2c42b19e8d24cb1")); + Arrays.asList("fecb8ae4e8f5ea2b2bf3826d674298f7")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("d101beb3d1f71a14a31438cce9786881")); + Arrays.asList("0662ec6fe6d6e9807f758b1c82adf056")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " 
-minIndelFrac 0.0", 1, - Arrays.asList("96c2ab043f523c7676e4bc36dd3c9d0b")); + Arrays.asList("3d5140ffc02a125ee26c63ec55449192")); executeTest("test minIndelFraction 0.0", spec); } @@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("53ce3aa9ee16302ab2ec21f742b5e157")); + Arrays.asList("7c62eb9142d9124e49c123dcc052ce57")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 2c61d097e..8398b1776 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -53,18 +53,19 @@ import java.util.Arrays; public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; // -------------------------------------------------------------------------------------------------------------- // // testing normal calling // - // 
-------------------------------------------------------------------------------------------------------------- + // ---------------------------------------------------- ---------------------------------------------------------- @Test public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("058b54cff14527485c7b1ab035ebb6c4")); + Arrays.asList("c6462121e5a419546f50d53646274fd3")); executeTest("test MultiSample Pilot1", spec); } @@ -96,7 +97,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("120270ddf48dfd461f756c89ea1ab074")); + Arrays.asList("85e8ba3cfeef5e3b72418bcfaec11668")); executeTest("test Multiple SNP alleles", spec); } @@ -112,7 +113,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("bc5a143868e3ad3acc9bb7c09798cdf2")); + Arrays.asList("bd867fb341e75a9f1fc78d25bda93f63")); executeTest("test reverse trim", spec); } @@ -120,7 +121,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec 
spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("02fd6357bd1082115822c8a931cbb6a3")); + Arrays.asList("e87419cd42fb9684ee3674ee54abe567")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 2e8fc67db..573e61df7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -94,7 +94,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "7096aa0ba002757cdcb9e1868d51a85f"); + "1a65d432437670d2816a42ea270c06a1"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { @@ -106,7 +106,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "45be554608485d8d3979284277d00f95"); + "a5bb8f7eb1f7db997be9fd8928a788f6"); } } diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 033fff46a..86aa7ca21 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "07c5df4cbd91170d0061355c1a8f7fc9"); + "4047ea70f37b59b782402f678ea2dfee"); } @Test diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java index c339a2269..61a6860fd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java @@ -388,6 +388,55 @@ public class ReadLikelihoodsUnitTest } + @Test(dataProvider = "dataSets") + public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + + // If all the alleles pass are present 
in the read-likelihoods collection there is no change. + result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + // If the allele list passed is empty there is no effect. + result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + final Allele newOne; + final Allele newTwo; + final Allele newThree; + + // We add a single missing. + result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); + Assert.assertEquals(original.alleleCount() + 1, result.alleleCount()); + + // We add too more amongst exisisting alleles: + result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.allele(1), + result.allele(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); + + Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); + + final List expectedAlleles = new ArrayList<>(original.alleles()); + expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); + + Assert.assertEquals(result.alleles(),expectedAlleles); + + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); + final int sampleReadCount = original.sampleReadCount(s); + final int originalAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); + newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); + newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; + 
Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { final ReadLikelihoods original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads); @@ -480,7 +529,8 @@ public class ReadLikelihoodsUnitTest {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, {Allele.create("A",true)}, {Allele.create("ATTTA"), Allele.create("A",true)}, - {Allele.create("A"), Allele.create("AT",true)} + {Allele.create("A"), Allele.create("AT",true)}, + {Allele.create("A",false), Allele.create("AT",false)}, }; @DataProvider(name="marginalizationDataSets") diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java index f03dfbe78..bf12d3a33 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -95,7 +95,7 @@ public class ReadLikelihoods implements Cloneable { /** * Index of the reference allele if any, otherwise -1. */ - private final int referenceAlleleIndex; + private int referenceAlleleIndex; /** * Caches the read-list per sample list returned by {@link #sampleReads} @@ -568,12 +568,66 @@ public class ReadLikelihoods implements Cloneable { final GATKSAMRecord replacement = readRealignments.get(read); if (replacement == null) continue; - readIndex.put(sampleReads[r] = replacement,r); readIndex.remove(read); + readIndex.put(sampleReads[r] = replacement,r); } } } + /** + * Add alleles that are missing in the read-likelihoods collection giving all reads a default + * likelihood value. 
+ * @param alleles the potentially missing alleles. + * @param defaultLikelihood the default read likelihood value for that allele. + * + * @throws IllegalArgumentException if {@code alleles} is {@code null} or there is more than + * one missing allele that is a reference or there is one but the collection already has + * a reference allele. + */ + public void addMissingAlleles(final Collection alleles, final double defaultLikelihood) { + if (alleles == null) + throw new IllegalArgumentException("the alleles list cannot be null"); + if (alleles.isEmpty()) + return; + final List allelesToAdd = new ArrayList(alleles.size()); + for (final A allele : alleles) + if (!alleleIndex.containsKey(allele)) + allelesToAdd.add(allele); + if (allelesToAdd.isEmpty()) + return; + + final int oldAlleleCount = this.alleles.length; + final int newAlleleCount = this.alleles.length + allelesToAdd.size(); + + alleleList = null; + int referenceIndex = this.referenceAlleleIndex; + this.alleles = Arrays.copyOf(this.alleles,newAlleleCount); + int nextIndex = oldAlleleCount; + for (final A allele : allelesToAdd) { + if (allele.isReference()) { + if (referenceIndex != -1) + throw new IllegalArgumentException("there cannot be more than one reference allele"); + referenceIndex = nextIndex; + } + alleleIndex.put(this.alleles[nextIndex] = allele, nextIndex++); + } + + if (referenceIndex != -1) + referenceAlleleIndex = referenceIndex; + + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount); + for (int a = oldAlleleCount; a < newAlleleCount; a++) { + newValuesBySampleIndex[a] = new double[sampleReadCount]; + if (defaultLikelihood != 0.0) + Arrays.fill(newValuesBySampleIndex[a],defaultLikelihood); + } + valuesBySampleIndex[s] = newValuesBySampleIndex; + } + } + /** * Likelihood matrix between a set of alleles and 
reads. * @param the allele-type. @@ -740,7 +794,7 @@ public class ReadLikelihoods implements Cloneable { * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. */ - public ReadLikelihoods marginalize(final Map> newToOldAlleleMap,final GenomeLoc overlap) { + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap, final GenomeLoc overlap) { if (overlap == null) return marginalize(newToOldAlleleMap); @@ -775,7 +829,7 @@ public class ReadLikelihoods implements Cloneable { final int newSampleReadCount = sampleReadsToKeep.length; if (newSampleReadCount == oldSampleReadCount) { newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); - newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + newReadsBySampleIndex[s] = oldSampleReads.clone(); } else { final Object2IntMap newSampleReadIndex = newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(newSampleReadCount); final GATKSAMRecord[] newSampleReads = newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java index eb0d408e3..6c4460cb3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java @@ -26,15 +26,15 @@ package org.broadinstitute.gatk.utils.pairhmm; import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.Allele; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; /** @@ -158,10 +158,9 @@ public abstract class PairHMM { return listMaxReadLength; } - protected int findMaxHaplotypeLength(final Map haplotypeMap) { + protected int findMaxHaplotypeLength(final Collection haplotypes) { int listMaxHaplotypeLength = 0; - for( final Allele a: haplotypeMap.keySet() ) { - final Haplotype h = haplotypeMap.get(a); + for( final Haplotype h : haplotypes) { final int haplotypeLength = h.getBases().length; if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } } @@ -172,14 +171,20 @@ public abstract class PairHMM { * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from * each haplotype given base substitution, insertion, and deletion probabilities. * - * @param processedReads reads to analyze + * @param processedReads reads to analyze instead of the ones present in the destination read-likelihoods. * @param likelihoods where to store the likelihoods where position [a][r] is reserved for the likelihood of {@code reads[r]} * conditional to {@code alleles[a]}. - * @param constantGCP constant penalty for gap continuations. + * @param gcp penalty for gap continuations base array map for processed reads. + * + * @throws IllegalArgumentException * * @return never {@code null}. 
*/ - public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final byte constantGCP) { + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, + final List processedReads, + final Map gcp) { + if (processedReads.isEmpty()) + return; if(doProfiling) startTime = System.nanoTime(); // (re)initialize the pairHMM only if necessary @@ -195,13 +200,11 @@ public abstract class PairHMM { int idx = 0; int readIndex = 0; for(final GATKSAMRecord read : processedReads){ - final int readLength = read.getReadLength(); final byte[] readBases = read.getReadBases(); final byte[] readQuals = read.getBaseQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = new byte[readLength]; - Arrays.fill(overallGCP,constantGCP); + final byte[] overallGCP = gcp.get(read); // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) final boolean isFirstHaplotype = true; @@ -225,78 +228,6 @@ public abstract class PairHMM { } } - /** - * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from - * each haplotype given base substitution, insertion, and deletion probabilities. - * - * @param reads the list of reads - * @param alleleHaplotypeMap the list of haplotypes - * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. 
- * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of - * said read coming from the said haplotype under the provided error model - * @deprecated - */ - public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { - if(doProfiling) - startTime = System.nanoTime(); - - // (re)initialize the pairHMM only if necessary - final int readMaxLength = findMaxReadLength(reads); - final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); - if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } - - final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - mLikelihoodArray = new double[reads.size()*alleleHaplotypeMap.size()]; - int idx = 0; - for(GATKSAMRecord read : reads){ - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = GCPArrayMap.get(read); - - // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - byte[] currentHaplotypeBases = null; - final boolean isFirstHaplotype = true; - Allele currentAllele = null; - double log10l; - //for (final Allele allele : alleleHaplotypeMap.keySet()){ - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()){ - //final Haplotype haplotype = alleleHaplotypeMap.get(allele); - final Allele allele = currEntry.getKey(); - final Haplotype haplotype = currEntry.getValue(); - final byte[] nextHaplotypeBases = haplotype.getBases(); - if (currentHaplotypeBases != null) { - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, 
isFirstHaplotype, nextHaplotypeBases); - mLikelihoodArray[idx++] = log10l; - likelihoodMap.add(read, currentAllele, log10l); - } - // update the current haplotype - currentHaplotypeBases = nextHaplotypeBases; - currentAllele = allele; - } - // process the final haplotype - if (currentHaplotypeBases != null) { - - // there is no next haplotype, so pass null for nextHaplotypeBases. - log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); - likelihoodMap.add(read, currentAllele, log10l); - mLikelihoodArray[idx++] = log10l; - } - } - if(doProfiling) - { - threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); - //synchronized(doProfiling) - { - pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff; - } - } - return likelihoodMap; - } - /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. From b39508cd159d6c285a5d1f3e63eb10683655a2c3 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Mon, 11 Aug 2014 17:47:25 -0400 Subject: [PATCH 5/5] ReadLikelihoods class introduction final changes before merging Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 Changes: Done some changes that I missed in relation with making sure that all PairHMM implentations use the same interface; as a consequence we were running always the standard PairHMM. Fixed some additional bugs detected when running it on full wgs single sample and exom multi sample data set. Updated some integration test md5s. 
--- .../walkers/indels/PairHMMIndelErrorModel.java | 4 ++-- ...enotyperGeneralPloidySuite1IntegrationTest.java | 2 +- ...enotyperGeneralPloidySuite2IntegrationTest.java | 2 +- ...nifiedGenotyperIndelCallingIntegrationTest.java | 14 +++++++------- ...ifiedGenotyperNormalCallingIntegrationTest.java | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index a2422c544..6f71868bb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -447,8 +447,8 @@ public class PairHMMIndelErrorModel { final List distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()])); // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
final ReadLikelihoods rl = new ReadLikelihoods<>( - Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead))); + final ReadLikelihoods.Matrix dummySampleLikelihoods = rl.sampleMatrix(0); pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead), readGCPArrayMap); @@ -458,7 +458,7 @@ public class PairHMMIndelErrorModel { final int hIndex = rl.alleleIndex(h); final double readLikelihood = dummySampleLikelihoods.get(hIndex,0); readLikelihoods[readIdx][j++] = readLikelihood; - perReadAlleleLikelihoodMap.add(read,a,readLikelihood); + perReadAlleleLikelihoodMap.add(p,a,readLikelihood); } } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 3d6103966..7ac0c86df 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "a9730f883a2e01e93d56b11321c9fd52"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "ae90d6be28c19e455083a47bc95c3b1b"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 98a1586f9..6d95098fe 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","93cf7ed2645599d0ddd319c6989eadcc"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9fc4f04105bde31bafc93548745cb67e"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 286096111..e8971f820 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("24d5f65860507ec7f8d9441bfee8653a")); + Arrays.asList("8a4de9e1f59cffe80a4372cf02fe809e")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest 
{ " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("188dc4d70dc5f4de09f645270cc3f4e1")); + Arrays.asList("2b92df91a9337b9d9f03db5699bb41f2")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("31aaad9adbf12bb7fb12d53080e5c2f5")); + Arrays.asList("591c9a7bd7d64974b4a2dd932fdef138")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("fecb8ae4e8f5ea2b2bf3826d674298f7")); + Arrays.asList("e784b6d1b3725f72a2c42b19e8d24cb1")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("0662ec6fe6d6e9807f758b1c82adf056")); + Arrays.asList("d101beb3d1f71a14a31438cce9786881")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class 
UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("3d5140ffc02a125ee26c63ec55449192")); + Arrays.asList("96c2ab043f523c7676e4bc36dd3c9d0b")); executeTest("test minIndelFraction 0.0", spec); } @@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("7c62eb9142d9124e49c123dcc052ce57")); + Arrays.asList("53ce3aa9ee16302ab2ec21f742b5e157")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 8398b1776..8e5b9aa68 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -65,7 +65,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("c6462121e5a419546f50d53646274fd3")); + Arrays.asList("058b54cff14527485c7b1ab035ebb6c4")); executeTest("test MultiSample Pilot1", spec); } @@ -97,7 +97,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public 
void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("85e8ba3cfeef5e3b72418bcfaec11668")); + Arrays.asList("120270ddf48dfd461f756c89ea1ab074")); executeTest("test Multiple SNP alleles", spec); } @@ -113,7 +113,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("bd867fb341e75a9f1fc78d25bda93f63")); + Arrays.asList("bc5a143868e3ad3acc9bb7c09798cdf2")); executeTest("test reverse trim", spec); } @@ -121,7 +121,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("e87419cd42fb9684ee3674ee54abe567")); + Arrays.asList("02fd6357bd1082115822c8a931cbb6a3")); executeTest("test mismatched PLs", spec); } }