From 09ac3779d63625f390f42a50be8b5a72e84c7718 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 24 Jul 2014 23:42:53 -0400 Subject: [PATCH] Added ReadLikelihoods component to substitute Map. It uses a more efficient java array[] based implementation and encapsulates operations perform with such a read-likelihood collection such as marginalization, filtering by position, poor modeling or capping worst likelihoods and so forth. Stories: https://www.pivotaltracker.com/story/show/70222086 https://www.pivotaltracker.com/story/show/67961652 --- .../org/broadinstitute/gatk/utils/Utils.java | 255 +++ .../gatk/utils/genotyper/ReadLikelihoods.java | 1502 +++++++++++++++++ 2 files changed, 1757 insertions(+) create mode 100644 public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java index 8fdec27b4..b4c0c75fa 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java @@ -34,6 +34,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.text.TextFormattingUtils; +import java.lang.reflect.Array; import java.math.BigInteger; import java.net.InetAddress; import java.security.MessageDigest; @@ -882,4 +883,258 @@ public class Utils { return false; return true; } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * @param original the original array to splice. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * + * @param the array type. 
+ * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T original, final boolean[] remove) { + return skimArray(original,0,null,0,remove,0); + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * + *

+     * If the {@code dest} array provided is not long enough a new one will be created and returned with the
+     * same component type. All elements before {@code destOffset} will be copied from the input to the
+     * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where
+     * the positions preceding {@code destOffset} will be left with the default value. The component type
+     * will match the one of the {@code source} array.
+     *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, final boolean[] remove, final int removeOffset) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + final int length = Array.getLength(source) - sourceOffset; + if (length < 0) + throw new IllegalArgumentException("the source offset goes beyond the source array length"); + return skimArray(source,sourceOffset,dest,destOffset,remove,removeOffset,length); + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * + *

+     * If the {@code dest} array provided is not long enough a new one will be created and returned with the
+     * same component type. All elements before {@code destOffset} will be copied from the input to the
+     * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where
+     * the positions preceding {@code destOffset} will be left with the default value. The component type
+     * will match the one of the {@code source} array.
+     *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * @param length the total number of position in {@code source} to consider. Thus only the {@code sourceOffset} to + * {@code sourceOffset + length - 1} region will be skimmed. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, + final boolean[] remove, final int removeOffset, final int length) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + if (remove == null) + throw new IllegalArgumentException("the remove array cannot be null"); + if (sourceOffset < 0) + throw new IllegalArgumentException("the source array offset cannot be negative"); + if (destOffset < 0) + throw new IllegalArgumentException("the destination array offset cannot be negative"); + if (removeOffset < 0) + throw new IllegalArgumentException("the remove array offset cannot be negative"); + if (length < 0) + throw new IllegalArgumentException("the length provided cannot be negative"); + + final int removeLength = Math.min(remove.length - removeOffset,length); + + if (removeLength < 0) + throw new IllegalArgumentException("the remove offset provided falls beyond the remove array end"); + + + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) 
source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + + final Class destClazz = skimArrayDetermineDestArrayClass(dest, sourceClazz); + + final int sourceLength = Array.getLength(source); + + if (sourceLength < length + sourceOffset) + throw new IllegalArgumentException("the source array is too small considering length and offset"); + + // count how many positions are to be removed. + + int removeCount = 0; + + final int removeEnd = removeLength + removeOffset; + for (int i = removeOffset; i < removeEnd; i++) + if (remove[i]) removeCount++; + + + final int newLength = length - removeCount; + + + @SuppressWarnings("unchecked") + final T result = skimArrayBuildResultArray(dest, destOffset, destClazz, newLength); + // No removals, just copy the whole thing. + + if (removeCount == 0) + System.arraycopy(source,sourceOffset,result,destOffset,length); + else if (length > 0) { // if length == 0 nothing to do. + int nextOriginalIndex = 0; + int nextNewIndex = 0; + int nextRemoveIndex = removeOffset; + while (nextOriginalIndex < length && nextNewIndex < newLength) { + while (nextRemoveIndex < removeEnd && remove[nextRemoveIndex++]) { nextOriginalIndex++; } // skip positions to be spliced. + // Since we make the nextNewIndex < newLength check in the while condition + // there is no need to include the following break, as is guaranteed not to be true: + // if (nextOriginalIndex >= length) break; // we reach the final (last positions are to be spliced. 
+ final int copyStart = nextOriginalIndex; + while (++nextOriginalIndex < length && (nextRemoveIndex >= removeEnd || !remove[nextRemoveIndex])) { nextRemoveIndex++; } + final int copyEnd = nextOriginalIndex; + final int copyLength = copyEnd - copyStart; + System.arraycopy(source, sourceOffset + copyStart, result, destOffset + nextNewIndex, copyLength); + nextNewIndex += copyLength; + } + } + return result; + } + + private static T skimArrayBuildResultArray(final T dest, final int destOffset, final Class destClazz, final int newLength) { + @SuppressWarnings("unchecked") + final T result; + + if (dest == null) + result = (T) Array.newInstance(destClazz.getComponentType(), newLength + destOffset); + else if (Array.getLength(dest) < newLength + destOffset) { + result = (T) Array.newInstance(destClazz.getComponentType(),newLength + destOffset); + if (destOffset > 0) System.arraycopy(dest,0,result,0,destOffset); + } else + result = dest; + return result; + } + + private static Class skimArrayDetermineDestArrayClass(final T dest, Class sourceClazz) { + final Class destClazz; + if (dest == null) + destClazz = sourceClazz; + else { + destClazz = (Class) dest.getClass(); + if (destClazz != sourceClazz) { + if (!destClazz.isArray()) + throw new IllegalArgumentException("the destination array class must be an array"); + if (sourceClazz.getComponentType().isAssignableFrom(destClazz.getComponentType())) + throw new IllegalArgumentException("the provided destination array class cannot contain values from the source due to type incompatibility"); + } + } + return destClazz; + } + + /** + * Makes a deep clone of the array provided. + * + *

+     * Unlike {@link Arrays#copyOf} or an array {@link Object#clone()}, which create a shallow copy,
+     * if it is multi-dimensional each sub-array or matrix will be cloned as well.
+     *

+ * + *

+     * Notice, however, that if the base type is an Object type, the base elements themselves won't be cloned.
+     *

+ * + * @param array the array to deep-clone. + * @param type of the array. + * + * @throws IllegalArgumentException if {@code array} is {@code null} or is not an array. + */ + public static T deepCloneArray(final T array) { + + if (array == null) + throw new IllegalArgumentException(""); + + @SuppressWarnings("unchecked") + final Class clazz = (Class) array.getClass(); + + + if (!clazz.isArray()) + throw new IllegalArgumentException("the input is not an array"); + + final int dimension = calculateArrayDimensions(clazz); + + return deepCloneArrayUnchecked(array,clazz, dimension); + } + + private static int calculateArrayDimensions(final Class clazz) { + if (clazz.isArray()) + return calculateArrayDimensions(clazz.getComponentType()) + 1; + else + return 0; + } + + private static T deepCloneArrayUnchecked(final T array, final Class clazz, final int dimension) { + + + final int length = Array.getLength(array); + + final Class componentClass = clazz.getComponentType(); + + final T result = (T) Array.newInstance(componentClass,length); + + if (dimension <= 1) { + System.arraycopy(array, 0, result, 0, length); + return result; + } + + + final int dimensionMinus1 = dimension - 1; + + for (int i = 0; i < length; i++) + Array.set(result,i,deepCloneArrayUnchecked(Array.get(array,i),componentClass,dimensionMinus1)); + + return result; + } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java new file mode 100644 index 000000000..f03dfbe78 --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -0,0 +1,1502 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* 
restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +/** + * Read-likelihoods container implementation based on integer indexed arrays. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoods implements Cloneable { + + /** + * Reads by sample index. Each sub array contains reference to the reads of the ith sample. + */ + private GATKSAMRecord[][] readsBySampleIndex; + + /** + * Indexed per sample, allele and finally read (within sample). + *

+ * valuesBySampleIndex[s][a][r] == lnLk(R_r | A_a) where R_r comes from Sample s. + *

+ */ + private double[][][] valuesBySampleIndex; + + /** + * Sorted list samples. + */ + private final String[] samples; + + /** + * Unmodifiable list version of the samples + */ + private List sampleList; + + /** + * Unmodifiable list version of the alleles + */ + private List
alleleList; + + /** + * Sorted array of alleles. + */ + private A[] alleles; + + /** + * Maps from sample name to its index. + */ + private final Object2IntMap sampleIndex; + + /** + * Maps from allele to its index. + */ + private final Object2IntMap alleleIndex; + + /** + * Maps from each read to its index within the sample. + */ + private final Object2IntMap[] readIndexBySampleIndex; + + /** + * Index of the reference allele if any, otherwise -1. + */ + private final int referenceAlleleIndex; + + /** + * Caches the read-list per sample list returned by {@link #sampleReads} + */ + private final List[] readListBySampleIndex; + + /** + * Sample matrices. + */ + private final Matrix[] sampleMatrices; + + /** + * Constructs a new read-likelihood collection. + * + *

+ * The initial likelihoods for all allele-read combinations are + * 0. + *

+ * + * @param samples all supported samples in the collection. + * @param alleles all supported alleles in the collection. + * @param reads reads stratified per sample. + * + * @throws IllegalArgumentException if any of {@code allele}, {@code samples} + * or {@code reads} is {@code null}, + * or if they contain null values. + */ + @SuppressWarnings("unchecked") + public ReadLikelihoods(final List samples, final List
alleles, + final Map> reads) { + if (alleles == null) + throw new IllegalArgumentException("allele list cannot be null"); + if (samples == null) + throw new IllegalArgumentException("sample list cannot be null"); + if (reads == null) + throw new IllegalArgumentException("read map cannot be null"); + + final int sampleCount = samples.size(); + final int alleleCount = alleles.size(); + this.alleles = alleles.toArray((A[]) new Allele[alleleCount]); + this.samples = samples.toArray(new String[sampleCount]); + readsBySampleIndex = new GATKSAMRecord[sampleCount][]; + readListBySampleIndex = new List[sampleCount]; + valuesBySampleIndex = new double[sampleCount][][]; + referenceAlleleIndex = findReferenceAllele(this.alleles); + + sampleIndex = new Object2IntOpenHashMap<>(sampleCount); + alleleIndex = new Object2IntOpenHashMap<>(alleleCount); + readIndexBySampleIndex = new Object2IntMap[sampleCount]; + + setupIndexes(reads, sampleCount, alleleCount); + + sampleMatrices = (Matrix[]) new Matrix[sampleCount]; + } + + // Add all the indices to alleles, sample and reads in the look-up maps. + private void setupIndexes(final Map> reads, final int sampleCount, final int alleleCount) { + alleleIndex.clear(); + for (int i = 0; i < alleleCount; i++) { + final A allele = alleles[i]; + if (allele == null) + throw new IllegalArgumentException("no allele can be null"); + alleleIndex.put(allele, i); + } + if (alleleIndex.size() != alleleCount) + throw new IllegalArgumentException("repeated alleles"); + + sampleIndex.clear(); + for (int i = 0; i < sampleCount; i++) { + setupSampleData(i, reads, alleleCount); + } + + if (sampleIndex.size() != sampleCount) + throw new IllegalArgumentException("repeated sample names"); + } + + // Assumes that {@link #samples} has been initialized with the sample names. 
+ private void setupSampleData(final int sampleIndex, final Map> readsBySample, + final int alleleCount) { + final String sample = this.samples[sampleIndex]; + if (sample == null) + throw new IllegalArgumentException("no sample can be null"); + this.sampleIndex.put(sample, sampleIndex); + final List reads = readsBySample.get(sample); + readsBySampleIndex[sampleIndex] = reads == null + ? new GATKSAMRecord[0] + : reads.toArray(new GATKSAMRecord[reads.size()]); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + readIndexBySampleIndex[sampleIndex] = new Object2IntOpenHashMap<>(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = readsBySampleIndex[sampleIndex][r]; + if (read == null) + throw new IllegalArgumentException("read cannot be null"); + if (readIndexBySampleIndex[sampleIndex].containsKey(read)) + throw new IllegalArgumentException("repeated read " + read); + readIndexBySampleIndex[sampleIndex].put(readsBySampleIndex[sampleIndex][r], r); + } + + final double[][] sampleValues = new double[alleleCount][sampleReadCount]; + valuesBySampleIndex[sampleIndex] = sampleValues; + } + + /** + * Create an independent copy of this read-likelihoods collection + */ + public ReadLikelihoods clone() { + + final int sampleCount = samples.length; + final int alleleCount = alleles.length; + + final double[][][] newLikelihoodValues = new double[sampleCount][alleleCount][]; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + for (int a = 0; a < alleleCount; a++) + newLikelihoodValues[s][a] = valuesBySampleIndex[s][a].clone(); + } + + // Finally we create the new read-likelihood + 
return new ReadLikelihoods<>(alleles.clone(), samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + // Internally used constructor. + @SuppressWarnings("unchecked") + private ReadLikelihoods(final A[] alleles, final String[] samples, final Object2IntMap sampleIndex, + final GATKSAMRecord[][] readsBySampleIndex, final Object2IntMap[] readIndex, + final double[][][] values) { + this.samples = samples; + this.alleles = alleles; + this.readsBySampleIndex = readsBySampleIndex; + this.valuesBySampleIndex = values; + this.readIndexBySampleIndex = readIndex; + this.readListBySampleIndex = new List[samples.length]; + + this.sampleIndex = sampleIndex; + + this.alleleIndex = new Object2IntOpenHashMap<>(alleles.length); + for (int i = 0; i < alleles.length; i++) + alleleIndex.put(alleles[i],i); + + if (alleleIndex.size() != alleles.length) + throw new IllegalArgumentException("the allele index is null"); + + referenceAlleleIndex = findReferenceAllele(alleles); + sampleMatrices = (Matrix[]) new Matrix[samples.length]; + } + + + // Search for the reference allele, if not found the index is -1. + private int findReferenceAllele(final A[] alleles) { + for (int i = 0; i < alleles.length; i++) + if (alleles[i].isReference()) + return i; + return -1; + } + + /** + * Checks whether a sample is included likelihood collection. + * + * @param sample the query sample. + * + * @throws IllegalArgumentException if {@code sample} is {@code null}. + * @return {@code true} iff the sample is in the likelihood collection. + */ + @SuppressWarnings("unused") + public boolean containsSample(final String sample) { + return sampleIndex.containsKey(sample); + } + + /** + * Returns the index of a sample within the likelihood collection. + * + * @param sample the query sample. + * + * @throws IllegalArgumentException if {@code sample} is {@code null}. + * @return -1 if the allele is not included, 0 or greater otherwise. 
+ */ + public int sampleIndex(final String sample) { + if (sample == null) + throw new IllegalArgumentException("the input allele cannot be null"); + if (sampleIndex.containsKey(sample)) + return sampleIndex.getInt(sample); + else + return -1; + } + + /** + * Number of samples included in the likelihood collection. + * @return 0 or greater. + */ + public int sampleCount() { + return samples.length; + } + + /** + * Returns sample name given its index. + * + * @param sampleIndex query index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is negative. + * + * @return never {@code null}. + */ + public String sample(final int sampleIndex) { + checkSampleIndex(sampleIndex); + return samples[sampleIndex]; + } + + /** + * Checks whether the allele is within the likelihood collection. + * + * @param allele the query allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return {@code true} iff the allele is in the likelihood collection. + */ + @SuppressWarnings("unused") + public boolean containsAllele(final A allele) { + return alleleIndex.containsKey(allele); + } + + /** + * Returns the index of an allele within the likelihood collection. + * + * @param allele the query allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return -1 if the allele is not included, 0 or greater otherwise. + */ + public int alleleIndex(final A allele) { + if (allele == null) + throw new IllegalArgumentException("the input allele cannot be null"); + if (alleleIndex.containsKey(allele)) + return alleleIndex.getInt(allele); + else + return -1; + } + + /** + * Returns number of alleles in the collection. + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleCount() { + return alleles.length; + } + + /** + * Returns the allele given its index. + * + * @param alleleIndex the allele index. + * + * @throws IllegalArgumentException the allele index is {@code null}. 
+ * + * @return never {@code null}. + */ + public A allele(final int alleleIndex) { + checkAlleleIndex(alleleIndex); + return alleles[alleleIndex]; + } + + /** + * Returns the reads that belong to a sample sorted by their index (within that sample). + * + * @param sampleIndex the requested sample. + * @return never {@code null} but perhaps a zero-length array if there is no reads in sample. No element in + * the array will be null. + */ + public List sampleReads(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final List extantList = readListBySampleIndex[sampleIndex]; + if (extantList == null) + return readListBySampleIndex[sampleIndex] = Collections.unmodifiableList(Arrays.asList(readsBySampleIndex[sampleIndex])); + else + return extantList; + } + + /** + * Returns the likelihood matrix for a sample. + *

+     * The matrix is indexed first by allele and then by read index.
+
+     *

+     *          result[a][r] == lnLk(Read_r | Allele_a)
+     *     
+ *

+ * + *

+     * The matrix is live: changes to it update the likelihoods in the collection, so please use it with care.
+     *

+ * + * @param sampleIndex the sample index. + * + * @return never {@code null}. + */ + /* package */ double[][] sampleValues(final int sampleIndex) { + checkSampleIndex(sampleIndex); + return valuesBySampleIndex[sampleIndex]; + } + + + /** + * Returns a read vs allele likelihood matrix corresponding to a sample. + * + * @param sampleIndex target sample. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not null. + * + * @return never {@code null} + */ + public Matrix
sampleMatrix(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final Matrix extantResult = sampleMatrices[sampleIndex]; + if (extantResult != null) + return extantResult; + else + return sampleMatrices[sampleIndex] = new SampleMatrix(sampleIndex); + } + + /** + * Adjusts likelihoods so that for each read, the best allele likelihood is 0 and caps the minimum likelihood + * of any allele for each read based on the maximum alternative allele likelihood. + * + * @param bestToZero set the best likelihood to 0, others will be subtracted the same amount. + * @param maximumLikelihoodDifferenceCap maximum difference between the best alternative allele likelihood + * and any other likelihood. + * + * @throws IllegalArgumentException if {@code maximumDifferenceWithBestAlternative} is not 0 or less. + */ + public void normalizeLikelihoods(final boolean bestToZero, final double maximumLikelihoodDifferenceCap) { + if (maximumLikelihoodDifferenceCap >= 0.0 || Double.isNaN(maximumLikelihoodDifferenceCap)) + throw new IllegalArgumentException("the minimum reference likelihood fall cannot be positive"); + + if (maximumLikelihoodDifferenceCap == Double.NEGATIVE_INFINITY && !bestToZero) + return; + + final int alleleCount = alleles.length; + if (alleleCount == 0) // trivial case there is no alleles. + return; + else if (alleleCount == 1 && !bestToZero) + return; + + for (int s = 0; s < valuesBySampleIndex.length; s++) { + final double[][] sampleValues = valuesBySampleIndex[s]; + final int readCount = readsBySampleIndex[s].length; + for (int r = 0; r < readCount; r++) + normalizeLikelihoodsPerRead(bestToZero, maximumLikelihoodDifferenceCap, sampleValues, s, r); + } + } + + // Does the normalizeLikelihoods job for each read. 
+ private void normalizeLikelihoodsPerRead(final boolean bestToZero, final double maximumBestAltLikelihoodDifference, + final double[][] sampleValues, final int sampleIndex, final int readIndex) { + + final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false); + + final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference; + + final double referenceLikelihood = referenceAlleleIndex == -1 ? Double.NEGATIVE_INFINITY : + sampleValues[referenceAlleleIndex][readIndex]; + + + final double bestAbsoluteLikelihood = Math.max(bestAlternativeAllele.likelihood,referenceLikelihood); + + if (bestToZero) { + if (bestAbsoluteLikelihood == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] = 0; + else if (worstLikelihoodCap != Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] = (sampleValues[a][readIndex] < worstLikelihoodCap ? worstLikelihoodCap : sampleValues[a][readIndex]) - bestAbsoluteLikelihood; + else + for (int a = 0; a < alleles.length; a++) + sampleValues[a][readIndex] -= bestAbsoluteLikelihood; + } else // else if (maximumReferenceLikelihoodFall != Double.NEGATIVE_INFINITY ) { // + // Guarantee to be the case by enclosing code. + for (int a = 0; a < alleles.length; a++) + if (sampleValues[a][readIndex] < worstLikelihoodCap) + sampleValues[a][readIndex] = worstLikelihoodCap; + } + + /** + * Returns the samples in this read-likelihood collection. + *

+ * Samples are sorted by their index in the collection. + *

+ * + *

+ * The returned list is an unmodifiable view on the read-likelihoods sample list. + *

+ * + * @return never {@code null}. + */ + public List samples() { + if (sampleList == null) + sampleList = Collections.unmodifiableList(Arrays.asList(samples)); + return sampleList; + } + + /** + * Returns the samples in this read-likelihood collection. + *

+     * Alleles are sorted by their index in the collection.
+     *

+ * + *

+     * The returned list is unmodifiable. It will not be updated if the collection
+     * allele list changes.
+     *

+ * + * @return never {@code null}. + */ + public List
alleles() { + if (alleleList == null) + alleleList = Collections.unmodifiableList(Arrays.asList(alleles)); + return alleleList; + } + + /** + * Search the best allele for a read. + * + * @param sampleIndex including sample index. + * @param readIndex target read index. + * + * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} + * if non-could be found. + */ + private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) { + final int alleleCount = alleles.length; + if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) + return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY); + + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1; + + double bestLikelihood = sampleValues[bestAlleleIndex][readIndex]; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (int a = bestAlleleIndex + 1; a < alleleCount; a++) { + if (!canBeReference && referenceAlleleIndex == a) + continue; + final double candidateLikelihood = sampleValues[a][readIndex]; + if (candidateLikelihood > bestLikelihood) { + bestAlleleIndex = a; + secondBestLikelihood = bestLikelihood; + bestLikelihood = candidateLikelihood; + } else if (candidateLikelihood > secondBestLikelihood) { + secondBestLikelihood = candidateLikelihood; + } + } + return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood); + } + + public void changeReads(final Map readRealignments) { + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final Object2IntMap readIndex = readIndexBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + final 
GATKSAMRecord replacement = readRealignments.get(read); + if (replacement == null) + continue; + readIndex.put(sampleReads[r] = replacement,r); + readIndex.remove(read); + } + } + } + + /** + * Likelihood matrix between a set of alleles and reads. + * @param the allele-type. + */ + public interface Matrix { + + /** + * List of reads in the matrix sorted by their index therein. + * @return never {@code null}. + */ + public List reads(); + + /** + * List of alleles in the matrix sorted by their index in the collection. + * @return never {@code null}. + */ + public List alleles(); + + /** + * Set the likelihood of a read given an allele through their indices. + * + * @param alleleIndex the target allele index. + * @param readIndex the target read index. + * @param value new likelihood value for the target read give the target allele. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} + * are not valid allele and read indices respectively. + */ + public void set(final int alleleIndex, final int readIndex, final double value); + + /** + * Returns the likelihood of a read given a haplotype. + * + * @param alleleIndex the index of the given haplotype. + * @param readIndex the index of the target read. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} is not a + * valid allele or read index respectively. + * + * @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set} + * or 0.0 if none was set. + */ + public double get(final int alleleIndex, final int readIndex); + + /** + * Queries the index of an allele in the matrix. + * + * @param allele the target allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return -1 if such allele does not exist, otherwise its index which 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleIndex(final A allele); + + /** + * Queries the index of a read in the matrix. 
+ * + * @param read the target read. + * + * @throws IllegalArgumentException if {@code read} is {@code null}. + * + * @return -1 if there is not such a read in the matrix, otherwise its index + * which is 0 or greater. + */ + @SuppressWarnings("unused") + public int readIndex(final GATKSAMRecord read); + + /** + * Number of allele in the matrix. + * @return never negative. + */ + public int alleleCount(); + + /** + * Number of reads in the matrix. + * @return never negative. + */ + public int readCount(); + + /** + * Returns the allele given its index. + * + * @param alleleIndex the target allele index. + * + * @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index. + * @return never {@code null}. + */ + public A allele(final int alleleIndex); + + /** + * Returns the allele given its index. + * + * @param readIndex the target allele index. + * + * @throws IllegalArgumentException if {@code readIndex} is not a valid read index. + * @return never {@code null}. + */ + public GATKSAMRecord read(final int readIndex); + } + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. + * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. 
+ */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap) { + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.length; + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + // We calculate the marginal likelihoods. + + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, null); + + final int sampleCount = samples.length; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(newAlleles, samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. + * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. 
The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @param overlap if not {@code null}, only reads that overlap the location (with unclipping) will be present in + * the output read-collection. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. + */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap,final GenomeLoc overlap) { + + if (overlap == null) + return marginalize(newToOldAlleleMap); + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.length; + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + final int[][] readsToKeep = overlappingReadIndicesBySampleIndex(overlap); + // We calculate the marginal likelihoods. 
+ + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, readsToKeep); + + final int sampleCount = samples.length; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + final int[] sampleReadsToKeep = readsToKeep[s]; + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[s]; + final int oldSampleReadCount = oldSampleReads.length; + final int newSampleReadCount = sampleReadsToKeep.length; + if (newSampleReadCount == oldSampleReadCount) { + newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(readIndexBySampleIndex[s]); + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + } else { + final Object2IntMap newSampleReadIndex = newReadIndexBySampleIndex[s] = new Object2IntOpenHashMap<>(newSampleReadCount); + final GATKSAMRecord[] newSampleReads = newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; + for (int r = 0; r < newSampleReadCount; r++) + newSampleReadIndex.put(newSampleReads[r] = oldSampleReads[sampleReadsToKeep[r]],r); + } + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(newAlleles, samples, sampleIndex, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + private int[][] overlappingReadIndicesBySampleIndex(final GenomeLoc overlap) { + if (overlap == null) + return null; + final int sampleCount = samples.length; + final int[][] result = new int[sampleCount][]; + final IntArrayList buffer = new IntArrayList(200); + final int referenceIndex = overlap.getContigIndex(); + final int overlapStart = overlap.getStart(); + final int overlapEnd = overlap.getStop(); + for (int s = 0; s < sampleCount; s++) { + buffer.clear(); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = 
sampleReads.length; + buffer.ensureCapacity(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + if (unclippedReadOverlapsRegion(sampleReads[r], referenceIndex, overlapStart, overlapEnd)) + buffer.add(r); + result[s] = buffer.toIntArray(); + } + return result; + } + + public static boolean unclippedReadOverlapsRegion(final GATKSAMRecord read, final GenomeLoc region) { + return unclippedReadOverlapsRegion(read, region.getContigIndex(), region.getStart(), region.getStop()); + } + + private static boolean unclippedReadOverlapsRegion(final GATKSAMRecord sampleRead, final int referenceIndex, final int start, final int end) { + final int readReference = sampleRead.getReferenceIndex(); + if (readReference != referenceIndex) + return false; + + final int readStart = sampleRead.getUnclippedStart(); + if (readStart > end) + return false; + + final int readEnd = sampleRead.getReadUnmappedFlag() ? sampleRead.getUnclippedEnd() + : Math.max(sampleRead.getUnclippedEnd(), sampleRead.getUnclippedStart()); + return readEnd >= start; + } + + + + // Calculate the marginal likelihoods considering the old -> new allele index mapping. + private double[][][] marginalLikelihoods(final int oldAlleleCount, final int newAlleleCount, final int[] oldToNewAlleleIndexMap, final int[][] readsToKeep) { + + final int sampleCount = samples.length; + final double[][][] result = new double[sampleCount][][]; + + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] oldSampleValues = valuesBySampleIndex[s]; + final int[] sampleReadToKeep = readsToKeep == null || readsToKeep[s].length == sampleReadCount ? null : readsToKeep[s]; + final int newSampleReadCount = sampleReadToKeep == null ? sampleReadCount : sampleReadToKeep.length; + final double[][] newSampleValues = result[s] = new double[newAlleleCount][newSampleReadCount]; + // We initiate all likelihoods to -Inf. 
+ for (int a = 0; a < newAlleleCount; a++) + Arrays.fill(newSampleValues[a], Double.NEGATIVE_INFINITY); + // For each old allele and read we update the new table keeping the maximum likelihood. + for (int r = 0; r < newSampleReadCount; r++) { + for (int a = 0; a < oldAlleleCount; a++) { + final int oldReadIndex = newSampleReadCount == sampleReadCount ? r : sampleReadToKeep[r]; + final int newAlleleIndex = oldToNewAlleleIndexMap[a]; + if (newAlleleIndex == -1) + continue; + final double likelihood = oldSampleValues[a][oldReadIndex]; + if (likelihood > newSampleValues[newAlleleIndex][r]) + newSampleValues[newAlleleIndex][r] = likelihood; + } + } + } + return result; + } + + // calculates an old to new allele index map array. + private int[] oldToNewAlleleIndexMap(final Map> newToOldAlleleMap, final B[] newAlleles, + final int oldAlleleCount, final int newAlleleCount) { + + final int[] oldToNewAlleleIndexMap = new int[oldAlleleCount]; + Arrays.fill(oldToNewAlleleIndexMap, -1); // -1 indicate that there is no new allele that make reference to that old one. 
+ + for (int i = 0; i < newAlleleCount; i++) { + final B newAllele = newAlleles[i]; + if (newAllele == null) + throw new IllegalArgumentException("input alleles cannot be null"); + final List oldAlleles = newToOldAlleleMap.get(newAllele); + if (oldAlleles == null) + throw new IllegalArgumentException("no new allele list can be null"); + for (final A oldAllele : oldAlleles) { + if (oldAllele == null) + throw new IllegalArgumentException("old alleles cannot be null"); + final int oldAlleleIndex = alleleIndex(oldAllele); + if (oldAlleleIndex == -1) + throw new IllegalArgumentException("missing old allele " + oldAllele + " in likelihood collection "); + if (oldToNewAlleleIndexMap[oldAlleleIndex] != -1) + throw new IllegalArgumentException("collision: two new alleles make reference to the same old allele"); + oldToNewAlleleIndexMap[oldAlleleIndex] = i; + } + } + return oldToNewAlleleIndexMap; + } + + /** + * Remove those reads that do not overlap certain genomic location. + * + *

+ * This method modifies the current read-likelihoods collection. + *

+ * + * @param location the target location. + * + * @throws IllegalArgumentException the location cannot be {@code null} nor unmapped. + */ + @SuppressWarnings("unused") + public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) { + if (location == null) + throw new IllegalArgumentException("the location cannot be null"); + if (location.isUnmapped()) + throw new IllegalArgumentException("the location cannot be unmapped"); + + final int sampleCount = samples.length; + + final int locContig = location.getContigIndex(); + final int locStart = location.getStart(); + final int locEnd = location.getStop(); + + final Set readsToRemove = new HashSet<>(100); // can we improve this estimate? + for (int s = 0; s < sampleCount; s++) + for (final GATKSAMRecord read : readsBySampleIndex[s]) + if (!unclippedReadOverlapsRegion(read, locContig, locStart, locEnd)) + readsToRemove.add(read); + removeReads(readsToRemove); + } + + // Compare the read coordinates to the location of interest. + private boolean readOverlapsLocation(final String contig, final int locStart, + final int locEnd, final GATKSAMRecord read) { + final boolean overlaps; + + if (read.getReadUnmappedFlag()) + overlaps = false; + else if (!read.getReferenceName().equals(contig)) + overlaps = false; + else { + int alnStart = read.getAlignmentStart(); + int alnStop = read.getAlignmentEnd(); + if (alnStart > alnStop) { // Paranoia? based on GLP.createGenomeLoc(Read) this can happen?. + final int end = alnStart; + alnStart = alnStop; + alnStop = end; + } + overlaps = !(alnStop < locStart || alnStart > locEnd); + } + return overlaps; + } + + /** + * Removes those read that the best possible likelihood given any allele is just too low. + * + *

+ * This is determined by a maximum error per read-base against the best likelihood possible. + *

+ * + * @param maximumErrorPerBase the minimum acceptable error rate per read base, must be + * a positive number. + * + * @throws IllegalStateException is not supported for read-likelihood that do not contain alleles. + * + * @throws IllegalArgumentException if {@code maximumErrorPerBase} is negative. + */ + public void filterPoorlyModeledReads(final double maximumErrorPerBase) { + if (alleles.length == 0) + throw new IllegalStateException("unsupported for read-likelihood collections with no alleles"); + if (Double.isNaN(maximumErrorPerBase) || maximumErrorPerBase <= 0.0) + throw new IllegalArgumentException("the maximum error per base must be a positive number"); + final int sampleCount = samples.length; + + final Set readsToRemove = new HashSet<>(100); // rough estimate, can be improved? + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + if (readIsPoorlyModelled(s,r,read, maximumErrorPerBase)) + readsToRemove.add(read); + } + } + removeReads(readsToRemove); + } + + // Check whether the read is poorly modelled. + protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); + final double log10QualPerBase = -4.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + final int alleleCount = alleles.length; + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + for (int a = 0; a < alleleCount; a++) + if (sampleValues[a][readIndex] >= log10MaxLikelihoodForTrueAllele) + return false; + return true; + } + + + /** + * Add more reads to the collection. + * + * @param readsBySample reads to add. 
+ * @param initialLikelihood the likelihood for the new entries. + * + * @throws IllegalArgumentException if {@code readsBySample} is {@code null} or {@code readsBySample} contains + * {@code null} reads, or {@code readsBySample} contains read that are already present in the read-likelihood + * collection. + */ + public void addReads(final Map> readsBySample, final double initialLikelihood) { + + for (final Map.Entry> entry : readsBySample.entrySet()) { + + final String sample = entry.getKey(); + final List newSampleReads = entry.getValue(); + final int sampleIndex = this.sampleIndex.getInt(sample); + + if (sampleIndex == -1) + throw new IllegalArgumentException("input sample " + sample + + " is not part of the read-likelihoods collection"); + + if (newSampleReads == null || newSampleReads.size() == 0) + continue; + + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final int newSampleReadCount = sampleReadCount + newSampleReads.size(); + + appendReads(newSampleReads, sampleIndex, sampleReadCount, newSampleReadCount); + extendsLikelihoodArrays(initialLikelihood, sampleIndex, sampleReadCount, newSampleReadCount); + } + } + + // Extends the likelihood arrays-matrices. + private void extendsLikelihoodArrays(double initialLikelihood, int sampleIndex, int sampleReadCount, int newSampleReadCount) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + final int alleleCount = alleles.length; + for (int a = 0; a < alleleCount; a++) + sampleValues[a] = Arrays.copyOf(sampleValues[a], newSampleReadCount); + if (initialLikelihood != 0.0) // the default array new value. + for (int a = 0; a < alleleCount; a++) + Arrays.fill(sampleValues[a],sampleReadCount,newSampleReadCount,initialLikelihood); + } + + // Append the new read reference into the structure per-sample. 
+ private void appendReads(final List newSampleReads, final int sampleIndex, + final int sampleReadCount, final int newSampleReadCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex] = + Arrays.copyOf(readsBySampleIndex[sampleIndex], newSampleReadCount); + + int nextReadIndex = sampleReadCount; + final Object2IntMap sampleReadIndex = readIndexBySampleIndex[sampleIndex]; + for (final GATKSAMRecord newRead : newSampleReads) { + if (sampleReadIndex.containsKey(newRead)) // might be worth handle this without exception (ignore the read?) but in practice should never be the case. + throw new IllegalArgumentException("you cannot add reads that are already in read-likelihood collection"); + sampleReads[nextReadIndex] = newRead; + sampleReadIndex.put(newRead,nextReadIndex++); + } + } + + /** + * Adds the non-reference allele to the read-likelihood collection setting each read likelihood to the second + * best found (or best one if only one allele has likelihood). + * + *

Nothing will happen if the read-likelihoods collection already includes the non-ref allele

+ * + *

+ * Implementation note: even when strictly speaking we do not need to demand the calling code to pass + * the reference the non-ref allele, we still demand it in order to lead the + * the calling code to use the right generic type for this likelihoods + * collection {@link Allele}. + *

+ * + * @param nonRefAllele the non-ref allele. + * + * @throws IllegalArgumentException if {@code nonRefAllele} is anything but the designated <NON_REF> + * symbolic allele {@link GATKVariantContextUtils#NON_REF_SYMBOLIC_ALLELE}. + */ + public void addNonReferenceAllele(final A nonRefAllele) { + + if (nonRefAllele == null) + throw new IllegalArgumentException("non-ref allele cannot be null"); + if (!nonRefAllele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) + throw new IllegalArgumentException("the non-ref allele is not valid"); + // Already present? + if (alleleIndex.containsKey(nonRefAllele)) + return; + + final int alleleCount = alleles.length; + final int newAlleleCount = alleleCount + 1; + alleles = Arrays.copyOf(alleles,newAlleleCount); + alleles[alleleCount] = nonRefAllele; + alleleIndex.put(nonRefAllele,alleleCount); + alleleList = null; // remove the cached alleleList. + + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) + addNonReferenceAlleleLikelihoodsPerSample(alleleCount, newAlleleCount, s); + } + + // Updates per-sample structures according to the addition of the NON_REF allele. + private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount]; + Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY); + for (int r = 0; r < sampleReadCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? 
bestAllele.likelihood + : bestAllele.likelihood - bestAllele.confidence; + nonRefAlleleLikelihoods[r] = secondBestLikelihood; + } + } + + /** + * Downsamples reads based on contamination fractions making sure that all alleles are affected proportionally. + * + * @param perSampleDownsamplingFraction contamination sample map where the sample name are the keys and the + * fractions are the values. + * + * @throws IllegalArgumentException if {@code perSampleDownsamplingFraction} is {@code null}. + */ + public void contaminationDownsampling(final Map perSampleDownsamplingFraction) { + + final int sampleCount = samples.length; + final Set readsToRemove = new HashSet<>(100); // blind estimate, can be improved? + for (int s = 0; s < sampleCount; s++) { + final String sample = samples[s]; + final Double fractionDouble = perSampleDownsamplingFraction.get(sample); + if (fractionDouble == null) + continue; + final double fraction = fractionDouble; + if (Double.isNaN(fraction) || fraction <= 0.0) + continue; + if (fraction >= 1.0) + readsToRemove.addAll(Arrays.asList(readsBySampleIndex[s])); + else { + final Map> readsByBestAllelesMap = readsByBestAlleleMap(s); + readsToRemove.addAll(AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(readsByBestAllelesMap, fraction)); + } + } + removeReads(readsToRemove); + } + + /** + * Returns the collection of best allele estimates for the reads based on the read-likelihoods. + * + * @throws IllegalStateException if there is no alleles. + * + * @return never {@code null}, one element per read in the read-likelihoods collection. + */ + public Collection bestAlleles() { + final List result = new ArrayList<>(100); // blind estimate. 
+ final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int readCount = sampleReads.length; + for (int r = 0; r < readCount; r++) + result.add(searchBestAllele(s,r,true)); + } + return result; + } + + /** + * Returns reads stratified by their best allele. + * @param sampleIndex the target sample. + * @return never {@code null}, perhaps empty. + */ + public Map> readsByBestAlleleMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final int alleleCount = alleles.length; + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final Map> result = new HashMap<>(alleleCount); + for (final A allele : alleles) + result.put(allele,new ArrayList(sampleReadCount)); + readsByBestAlleleMap(sampleIndex,result); + return result; + } + + /** + * Returns reads stratified by their best allele. + * @return never {@code null}, perhaps empty. + */ + @SuppressWarnings("unused") + public Map> readsByBestAlleleMap() { + final int alleleCount = alleles.length; + final Map> result = new HashMap<>(alleleCount); + final int totalReadCount = readCount(); + for (final A allele : alleles) + result.put(allele,new ArrayList(totalReadCount)); + final int sampleCount = samples.length; + for (int s = 0; s < sampleCount; s++) + readsByBestAlleleMap(s,result); + return result; + } + + private void readsByBestAlleleMap(final int sampleIndex, final Map> result) { + final GATKSAMRecord[] reads = readsBySampleIndex[sampleIndex]; + final int readCount = reads.length; + + for (int r = 0; r < readCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + if (!bestAllele.isInformative()) + continue; + result.get(bestAllele.allele).add(bestAllele.read); + } + } + + /** + * Returns the index of a read within a sample read-likelihood sub collection. + * @param sampleIndex the sample index. + * @param read the query read. 
+ * @return -1 if there is no such read in that sample, 0 or greater otherwise. + */ + @SuppressWarnings("unused") + public int readIndex(final int sampleIndex, final GATKSAMRecord read) { + final Object2IntMap readIndex = readIndexBySampleIndex[sampleIndex]; + if (readIndex.containsKey(read)) + return readIndexBySampleIndex[sampleIndex].getInt(read); + else + return -1; + } + + /** + * Returns the total number of reads in the read-likelihood collection. + * + * @return never {@code null} + */ + public int readCount() { + int sum = 0; + for (int i = 0; i < samples.length; i++) + sum += readsBySampleIndex[i].length; + return sum; + } + + /** + * Returns the number of reads that belong to a sample in the read-likelihood collection. + * @param sampleIndex the query sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not a valid sample index. + * @return 0 or greater. + */ + public int sampleReadCount(int sampleIndex) { + checkSampleIndex(sampleIndex); + return readsBySampleIndex[sampleIndex].length; + } + + /** + * Contains information about the best allele for a read search result. + */ + public class BestAllele { + public static final double INFORMATIVE_THRESHOLD = 0.2; + + /** + * Null if there is no possible match (no allele?). + */ + public final A allele; + + /** + * The containing sample. + */ + public final String sample; + + /** + * The query read. + */ + public final GATKSAMRecord read; + + /** + * If allele != null, the indicates the likelihood of the read. + */ + public final double likelihood; + + /** + * Confidence that the read actually was generated under that likelihood. + * This is equal to the difference between this and the second best allele match. + */ + public final double confidence; + + private BestAllele(final int sampleIndex, final int readIndex, final int bestAlleleIndex, + final double likelihood, final double secondBestLikelihood) { + allele = bestAlleleIndex == -1 ? 
null : alleles[bestAlleleIndex]; + this.likelihood = likelihood; + sample = samples[sampleIndex]; + read = readsBySampleIndex[sampleIndex][readIndex]; + confidence = likelihood == secondBestLikelihood ? 0 : likelihood - secondBestLikelihood; + } + + public boolean isInformative() { + return confidence > INFORMATIVE_THRESHOLD; + } + } + + + /** + * Remove a set of reads from the read-likelihoods + * @param readsToRemove collection of reads to remove. + */ + @SuppressWarnings("unused") + public void removeReads(final Collection readsToRemove) { + if (readsToRemove == null) + throw new IllegalArgumentException("remove read collection cannot be null"); + if (readsToRemove.isEmpty()) + return; + removeReads(new HashSet<>(readsToRemove)); + } + + // Requires that the collection passed iterator can remove elements, and it can be modified. + private void removeReads(final Set readsToRemove) { + if (readsToRemove.isEmpty()) + return; + final int sampleCount = samples.length; + final int alleleCount = alleles.length; + for (int s = 0; s < sampleCount; s++) { + removeSampleReads(s, readsToRemove, alleleCount); + if (readsToRemove.isEmpty()) break; // each iteration may exhaust the reads to remove. + } + } + + // Requires that the collection passed iterator can remove elements, and it can be modified. + private void removeSampleReads(final int sampleIndex, final Set readsToRemove, final int alleleCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + + final Object2IntMap indexByRead = readIndexBySampleIndex[sampleIndex]; + // Count how many we are going to remove, which ones (indexes) and remove entry from the read-index map. + final boolean[] removeIndex = new boolean[sampleReadCount]; + int removeCount = 0; // captures the number of deletions. + int firstDeleted = sampleReadCount; // captures the first position that was deleted. 
+ + final Iterator readsToRemoveIterator = readsToRemove.iterator(); + while (readsToRemoveIterator.hasNext()) { + final GATKSAMRecord read = readsToRemoveIterator.next(); + if (indexByRead.containsKey(read)) { + final int index = indexByRead.getInt(read); + if (firstDeleted > index) + firstDeleted = index; + removeCount++; + removeIndex[index] = true; + readsToRemoveIterator.remove(); + indexByRead.remove(read); + } + } + + // Nothing to remove we just finish here. + if (removeCount == 0) + return; + + final int newSampleReadCount = sampleReadCount - removeCount; + + // Now we skim out the removed reads from the read array. + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; + final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; + + System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); + Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); + + // Update the indices for the extant reads from the first deletion onwards. + for (int r = firstDeleted; r < newSampleReadCount; r++) { + indexByRead.put(newSampleReads[r], r); + } + + // Then we skim out the likelihoods of the removed reads. + final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; + final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; + for (int a = 0; a < alleleCount; a++) { + System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); + Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); + } + valuesBySampleIndex[sampleIndex] = newSampleValues; + readsBySampleIndex[sampleIndex] = newSampleReads; + readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. + } + + /** + * Transform into a multi-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * @return never {@code null}. 
+ * + */ + @Deprecated + @SuppressWarnings("all") + public Map toPerReadAlleleLikelihoodMap () { + final int sampleCount = samples.length; + final Map result = new HashMap<>(sampleCount); + for (int s = 0; s < sampleCount; s++) + result.put(samples[s],toPerReadAlleleLikelihoodMap(s)); + return result; + } + + /** + * Transform into a single-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * + * @return never {@code null}. + */ + @Deprecated + public PerReadAlleleLikelihoodMap toPerReadAlleleLikelihoodMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); + final int alleleCount = alleles.length; + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + for (int a = 0; a < alleleCount; a++) { + final A allele = alleles[a]; + final double[] readLikelihoods = valuesBySampleIndex[sampleIndex][a]; + for (int r = 0; r < sampleReadCount; r++) + result.add(sampleReads[r], allele, readLikelihoods[r]); + } + return result; + } + + /** + * Implements a likelihood matrix per sample given its index. + */ + private class SampleMatrix implements Matrix
{ + + private final int sampleIndex; + + private SampleMatrix(final int sampleIndex) { + this.sampleIndex = sampleIndex; + } + + @Override + public List reads() { + return sampleReads(sampleIndex); + } + + @Override + public List alleles() { + return ReadLikelihoods.this.alleles(); + } + + @Override + public void set(final int alleleIndex, final int readIndex, final double value) { + valuesBySampleIndex[sampleIndex][alleleIndex][readIndex] = value; + } + + @Override + public double get(final int alleleIndex, final int readIndex) { + return valuesBySampleIndex[sampleIndex][alleleIndex][readIndex]; + } + + @Override + public int alleleIndex(final A allele) { + return ReadLikelihoods.this.alleleIndex(allele); + } + + @Override + public int readIndex(final GATKSAMRecord read) { + return ReadLikelihoods.this.readIndex(sampleIndex,read); + } + + @Override + public int alleleCount() { + return alleles.length; + } + + @Override + public int readCount() { + return readsBySampleIndex[sampleIndex].length; + } + + @Override + public A allele(int alleleIndex) { + return ReadLikelihoods.this.allele(alleleIndex); + } + + @Override + public GATKSAMRecord read(final int readIndex) { + if (readIndex < 0) + throw new IllegalArgumentException("the read-index cannot be negative"); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + if (readIndex >= sampleReads.length) + throw new IllegalArgumentException("the read-index is beyond the read count of the sample"); + return sampleReads[readIndex]; + } + } + + /** + * Checks whether the provide sample index is valid. + *

+ * If not, it throws an exception. + *

+ * @param sampleIndex the target sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is invalid, i.e. outside the range [0,{@link #sampleCount}). + */ + private void checkSampleIndex(final int sampleIndex) { + if (sampleIndex < 0 || sampleIndex >= samples.length) + throw new IllegalArgumentException("invalid sample index: " + sampleIndex); + } + + /** + * Checks whether the provide allele index is valid. + *

+ * If not, it throws an exception. + *

+ * @param alleleIndex the target sample index. + * + * @throws IllegalArgumentException if {@code alleleIndex} is invalid, i.e. outside the range [0,{@link #sampleCount}). + */ + private void checkAlleleIndex(final int alleleIndex) { + if (alleleIndex < 0 || alleleIndex >= alleles.length) + throw new IllegalArgumentException("invalid allele index: " + alleleIndex); + } +} \ No newline at end of file