diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index 3c9da90d2..d8c10145f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -61,6 +61,9 @@ import java.util.List; public class GenotypeCalculationArgumentCollection implements Cloneable{ + + public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles"; + /** * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. * Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site. @@ -122,7 +125,7 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. 
*/ @Advanced - @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) + @Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 6; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java index c247862b5..c0e62c20a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java @@ -54,9 +54,13 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.broadinstitute.gatk.utils.genotyper.SampleList; +import java.util.ArrayList; import java.util.List; +import java.util.Set; /** * Genotyping Likelihoods collection. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/VariantCallContext.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/VariantCallContext.java index d7f36975e..252bec476 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/VariantCallContext.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/VariantCallContext.java @@ -69,7 +69,7 @@ public class VariantCallContext extends VariantContext { // Should this site be emitted? public boolean shouldEmit = true; - VariantCallContext(VariantContext vc, boolean confidentlyCalledP) { + public VariantCallContext(VariantContext vc, boolean confidentlyCalledP) { super(vc); this.confidentlyCalled = confidentlyCalledP; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java index fa91a2f2b..5680e5f15 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java @@ -53,6 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingLikelihoods; +import org.broadinstitute.gatk.utils.SimpleTimer; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.GenotypesContext; @@ -128,6 +131,7 @@ public abstract class AFCalculator implements Cloneable { * @return result (for 
programming convenience) */ public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int defaultPloidy, final int maximumAlternativeAlleles, final double[] log10AlleleFrequencyPriors) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( vc.getNAlleles() == 1 ) throw new IllegalArgumentException("VariantContext has only a single reference allele, but getLog10PNonRef requires at least one at all " + vc); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); @@ -135,6 +139,9 @@ public abstract class AFCalculator implements Cloneable { // reset the result, so we can store our new result there final StateTracker stateTracker = getStateTracker(true,maximumAlternativeAlleles); + //TODO All implementations of the reduce-scope seems to employ a bad criterion to + //TODO decide what alleles to keep. This must be changed eventually. + //TODO issue {@see https://github.com/broadinstitute/gsa-unstable/issues/1376} final VariantContext vcWorking = reduceScope(vc,defaultPloidy, maximumAlternativeAlleles); callTimer.start(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 90fe73575..cad0697da 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -51,19 +51,19 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import com.google.common.annotations.VisibleForTesting; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import 
htsjdk.samtools.util.StringUtil; import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; +import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.AlleleList; import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.EventMap; @@ -82,6 +82,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine 0"}) @Ensures("result != null") // TODO - can this be refactored? this is hard to follow! 
- public CalledHaplotypes assignGenotypeLikelihoods( final List haplotypes, + CalledHaplotypes assignGenotypeLikelihoods( final List haplotypes, final ReadLikelihoods readLikelihoods, final Map> perSampleFilteredReadList, final byte[] ref, @@ -241,9 +242,6 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { @@ -256,39 +254,25 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine it = allelesToDrop.iterator(); + final StringBuilder builder = new StringBuilder(); + for (int i = 0; i < MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG; i++) { + builder.append(it.next().toString()).append(", "); + } + allelesToDropString = builder.append(it.next().toString()).append(" and ").append(allelesToDrop.size() - 20).append(" more").toString(); + } + logger.warn(String.format("location %s: too many alternative alleles found (%d) larger than the maximum requested with -%s (%d), the following will be dropped: %s.", location, + readAlleleLikelihoods.alleleCount() - 1, GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_SHORT_NAME, configuration.genotypeArgs.MAX_ALTERNATE_ALLELES, + allelesToDropString)); + readAlleleLikelihoods.dropAlleles(allelesToDrop); + } + + /** + * Returns the set of alleles that should be dropped in order to bring down the number + * of alternative alleles to the maximum allowed. + * + *

+ * The alleles that are put forward for removal are those with the lowest estimated allele count. + *

+ *

+ * Allele counts are estimated herein as the weighted average count + * across samples and phased genotypes, where the weight is the genotype likelihood -- we apply + * a uniform prior to all genotype configurations. + *

+ *

+ * In case of a tie, which is unlikely for non-trivial likelihoods, we keep the alleles with the lower index. + *

+ * + * @param genotypeLikelihoods target genotype likelihoods. + * @param maxAlternativeAlleles maximum number of alternative alleles allowed. + * @return never {@code null}. + */ + private Set excessAlternativeAlleles(final GenotypingLikelihoods genotypeLikelihoods, final int maxAlternativeAlleles) { + final int alleleCount = genotypeLikelihoods.alleleCount(); + final int excessAlternativeAlleleCount = Math.max(0, alleleCount - 1 - maxAlternativeAlleles); + if (excessAlternativeAlleleCount <= 0) { + return Collections.emptySet(); + } + + final double log10NumberOfAlleles = MathUtils.Log10Cache.get(alleleCount); // log10(Num of Alleles); e.g. log10(2) for diploids. + final double[] log10EstimatedACs = new double[alleleCount]; // where we store the AC estimates. + // Set allele counts to 0 (i.e. exp(-Inf)) at the start. + Arrays.fill(log10EstimatedACs, Double.NEGATIVE_INFINITY); + + for (int i = 0; i < genotypeLikelihoods.sampleCount(); i++) { + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(genotypeLikelihoods.samplePloidy(i), alleleCount); + final int numberOfUnphasedGenotypes = calculator.genotypeCount(); + // unphased genotype log10 likelihoods + final double[] log10Likelihoods = genotypeLikelihoods.sampleLikelihoods(i).getAsVector(); + // total number of phased genotypes for all possible combinations of allele counts. 
+ final double log10NumberOfPhasedGenotypes = calculator.ploidy() * log10NumberOfAlleles; + for (int j = 0; j < numberOfUnphasedGenotypes; j++) { + final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(j); + // given the current unphased genotype, how many phased genotypes there are: + final double log10NumberOfPhasedGenotypesForThisUnphasedGenotype = alleleCounts.log10CombinationCount(); + final double log10GenotypeLikelihood = log10Likelihoods[j]; + for (int k = 0; k < alleleCounts.distinctAlleleCount(); k++) { + final int alleleIndex = alleleCounts.alleleIndexAt(k); + final int alleleCallCount = alleleCounts.alleleCountAt(k); + final double log10AlleleCount = MathUtils.Log10Cache.get(alleleCallCount); + final double log10Weight = log10GenotypeLikelihood + log10NumberOfPhasedGenotypesForThisUnphasedGenotype + - log10NumberOfPhasedGenotypes; + // update the allele AC adding the contribution of this unphased genotype at this sample. + log10EstimatedACs[alleleIndex] = MathUtils.log10sumLog10(log10EstimatedACs[alleleIndex], + log10Weight + log10AlleleCount); + } + } + } + + final PriorityQueue lessFrequentFirst = new PriorityQueue<>(alleleCount, new Comparator() { + @Override + public int compare(final Allele a1, final Allele a2) { + final int index1 = genotypeLikelihoods.alleleIndex(a1); + final int index2 = genotypeLikelihoods.alleleIndex(a2); + final double freq1 = log10EstimatedACs[index1]; + final double freq2 = log10EstimatedACs[index2]; + if (freq1 != freq2) { + return Double.compare(freq1, freq2); + } else { + return Integer.compare(index2, index1); + } + } + }); + + for (int i = 1; i < alleleCount; i++) { + lessFrequentFirst.add(genotypeLikelihoods.alleleAt(i)); + } + + final Set result = new HashSet<>(excessAlternativeAlleleCount); + for (int i = 0; i < excessAlternativeAlleleCount; i++) { + result.add(lessFrequentFirst.remove()); + } + return result; + } + /** * Tries to phase the individual alleles based on pairwise comparisons to 
the other alleles based on all called haplotypes * @@ -325,13 +451,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> constructHaplotypeMapping(final List originalCalls, + @VisibleForTesting + static Map> constructHaplotypeMapping(final List originalCalls, final Set calledHaplotypes) { final Map> haplotypeMap = new HashMap<>(originalCalls.size()); for ( final VariantContext call : originalCalls ) { // don't try to phase if there is not exactly 1 alternate allele if ( ! isBiallelic(call) ) { - haplotypeMap.put(call, Collections.emptySet()); + haplotypeMap.put(call, Collections.emptySet()); continue; } @@ -362,10 +489,11 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine originalCalls, - final Map> haplotypeMap, - final int totalAvailableHaplotypes, - final Map> phaseSetMapping) { + @VisibleForTesting + static int constructPhaseSetMapping(final List originalCalls, + final Map> haplotypeMap, + final int totalAvailableHaplotypes, + final Map> phaseSetMapping) { final int numCalls = originalCalls.size(); int uniqueCounter = 0; @@ -457,9 +585,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine constructPhaseGroups(final List originalCalls, - final Map> phaseSetMapping, - final int indexTo) { + @VisibleForTesting + static List constructPhaseGroups(final List originalCalls, + final Map> phaseSetMapping, + final int indexTo) { final List phasedCalls = new ArrayList<>(originalCalls); // if we managed to find any phased groups, update the VariantContexts @@ -561,6 +690,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine(new HashSet<>(call.getAlleles()))); + } + // Skim the filtered map based on the location so that we do not add filtered read that are going to be removed // right after a few lines of code bellow. 
final Map> overlappingFilteredReads = overlappingFilteredReads(perSampleFilteredReadList, loc); @@ -687,15 +820,12 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readLikelihoods, final VariantContext mergedVC, final List noCallAlleles ) { - final List vcAlleles = mergedVC.getAlleles(); - final AlleleList alleleList = readLikelihoods.alleleCount() == vcAlleles.size() ? readLikelihoods : new IndexedAlleleList<>(vcAlleles); - final GenotypingLikelihoods likelihoods = genotypingModel.calculateLikelihoods(alleleList,new GenotypingData<>(ploidyModel,readLikelihoods)); + private GenotypesContext calculateGLsForThisEvent(final ReadLikelihoods readLikelihoods, final List noCallAlleles) { + final GenotypingLikelihoods likelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel, readLikelihoods)); final int sampleCount = samples.sampleCount(); final GenotypesContext result = GenotypesContext.create(sampleCount); for (int s = 0; s < sampleCount; s++) @@ -709,7 +839,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine haplotypes ) { + private static void cleanUpSymbolicUnassembledEvents(final List haplotypes) { final List haplotypesToRemove = new ArrayList<>(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().getVariantContexts() ) { @@ -742,9 +872,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); final Event refEvent = new Event(null); - eventMapper.put(refEvent, new ArrayList()); + eventMapper.put(refEvent, new ArrayList<>()); for( final VariantContext vc : eventsAtThisLoc ) { - eventMapper.put(new Event(vc), new ArrayList()); + eventMapper.put(new Event(vc), new ArrayList<>()); } for( final Haplotype h : haplotypes ) { @@ -764,11 +894,12 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine generateVCsFromAlignment( final Haplotype haplotype, 
final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) { + @VisibleForTesting + static Map generateVCsFromAlignment(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { return new EventMap(haplotype, ref, refLoc, sourceNameToAdd); } - protected static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { + private static boolean containsVCWithMatchingAlleles( final List list, final VariantContext vcToTest ) { for( final VariantContext vc : list ) { if( vc.hasSameAllelesAs(vcToTest) ) { return true; @@ -780,7 +911,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine implements SampleList, AlleleList */ private int referenceAlleleIndex = -1; + /** + * Index of the non-ref allele if any, otherwise - 1. + */ + private int nonRefAlleleIndex = -1; + /** * Caches the read-list per sample list returned by {@link #sampleReads} */ @@ -138,6 +145,7 @@ public class ReadLikelihoods implements SampleList, AlleleList readListBySampleIndex = new List[sampleCount]; valuesBySampleIndex = new double[sampleCount][][]; referenceAlleleIndex = findReferenceAllele(alleles); + nonRefAlleleIndex = findNonRefAllele(alleles); readIndexBySampleIndex = new Object2IntMap[sampleCount]; @@ -219,6 +227,15 @@ public class ReadLikelihoods implements SampleList, AlleleList return -1; } + private int findNonRefAllele(final AlleleList alleles) { + final int alleleCount = alleles.alleleCount(); + for (int i = alleleCount - 1; i >= 0; i--) { + if (alleles.alleleAt(i).equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) + return i; + } + return - 1; + } + /** * Returns the index of a sample within the likelihood collection. * @@ -419,16 +436,21 @@ public class ReadLikelihoods implements SampleList, AlleleList } - /** - * Search the best allele for a read. - * - * @param sampleIndex including sample index. - * @param readIndex target read index. 
- * - * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} - * if non-could be found. - */ private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) { + return searchBestAllele(sampleIndex, readIndex, canBeReference, alleles); + } + + /** + * Search the best allele for a read. + * + * @param sampleIndex including sample index. + * @param readIndex target read index. + * + * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} + * if non-could be found. + */ + private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference, + final AlleleList allelesToConsider) { final int alleleCount = alleles.alleleCount(); if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY); @@ -441,6 +463,10 @@ public class ReadLikelihoods implements SampleList, AlleleList for (int a = bestAlleleIndex + 1; a < alleleCount; a++) { if (!canBeReference && referenceAlleleIndex == a) continue; + if (nonRefAlleleIndex == a) + continue; + if (allelesToConsider.alleleIndex(alleles.alleleAt(a)) < 0) + continue; final double candidateLikelihood = sampleValues[a][readIndex]; if (candidateLikelihood > bestLikelihood) { bestAlleleIndex = a; @@ -501,6 +527,7 @@ public class ReadLikelihoods implements SampleList, AlleleList alleleList = null; int referenceIndex = this.referenceAlleleIndex; + int nonRefIndex = this.nonRefAlleleIndex; @SuppressWarnings("unchecked") final A[] newAlleles = (A[]) new Allele[newAlleleCount]; for (int a = 0; a < oldAlleleCount; a++) @@ -511,19 +538,20 @@ public class ReadLikelihoods implements SampleList, AlleleList if (referenceIndex != -1) throw new IllegalArgumentException("there cannot be more than one reference allele"); referenceIndex = newIndex; + } else if 
(allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) { + if (nonRefAlleleIndex != -1) + throw new IllegalArgumentException(String.format("there cannot be more than one %s allele", GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME)); + nonRefIndex = newIndex; } newAlleles[newIndex++] = allele; } alleles = new IndexedAlleleList<>(newAlleles); - if (referenceIndex != -1) - referenceAlleleIndex = referenceIndex; - final int sampleCount = samples.sampleCount(); for (int s = 0; s < sampleCount; s++) { final int sampleReadCount = readsBySampleIndex[s].length; - final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount); + final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s], newAlleleCount); for (int a = oldAlleleCount; a < newAlleleCount; a++) { newValuesBySampleIndex[a] = new double[sampleReadCount]; if (defaultLikelihood != 0.0) @@ -531,8 +559,86 @@ public class ReadLikelihoods implements SampleList, AlleleList } valuesBySampleIndex[s] = newValuesBySampleIndex; } + + if (referenceIndex != -1) + referenceAlleleIndex = referenceIndex; + if (nonRefIndex != -1) { + nonRefAlleleIndex = nonRefIndex; + updateNonRefAlleleLikelihoods(); + } } + /** + * Modify this likelihood collection dropping some of its alleles. + * @param allelesToDrop set of alleles to be dropped. + * @throws IllegalArgumentException if {@code allelesToDrop} is {@code null} or contain elements that are + * not alleles in this collection. 
+ */ + public void dropAlleles(final Set allelesToDrop) { + if (allelesToDrop == null) { + throw new IllegalArgumentException("the input allele to drop set cannot be null"); + } + if (allelesToDrop.isEmpty()) { + return; + } + final boolean[] indicesToDrop = new boolean[alleles.alleleCount()]; + for (final A allele : allelesToDrop) { + final int index = alleles.alleleIndex(allele); + if (index < 0) { + throw new IllegalArgumentException("unknown allele: " + allele); + } + indicesToDrop[index] = true; + } + + @SuppressWarnings("unchecked") + final A[] newAlleles = (A[]) new Allele[alleles.alleleCount() - allelesToDrop.size()]; + final int[] newAlleleIndices = new int[newAlleles.length]; + int nextIndex = 0; + for (int i = 0; i < alleles.alleleCount(); i++) { + if (indicesToDrop[i]) { + continue; + } + newAlleleIndices[nextIndex] = i; + newAlleles[nextIndex++] = alleles.alleleAt(i); + } + for (int i = 0; i < samples.sampleCount(); i++) { + final double[][] oldSampleValues = valuesBySampleIndex[i]; + final double[][] newSampleValues = new double[newAlleles.length][]; + for (int j = 0; j < newAlleles.length; j++) { + newSampleValues[j] = oldSampleValues[newAlleleIndices[j]]; + } + valuesBySampleIndex[i] = newSampleValues; + } + alleleList = Collections.unmodifiableList(Arrays.asList(newAlleles)); + alleles = new IndexedAlleleList<>(alleleList); + if (nonRefAlleleIndex >= 0) { + nonRefAlleleIndex = findNonRefAllele(alleles); + updateNonRefAlleleLikelihoods(); + } + if (referenceAlleleIndex >= 0) + referenceAlleleIndex = findReferenceAllele(alleles); + } + + /** + * Drop all the alleles not present in the subset passed as a parameter. + * @param subset the alleles to retain. + * @throws IllegalArgumentException if {@code alleles} is {@code null} or contain alleles unknown to this likelihood + * collection. 
+ */ + public void retainAlleles(final Set subset) { + if (alleles == null) { + throw new IllegalArgumentException("the retain subset must not be null"); + } else if (!alleles().containsAll(subset) || subset.size() > alleleCount()) { + throw new IllegalArgumentException("some of the alleles to retain are not present in the read-likelihoods collection"); + } else if (subset.isEmpty() || (subset.size() == 1 && subset.contains(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))) { + throw new IllegalArgumentException("there must be at least one allele to retain"); + } else { + final Set allelesToDrop = new HashSet<>(alleles()); + allelesToDrop.removeAll(subset); + dropAlleles(allelesToDrop); + } + } + /** * Likelihood matrix between a set of alleles and reads. * @param the allele-type. @@ -543,13 +649,13 @@ public class ReadLikelihoods implements SampleList, AlleleList * List of reads in the matrix sorted by their index therein. * @return never {@code null}. */ - public List reads(); + List reads(); /** * List of alleles in the matrix sorted by their index in the collection. * @return never {@code null}. */ - public List alleles(); + List alleles(); /** * Set the likelihood of a read given an allele through their indices. @@ -561,7 +667,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} * are not valid allele and read indices respectively. */ - public void set(final int alleleIndex, final int readIndex, final double value); + void set(final int alleleIndex, final int readIndex, final double value); /** * Returns the likelihood of a read given a haplotype. @@ -575,7 +681,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set} * or 0.0 if none was set. 
*/ - public double get(final int alleleIndex, final int readIndex); + double get(final int alleleIndex, final int readIndex); /** * Queries the index of an allele in the matrix. @@ -586,7 +692,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @return -1 if such allele does not exist, otherwise its index which 0 or greater. */ @SuppressWarnings("unused") - public int alleleIndex(final A allele); + int alleleIndex(final A allele); /** * Queries the index of a read in the matrix. @@ -599,19 +705,19 @@ public class ReadLikelihoods implements SampleList, AlleleList * which is 0 or greater. */ @SuppressWarnings("unused") - public int readIndex(final GATKSAMRecord read); + int readIndex(final GATKSAMRecord read); /** * Number of allele in the matrix. * @return never negative. */ - public int alleleCount(); + int alleleCount(); /** * Number of reads in the matrix. * @return never negative. */ - public int readCount(); + int readCount(); /** * Returns the allele given its index. @@ -621,7 +727,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index. * @return never {@code null}. */ - public A alleleAt(final int alleleIndex); + A alleleAt(final int alleleIndex); /** * Returns the allele given its index. @@ -631,7 +737,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @throws IllegalArgumentException if {@code readIndex} is not a valid read index. * @return never {@code null}. */ - public GATKSAMRecord readAt(final int readIndex); + GATKSAMRecord readAt(final int readIndex); /** @@ -640,7 +746,7 @@ public class ReadLikelihoods implements SampleList, AlleleList * @param dest the destination array. 
* @param offset the copy offset within the destination allele */ - public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset); + void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset); } /** @@ -905,7 +1011,6 @@ public class ReadLikelihoods implements SampleList, AlleleList * * @throws IllegalArgumentException the location cannot be {@code null} nor unmapped. */ - @SuppressWarnings("unused") public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) { if (location == null) throw new IllegalArgumentException("the location cannot be null"); @@ -921,7 +1026,6 @@ public class ReadLikelihoods implements SampleList, AlleleList final int alleleCount = alleles.alleleCount(); final IntArrayList removeIndices = new IntArrayList(10); for (int s = 0; s < sampleCount; s++) { - int readRemoveCount = 0; final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; final int sampleReadCount = sampleReads.length; for (int r = 0; r < sampleReadCount; r++) @@ -932,28 +1036,6 @@ public class ReadLikelihoods implements SampleList, AlleleList } } - // Compare the read coordinates to the location of interest. - private boolean readOverlapsLocation(final String contig, final int locStart, - final int locEnd, final GATKSAMRecord read) { - final boolean overlaps; - - if (read.getReadUnmappedFlag()) - overlaps = false; - else if (!read.getReferenceName().equals(contig)) - overlaps = false; - else { - int alnStart = read.getAlignmentStart(); - int alnStop = read.getAlignmentEnd(); - if (alnStart > alnStop) { // Paranoia? based on GLP.createGenomeLoc(Read) this can happen?. - final int end = alnStart; - alnStart = alnStop; - alnStop = end; - } - overlaps = !(alnStop < locStart || alnStart > locEnd); - } - return overlaps; - } - /** * Removes those read that the best possible likelihood given any allele is just too low. 
* @@ -991,7 +1073,7 @@ public class ReadLikelihoods implements SampleList, AlleleList } // Check whether the read is poorly modelled. - protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { + private boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); final double log10QualPerBase = -4.0; final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; @@ -1089,37 +1171,25 @@ public class ReadLikelihoods implements SampleList, AlleleList throw new IllegalArgumentException("non-ref allele cannot be null"); if (!nonRefAllele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) throw new IllegalArgumentException("the non-ref allele is not valid"); - // Already present? - if (alleles.alleleIndex(nonRefAllele) != -1) - return; - - final int oldAlleleCount = alleles.alleleCount(); - final int newAlleleCount = oldAlleleCount + 1; - @SuppressWarnings("unchecked") - final A[] newAlleles = (A[]) new Allele[newAlleleCount]; - for (int a = 0; a < oldAlleleCount; a++) - newAlleles[a] = alleles.alleleAt(a); - newAlleles[oldAlleleCount] = nonRefAllele; - alleles = new IndexedAlleleList<>(newAlleles); - alleleList = null; // remove the cached alleleList. - - final int sampleCount = samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) - addNonReferenceAlleleLikelihoodsPerSample(oldAlleleCount, newAlleleCount, s); + addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY); + updateNonRefAlleleLikelihoods(); } - // Updates per-sample structures according to the addition of the NON_REF allele. 
- private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) { - final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount); - final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + public void updateNonRefAlleleLikelihoods() { + updateNonRefAlleleLikelihoods(alleles); + } - final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount]; - Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY); - for (int r = 0; r < sampleReadCount; r++) { - final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); - final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood - : bestAllele.likelihood - bestAllele.confidence; - nonRefAlleleLikelihoods[r] = secondBestLikelihood; + public void updateNonRefAlleleLikelihoods(final AlleleList allelesToConsider) { + if (nonRefAlleleIndex < 0) + return; + for (int s = 0; s < samples.sampleCount(); s++) { + final double[][] sampleValues = valuesBySampleIndex[s]; + for (int r = 0; r < sampleValues[0].length; r++) { + final BestAllele bestAllele = searchBestAllele(s, r, true, allelesToConsider); + final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood + : bestAllele.likelihood - bestAllele.confidence; + sampleValues[nonRefAlleleIndex][r] = secondBestLikelihood; + } } } @@ -1170,9 +1240,9 @@ public class ReadLikelihoods implements SampleList, AlleleList * * @return never {@code null}. */ - public static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final AlleleList alleleList, final Map map) { + @VisibleForTesting + static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final AlleleList alleleList, final Map map) { - //TODO add test code for this method. 
// First we need to create the read-likelihood collection with all required alleles, samples and reads. final SampleList sampleList = new IndexedSampleList(map.keySet()); final int alleleCount = alleleList.alleleCount(); @@ -1228,7 +1298,8 @@ public class ReadLikelihoods implements SampleList, AlleleList * @param sampleIndex the target sample. * @return never {@code null}, perhaps empty. */ - public Map> readsByBestAlleleMap(final int sampleIndex) { + @VisibleForTesting + Map> readsByBestAlleleMap(final int sampleIndex) { checkSampleIndex(sampleIndex); final int alleleCount = alleles.alleleCount(); final int sampleReadCount = readsBySampleIndex[sampleIndex].length;