Fix for the sum(AD) > DP bug.

Closes issue #1340
This commit is contained in:
Valentin Ruano Rubio 2016-04-25 01:15:46 -04:00
parent 25fa25b618
commit 9d32dec9cd
9 changed files with 361 additions and 144 deletions

View File

@ -61,6 +61,9 @@ import java.util.List;
public class GenotypeCalculationArgumentCollection implements Cloneable{
public static final String MAX_ALTERNATE_ALLELES_SHORT_NAME = "maxAltAlleles";
/**
* Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping.
* Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site.
@ -122,7 +125,7 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
@Argument(fullName = "max_alternate_alleles", shortName = MAX_ALTERNATE_ALLELES_SHORT_NAME, doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**

View File

@ -54,9 +54,13 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils;
import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
* Genotyping Likelihoods collection.

View File

@ -69,7 +69,7 @@ public class VariantCallContext extends VariantContext {
// Should this site be emitted?
public boolean shouldEmit = true;
VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
public VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
super(vc);
this.confidentlyCalled = confidentlyCalledP;
}

View File

@ -53,6 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingLikelihoods;
import org.broadinstitute.gatk.utils.SimpleTimer;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
@ -128,6 +131,7 @@ public abstract class AFCalculator implements Cloneable {
* @return result (for programming convenience)
*/
public AFCalculationResult getLog10PNonRef(final VariantContext vc, final int defaultPloidy, final int maximumAlternativeAlleles, final double[] log10AlleleFrequencyPriors) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( vc.getNAlleles() == 1 ) throw new IllegalArgumentException("VariantContext has only a single reference allele, but getLog10PNonRef requires at least one at all " + vc);
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
@ -135,6 +139,9 @@ public abstract class AFCalculator implements Cloneable {
// reset the result, so we can store our new result there
final StateTracker stateTracker = getStateTracker(true,maximumAlternativeAlleles);
//TODO All implementations of the reduce-scope seems to employ a bad criterion to
//TODO decide what alleles to keep. This must be changed eventually.
//TODO issue {@see https://github.com/broadinstitute/gsa-unstable/issues/1376}
final VariantContext vcWorking = reduceScope(vc,defaultPloidy, maximumAlternativeAlleles);
callTimer.start();

View File

@ -51,19 +51,19 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import com.google.common.annotations.VisibleForTesting;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.samtools.util.StringUtil;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.AlleleList;
import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.genotyper.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.EventMap;
@ -82,6 +82,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
protected static final int ALLELE_EXTENSION = 2;
private static final String phase01 = "0|1";
private static final String phase10 = "1|0";
private static final int MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG = 20;
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -188,7 +189,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
@Ensures("result != null")
// TODO - can this be refactored? this is hard to follow!
public CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref,
@ -241,9 +242,6 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP()
? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL;
if (emitReferenceConfidence)
mergedVC = addNonRefSymbolicAllele(mergedVC);
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) {
@ -256,39 +254,25 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
final ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper, genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC), ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
final boolean someAllelesWereDropped = configuration.genotypeArgs.MAX_ALTERNATE_ALLELES < readAlleleLikelihoods.alleleCount() - 1;
if (emitReferenceConfidence)
if (someAllelesWereDropped) {
reduceNumberOfAlternativeAllelesBasedOnLikelihoods(readAlleleLikelihoods, genomeLocParser.createGenomeLoc(mergedVC));
}
if (emitReferenceConfidence) {
mergedVC = addNonRefSymbolicAllele(mergedVC);
readAlleleLikelihoods.addNonReferenceAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
}
final GenotypesContext genotypes = calculateGLsForThisEvent( readAlleleLikelihoods, mergedVC, noCallAlleles );
final VariantContext call = calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel);
if( call != null ) {
readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC.getChr(), mergedVC.getStart(), mergedVC.getEnd()), refLoc, ref);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker,readAlleleLikelihoods, call, emitReferenceConfidence);
if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
if (haplotypeList == null) continue;
calledHaplotypes.addAll(haplotypeList);
}
if ( !emitReferenceConfidence ) {
// set GTs to no-call when GQ is 0 in normal mode
annotatedCall = clearUnconfidentGenotypeCalls(annotatedCall);
}
final GenotypesContext genotypes = calculateGLsForThisEvent(readAlleleLikelihoods, noCallAlleles );
final VariantContext call = calculateGenotypes(new VariantContextBuilder(mergedVC).alleles(readAlleleLikelihoods.alleles()).genotypes(genotypes).make(), calculationModel);
if ( call != null ) {
final VariantContext annotatedCall = annotateCall(readLikelihoods, perSampleFilteredReadList, ref, refLoc, genomeLocParser, tracker, emitReferenceConfidence, calledHaplotypes, mergedVC, alleleMapper, readAlleleLikelihoods, someAllelesWereDropped, call);
returnCalls.add( annotatedCall );
}
}
@ -298,6 +282,148 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
return new CalledHaplotypes(phasedCalls, calledHaplotypes);
}
/**
 * Annotates a genotyped call and records which haplotypes support its called alleles.
 * <p>
 * Prepares a read-likelihoods collection suitable for annotation, runs the annotation engine
 * over the call, reverse-trims alleles when any alternative allele was dropped (either before
 * this call, per {@code someAlternativeAllelesWereAlreadyDropped}, or by genotyping itself),
 * and adds the haplotypes backing every called allele into {@code calledHaplotypes}.
 * </p>
 *
 * @param readLikelihoods per-haplotype read likelihoods used to build the annotation likelihoods.
 * @param perSampleFilteredReadList filtered reads per sample, considered for annotation purposes.
 * @param ref reference bases spanning {@code refLoc}.
 * @param refLoc location of the reference window.
 * @param genomeLocParser parser used to construct genome locations.
 * @param tracker reference meta-data for the annotation engine.
 * @param emitReferenceConfidence whether we are in GVCF/reference-confidence mode.
 * @param calledHaplotypes output set; haplotypes supporting called alleles are ADDED here (side effect).
 * @param mergedVC the merged variant context the call was genotyped from.
 * @param alleleMapper maps each allele to the haplotypes that carry it.
 * @param readAlleleLikelihoods per-allele read likelihoods used for genotyping.
 * @param someAlternativeAllelesWereAlreadyDropped true if alleles were already dropped upstream
 *        (forces reverse-trimming even when the call kept all remaining alleles).
 * @param call the genotyped call to annotate; must not be {@code null}.
 * @return the annotated (and possibly reverse-trimmed) call; in non-GVCF mode, genotypes with
 *         GQ 0 are additionally cleared to no-calls.
 */
private VariantContext annotateCall(final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref, final GenomeLoc refLoc, final GenomeLocParser genomeLocParser,
final RefMetaDataTracker tracker,
final boolean emitReferenceConfidence,
final Set<Haplotype> calledHaplotypes, final VariantContext mergedVC,
final Map<Allele, List<Haplotype>> alleleMapper,
final ReadLikelihoods<Allele> readAlleleLikelihoods,
final boolean someAlternativeAllelesWereAlreadyDropped,
final VariantContext call) {
// Capture the allele count before annotation-prep so we can detect whether genotyping dropped any.
final int initialAlleleNumber = readAlleleLikelihoods.alleleCount();
final ReadLikelihoods<Allele> readAlleleLikelihoodsForAnnotation = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
ReferenceContext referenceContext = new ReferenceContext(genomeLocParser, genomeLocParser.createGenomeLoc(mergedVC), refLoc, ref);
// Alleles may have been removed by the genotyping step itself (e.g. uncalled alternatives).
final boolean someAlternativeAllelesWereDropped = call.getAlleles().size() != initialAlleleNumber;
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(referenceContext, tracker,readAlleleLikelihoodsForAnnotation, call, emitReferenceConfidence);
// If any allele went away (here or upstream) the remaining alleles may share a common suffix; trim it.
if (someAlternativeAllelesWereDropped || someAlternativeAllelesWereAlreadyDropped)
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
// maintain the set of all called haplotypes
for ( final Allele calledAllele : call.getAlleles() ) {
final List<Haplotype> haplotypeList = alleleMapper.get(calledAllele);
if (haplotypeList == null) continue;
calledHaplotypes.addAll(haplotypeList);
}
// In normal (non reference-confidence) mode, GQ-0 genotypes are set to no-call.
return !emitReferenceConfidence ? clearUnconfidentGenotypeCalls(annotatedCall) : annotatedCall;
}
/**
 * Reduce the number of alternative alleles in a read-likelihoods collection to the
 * maximum-alt-allele user parameter value.
 * <p>
 * We always keep the reference allele.
 * As for the other alleles we keep the ones with the highest AF estimated as
 * described in {@link #excessAlternativeAlleles(GenotypingLikelihoods, int)}
 * </p>
 * @param readAlleleLikelihoods the target read-likelihood collection; modified in place.
 * @param location location of the event; used only for the warning log message.
 */
private void reduceNumberOfAlternativeAllelesBasedOnLikelihoods(final ReadLikelihoods<Allele> readAlleleLikelihoods, final GenomeLoc location) {
final GenotypingLikelihoods<Allele> genotypeLikelihoods = genotypingModel.calculateLikelihoods(readAlleleLikelihoods, new GenotypingData<>(ploidyModel,readAlleleLikelihoods));
final Set<Allele> allelesToDrop = excessAlternativeAlleles(genotypeLikelihoods, configuration.genotypeArgs.MAX_ALTERNATE_ALLELES);
final String allelesToDropString;
// Use <= so that a set of exactly MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG alleles is listed in full;
// the previous < comparison sent that case into the truncating branch, whose extra it.next()
// call would then throw NoSuchElementException on the exhausted iterator.
if (allelesToDrop.size() <= MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG) {
allelesToDropString = StringUtil.join(", ", allelesToDrop);
} else {
// List exactly MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG alleles and summarize the remainder,
// so the listed count plus the "N more" count adds up to allelesToDrop.size().
final Iterator<Allele> it = allelesToDrop.iterator();
final StringBuilder builder = new StringBuilder();
for (int i = 0; i < MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG - 1; i++) {
builder.append(it.next().toString()).append(", ");
}
allelesToDropString = builder.append(it.next().toString()).append(" and ")
.append(allelesToDrop.size() - MAX_DROPPED_ALTERNATIVE_ALLELES_TO_LOG).append(" more").toString();
}
logger.warn(String.format("location %s: too many alternative alleles found (%d) larger than the maximum requested with -%s (%d), the following will be dropped: %s.", location,
readAlleleLikelihoods.alleleCount() - 1, GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_SHORT_NAME, configuration.genotypeArgs.MAX_ALTERNATE_ALLELES,
allelesToDropString));
readAlleleLikelihoods.dropAlleles(allelesToDrop);
}
/**
* Returns the set of alleles that should be dropped in order to bring down the number
* of alternative alleles to the maximum allowed.
*
* <p>
* The alleles that are put forward for removal are those with the lowest estimated allele count.
* </p>
* <p>
* Allele counts are estimated herein as the weighted average count
* across samples and phased genotypes, where the weight is the genotype likelihood -- we apply
* a uniform prior to all genotype configurations.
* </p>
* <p>
* In case of a tie, unlikely for non-trivial likelihoods, we keep the alleles with the lower index.
* </p>
*
* @param genotypeLikelihoods target genotype likelihoods.
* @param maxAlternativeAlleles maximum number of alternative alleles allowed.
* @return never {@code null}.
*/
private Set<Allele> excessAlternativeAlleles(final GenotypingLikelihoods<Allele> genotypeLikelihoods, final int maxAlternativeAlleles) {
final int alleleCount = genotypeLikelihoods.alleleCount();
// Number of alternative alleles beyond the maximum; alleleCount includes the reference, hence the -1.
final int excessAlternativeAlleleCount = Math.max(0, alleleCount - 1 - maxAlternativeAlleles);
if (excessAlternativeAlleleCount <= 0) {
// Nothing to drop: the site already fits within the maximum.
return Collections.emptySet();
}
final double log10NumberOfAlleles = MathUtils.Log10Cache.get(alleleCount); // log10(Num of Alleles) at this site.
final double[] log10EstimatedACs = new double[alleleCount]; // where we store the AC estimates.
// Set allele counts to 0 (i.e. exp(-Inf)) at the start.
Arrays.fill(log10EstimatedACs, Double.NEGATIVE_INFINITY);
// Accumulate, over every sample and every unphased genotype, each allele's expected call count
// weighted by the genotype likelihood (uniform prior over phased genotype configurations).
for (int i = 0; i < genotypeLikelihoods.sampleCount(); i++) {
final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(genotypeLikelihoods.samplePloidy(i), alleleCount);
final int numberOfUnphasedGenotypes = calculator.genotypeCount();
// unphased genotype log10 likelihoods
final double[] log10Likelihoods = genotypeLikelihoods.sampleLikelihoods(i).getAsVector();
// total number of phased genotypes for all possible combinations of allele counts.
// log10(alleleCount ^ ploidy) == ploidy * log10(alleleCount).
final double log10NumberOfPhasedGenotypes = calculator.ploidy() * log10NumberOfAlleles;
for (int j = 0; j < numberOfUnphasedGenotypes; j++) {
final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(j);
// given the current unphased genotype, how many phased genotypes there are:
final double log10NumberOfPhasedGenotypesForThisUnphasedGenotype = alleleCounts.log10CombinationCount();
final double log10GenotypeLikelihood = log10Likelihoods[j];
for (int k = 0; k < alleleCounts.distinctAlleleCount(); k++) {
final int alleleIndex = alleleCounts.alleleIndexAt(k);
final int alleleCallCount = alleleCounts.alleleCountAt(k);
final double log10AlleleCount = MathUtils.Log10Cache.get(alleleCallCount);
// Weight = likelihood * (phased configurations of this genotype / all phased configurations),
// i.e. the uniform-prior posterior mass of this unphased genotype, all in log10 space.
final double log10Weight = log10GenotypeLikelihood + log10NumberOfPhasedGenotypesForThisUnphasedGenotype
- log10NumberOfPhasedGenotypes;
// update the allele AC adding the contribution of this unphased genotype at this sample.
log10EstimatedACs[alleleIndex] = MathUtils.log10sumLog10(log10EstimatedACs[alleleIndex],
log10Weight + log10AlleleCount);
}
}
}
// Order alleles by ascending estimated AC; ties are broken by descending index so that,
// given equal estimates, the allele with the LOWER index is kept (dropped last).
final PriorityQueue<Allele> lessFrequentFirst = new PriorityQueue<>(alleleCount, new Comparator<Allele>() {
@Override
public int compare(final Allele a1, final Allele a2) {
final int index1 = genotypeLikelihoods.alleleIndex(a1);
final int index2 = genotypeLikelihoods.alleleIndex(a2);
final double freq1 = log10EstimatedACs[index1];
final double freq2 = log10EstimatedACs[index2];
if (freq1 != freq2) {
return Double.compare(freq1, freq2);
} else {
return Integer.compare(index2, index1);
}
}
});
// Only alternative alleles are candidates; index 0 is assumed to be the reference and is never enqueued.
for (int i = 1; i < alleleCount; i++) {
lessFrequentFirst.add(genotypeLikelihoods.alleleAt(i));
}
// Take exactly the excess count of least-frequent alleles off the head of the queue.
final Set<Allele> result = new HashSet<>(excessAlternativeAlleleCount);
for (int i = 0; i < excessAlternativeAlleleCount; i++) {
result.add(lessFrequentFirst.remove());
}
return result;
}
/**
* Tries to phase the individual alleles based on pairwise comparisons to the other alleles based on all called haplotypes
*
@ -325,13 +451,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @param calledHaplotypes the set of haplotypes used for calling
* @return non-null Map
*/
protected static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
@VisibleForTesting
static Map<VariantContext, Set<Haplotype>> constructHaplotypeMapping(final List<VariantContext> originalCalls,
final Set<Haplotype> calledHaplotypes) {
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>(originalCalls.size());
for ( final VariantContext call : originalCalls ) {
// don't try to phase if there is not exactly 1 alternate allele
if ( ! isBiallelic(call) ) {
haplotypeMap.put(call, Collections.<Haplotype>emptySet());
haplotypeMap.put(call, Collections.emptySet());
continue;
}
@ -362,10 +489,11 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* note that it is okay for this method NOT to populate the phaseSetMapping at all (e.g. in an impossible-to-phase situation)
* @return the next incremental unique index
*/
protected static int constructPhaseSetMapping(final List<VariantContext> originalCalls,
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final int totalAvailableHaplotypes,
final Map<VariantContext, Pair<Integer, String>> phaseSetMapping) {
@VisibleForTesting
static int constructPhaseSetMapping(final List<VariantContext> originalCalls,
final Map<VariantContext, Set<Haplotype>> haplotypeMap,
final int totalAvailableHaplotypes,
final Map<VariantContext, Pair<Integer, String>> phaseSetMapping) {
final int numCalls = originalCalls.size();
int uniqueCounter = 0;
@ -457,9 +585,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
* @param indexTo last index (exclusive) of phase group IDs
* @return a non-null list which represents the possibly phased version of the calls
*/
protected static List<VariantContext> constructPhaseGroups(final List<VariantContext> originalCalls,
final Map<VariantContext, Pair<Integer, String>> phaseSetMapping,
final int indexTo) {
@VisibleForTesting
static List<VariantContext> constructPhaseGroups(final List<VariantContext> originalCalls,
final Map<VariantContext, Pair<Integer, String>> phaseSetMapping,
final int indexTo) {
final List<VariantContext> phasedCalls = new ArrayList<>(originalCalls);
// if we managed to find any phased groups, update the VariantContexts
@ -561,6 +690,10 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
}
if (call.getAlleles().size() != readAlleleLikelihoodsForAnnotations.alleleCount()) {
readAlleleLikelihoodsForAnnotations.updateNonRefAlleleLikelihoods(new IndexedAlleleList<>(new HashSet<>(call.getAlleles())));
}
// Skim the filtered map based on the location so that we do not add filtered reads that are going to be removed
// just a few lines of code below.
final Map<String, List<GATKSAMRecord>> overlappingFilteredReads = overlappingFilteredReads(perSampleFilteredReadList, loc);
@ -687,15 +820,12 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
/**
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
* @param readLikelihoods Allele map describing mapping from reads to alleles and corresponding likelihoods
* @param mergedVC Input VC with event to genotype
* @return GenotypesContext object wrapping genotype objects with PLs
*/
@Requires({"readLikelihoods!= null", "mergedVC != null"})
@Ensures("result != null")
protected GenotypesContext calculateGLsForThisEvent( final ReadLikelihoods<Allele> readLikelihoods, final VariantContext mergedVC, final List<Allele> noCallAlleles ) {
final List<Allele> vcAlleles = mergedVC.getAlleles();
final AlleleList<Allele> alleleList = readLikelihoods.alleleCount() == vcAlleles.size() ? readLikelihoods : new IndexedAlleleList<>(vcAlleles);
final GenotypingLikelihoods<Allele> likelihoods = genotypingModel.calculateLikelihoods(alleleList,new GenotypingData<>(ploidyModel,readLikelihoods));
private GenotypesContext calculateGLsForThisEvent(final ReadLikelihoods<Allele> readLikelihoods, final List<Allele> noCallAlleles) {
final GenotypingLikelihoods<Allele> likelihoods = genotypingModel.calculateLikelihoods(readLikelihoods, new GenotypingData<>(ploidyModel, readLikelihoods));
final int sampleCount = samples.sampleCount();
final GenotypesContext result = GenotypesContext.create(sampleCount);
for (int s = 0; s < sampleCount; s++)
@ -709,7 +839,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*/
// TODO - split into input haplotypes and output haplotypes as not to share I/O arguments
@Requires("haplotypes != null")
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
private static void cleanUpSymbolicUnassembledEvents(final List<Haplotype> haplotypes) {
final List<Haplotype> haplotypesToRemove = new ArrayList<>();
for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
@ -742,9 +872,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1);
final Event refEvent = new Event(null);
eventMapper.put(refEvent, new ArrayList<Haplotype>());
eventMapper.put(refEvent, new ArrayList<>());
for( final VariantContext vc : eventsAtThisLoc ) {
eventMapper.put(new Event(vc), new ArrayList<Haplotype>());
eventMapper.put(new Event(vc), new ArrayList<>());
}
for( final Haplotype h : haplotypes ) {
@ -764,11 +894,12 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
}
@Deprecated
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
@VisibleForTesting
static Map<Integer,VariantContext> generateVCsFromAlignment(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) {
return new EventMap(haplotype, ref, refLoc, sourceNameToAdd);
}
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
private static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
for( final VariantContext vc : list ) {
if( vc.hasSameAllelesAs(vcToTest) ) {
return true;
@ -780,7 +911,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
protected static class Event {
public VariantContext vc;
public Event( final VariantContext vc ) {
@VisibleForTesting
Event( final VariantContext vc ) {
this.vc = vc;
}
@ -800,7 +932,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*
* @return never {@code null}.
*/
public PloidyModel getPloidyModel() {
PloidyModel getPloidyModel() {
return ploidyModel;
}
@ -809,7 +941,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<AssemblyBa
*
* @return never {@code null}.
*/
public GenotypingModel getGenotypingModel() {
GenotypingModel getGenotypingModel() {
return genotypingModel;
}

View File

@ -352,7 +352,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMultiAllelicNonRef() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample",
b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("87f9203f5ac637080469052b702c61c7"));
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("1d9e75bd09a6fc5a1d9156fe8a7d43ce"));
spec.disableShadowBCF();
executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec);
}

View File

@ -174,7 +174,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMultiSampleGGA() throws IOException {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf" +
" -isr INTERSECTION -L " + GGA_INTERVALS_FILE,
"6d3cea3ee76b6eba14c1dfe230cff96b");
"af3f54bee3347cafb34734dc7d3516a9");
}
@Test
@ -186,7 +186,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGATetraploid() throws IOException {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 4 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000",
"9315d146a66c7baf3b615eb480c54dc1");
"d7c3aac002701bec6b6d0539c65207be");
}
@Test

View File

@ -315,11 +315,11 @@ public class MathUtils {
return maxValue + (sum != 1.0 ? Math.log10(sum) : 0.0);
}
public static double sumLog10(final double[] log10values) {
public static double sumLog10(final double ... log10values) {
return Math.pow(10.0, log10sumLog10(log10values));
}
public static double log10sumLog10(final double[] log10values) {
public static double log10sumLog10(final double ... log10values) {
return log10sumLog10(log10values, 0);
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.gatk.utils.genotyper;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.variant.variantcontext.Allele;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
@ -34,6 +35,7 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
@ -92,6 +94,11 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
*/
private int referenceAlleleIndex = -1;
/**
* Index of the non-ref allele if any, otherwise - 1.
*/
private int nonRefAlleleIndex = -1;
/**
* Caches the read-list per sample list returned by {@link #sampleReads}
*/
@ -138,6 +145,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
readListBySampleIndex = new List[sampleCount];
valuesBySampleIndex = new double[sampleCount][][];
referenceAlleleIndex = findReferenceAllele(alleles);
nonRefAlleleIndex = findNonRefAllele(alleles);
readIndexBySampleIndex = new Object2IntMap[sampleCount];
@ -219,6 +227,15 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
return -1;
}
/**
 * Locates the index of the &lt;NON_REF&gt; symbolic allele within an allele list.
 *
 * @param alleles the allele list to scan.
 * @return the index of the non-ref allele, or {@code -1} when the list does not contain one.
 */
private int findNonRefAllele(final AlleleList<A> alleles) {
// Scan from the back: the <NON_REF> allele is conventionally placed last, so this
// typically finds it on the first comparison.
int index = alleles.alleleCount() - 1;
while (index >= 0) {
if (alleles.alleleAt(index).equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) {
return index;
}
index--;
}
return -1;
}
/**
* Returns the index of a sample within the likelihood collection.
*
@ -419,16 +436,21 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
}
/**
* Search the best allele for a read.
*
* @param sampleIndex including sample index.
* @param readIndex target read index.
*
* @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null}
* if none could be found.
*/
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) {
return searchBestAllele(sampleIndex, readIndex, canBeReference, alleles);
}
/**
* Search the best allele for a read.
*
* @param sampleIndex including sample index.
* @param readIndex target read index.
*
* @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null}
* if none could be found.
*/
private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference,
final AlleleList<A> allelesToConsider) {
final int alleleCount = alleles.alleleCount();
if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference))
return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY);
@ -441,6 +463,10 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
for (int a = bestAlleleIndex + 1; a < alleleCount; a++) {
if (!canBeReference && referenceAlleleIndex == a)
continue;
if (nonRefAlleleIndex == a)
continue;
if (allelesToConsider.alleleIndex(alleles.alleleAt(a)) < 0)
continue;
final double candidateLikelihood = sampleValues[a][readIndex];
if (candidateLikelihood > bestLikelihood) {
bestAlleleIndex = a;
@ -501,6 +527,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
alleleList = null;
int referenceIndex = this.referenceAlleleIndex;
int nonRefIndex = this.nonRefAlleleIndex;
@SuppressWarnings("unchecked")
final A[] newAlleles = (A[]) new Allele[newAlleleCount];
for (int a = 0; a < oldAlleleCount; a++)
@ -511,19 +538,20 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
if (referenceIndex != -1)
throw new IllegalArgumentException("there cannot be more than one reference allele");
referenceIndex = newIndex;
} else if (allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) {
if (nonRefAlleleIndex != -1)
throw new IllegalArgumentException(String.format("there cannot be more than one %s allele", GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME));
nonRefIndex = newIndex;
}
newAlleles[newIndex++] = allele;
}
alleles = new IndexedAlleleList<>(newAlleles);
if (referenceIndex != -1)
referenceAlleleIndex = referenceIndex;
final int sampleCount = samples.sampleCount();
for (int s = 0; s < sampleCount; s++) {
final int sampleReadCount = readsBySampleIndex[s].length;
final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount);
final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s], newAlleleCount);
for (int a = oldAlleleCount; a < newAlleleCount; a++) {
newValuesBySampleIndex[a] = new double[sampleReadCount];
if (defaultLikelihood != 0.0)
@ -531,8 +559,86 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
}
valuesBySampleIndex[s] = newValuesBySampleIndex;
}
if (referenceIndex != -1)
referenceAlleleIndex = referenceIndex;
if (nonRefIndex != -1) {
nonRefAlleleIndex = nonRefIndex;
updateNonRefAlleleLikelihoods();
}
}
/**
 * Modify this likelihood collection dropping some of its alleles.
 *
 * <p>The per-read likelihood rows of the surviving alleles are reused (not copied);
 * only the per-sample row tables are rebuilt. If the reference or the NON_REF allele
 * is among the dropped ones, the corresponding cached index is refreshed, and the
 * NON_REF likelihoods are recomputed against the reduced allele set.</p>
 *
 * @param allelesToDrop set of alleles to be dropped.
 * @throws IllegalArgumentException if {@code allelesToDrop} is {@code null} or contain elements that are
 *    not alleles in this collection.
 */
public void dropAlleles(final Set<A> allelesToDrop) {
    if (allelesToDrop == null) {
        throw new IllegalArgumentException("the input allele to drop set cannot be null");
    }
    if (allelesToDrop.isEmpty()) {
        return; // nothing to do.
    }
    final int oldAlleleCount = alleles.alleleCount();
    // Mark the positions of the alleles that must go away, validating membership as we go.
    final boolean[] dropped = new boolean[oldAlleleCount];
    for (final A allele : allelesToDrop) {
        final int alleleIndex = alleles.alleleIndex(allele);
        if (alleleIndex < 0) {
            throw new IllegalArgumentException("unknown allele: " + allele);
        }
        dropped[alleleIndex] = true;
    }
    // Build the surviving allele array together with a new-index -> old-index map.
    final int newAlleleCount = oldAlleleCount - allelesToDrop.size();
    @SuppressWarnings("unchecked")
    final A[] keptAlleles = (A[]) new Allele[newAlleleCount];
    final int[] oldIndexOf = new int[newAlleleCount];
    for (int oldIndex = 0, newIndex = 0; oldIndex < oldAlleleCount; oldIndex++) {
        if (!dropped[oldIndex]) {
            oldIndexOf[newIndex] = oldIndex;
            keptAlleles[newIndex++] = alleles.alleleAt(oldIndex);
        }
    }
    // Re-point each sample's likelihood rows; the per-read double[] arrays themselves are shared.
    final int sampleCount = samples.sampleCount();
    for (int s = 0; s < sampleCount; s++) {
        final double[][] oldValues = valuesBySampleIndex[s];
        final double[][] newValues = new double[newAlleleCount][];
        for (int a = 0; a < newAlleleCount; a++) {
            newValues[a] = oldValues[oldIndexOf[a]];
        }
        valuesBySampleIndex[s] = newValues;
    }
    alleleList = Collections.unmodifiableList(Arrays.asList(keptAlleles));
    alleles = new IndexedAlleleList<>(alleleList);
    // The dropped alleles may have included the special NON_REF or reference allele; refresh the cached indices.
    if (nonRefAlleleIndex >= 0) {
        nonRefAlleleIndex = findNonRefAllele(alleles);
        updateNonRefAlleleLikelihoods();
    }
    if (referenceAlleleIndex >= 0)
        referenceAlleleIndex = findReferenceAllele(alleles);
}
/**
 * Drop all the alleles not present in the subset passed as a parameter.
 *
 * @param subset the alleles to retain.
 * @throws IllegalArgumentException if {@code subset} is {@code null}, contains alleles unknown to this
 *    likelihood collection, or would leave no concrete allele behind (empty, or just the NON_REF allele).
 */
public void retainAlleles(final Set<A> subset) {
    // BUG FIX: the null guard used to test the field 'alleles' instead of the 'subset' argument,
    // so a null subset slipped through and caused an NPE in containsAll below.
    if (subset == null) {
        throw new IllegalArgumentException("the retain subset must not be null");
    } else if (!alleles().containsAll(subset)) {
        // Note: the former extra check 'subset.size() > alleleCount()' was redundant; a Set fully
        // contained in the allele list can never be larger than the list.
        throw new IllegalArgumentException("some of the alleles to retain are not present in the read-likelihoods collection");
    } else if (subset.isEmpty() || (subset.size() == 1 && subset.contains(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))) {
        throw new IllegalArgumentException("there must be at least one allele to retain");
    } else {
        // Retain == drop the complement of the subset.
        final Set<A> allelesToDrop = new HashSet<>(alleles());
        allelesToDrop.removeAll(subset);
        dropAlleles(allelesToDrop);
    }
}
/**
* Likelihood matrix between a set of alleles and reads.
* @param <A> the allele-type.
@ -543,13 +649,13 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* List of reads in the matrix sorted by their index therein.
* @return never {@code null}.
*/
public List<GATKSAMRecord> reads();
List<GATKSAMRecord> reads();
/**
* List of alleles in the matrix sorted by their index in the collection.
* @return never {@code null}.
*/
public List<A> alleles();
List<A> alleles();
/**
* Set the likelihood of a read given an allele through their indices.
@ -561,7 +667,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex}
* are not valid allele and read indices respectively.
*/
public void set(final int alleleIndex, final int readIndex, final double value);
void set(final int alleleIndex, final int readIndex, final double value);
/**
* Returns the likelihood of a read given a haplotype.
@ -575,7 +681,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set}
* or 0.0 if none was set.
*/
public double get(final int alleleIndex, final int readIndex);
double get(final int alleleIndex, final int readIndex);
/**
* Queries the index of an allele in the matrix.
@ -586,7 +692,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @return -1 if such allele does not exist, otherwise its index which 0 or greater.
*/
@SuppressWarnings("unused")
public int alleleIndex(final A allele);
int alleleIndex(final A allele);
/**
* Queries the index of a read in the matrix.
@ -599,19 +705,19 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* which is 0 or greater.
*/
@SuppressWarnings("unused")
public int readIndex(final GATKSAMRecord read);
int readIndex(final GATKSAMRecord read);
/**
* Number of allele in the matrix.
* @return never negative.
*/
public int alleleCount();
int alleleCount();
/**
* Number of reads in the matrix.
* @return never negative.
*/
public int readCount();
int readCount();
/**
* Returns the allele given its index.
@ -621,7 +727,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index.
* @return never {@code null}.
*/
public A alleleAt(final int alleleIndex);
A alleleAt(final int alleleIndex);
/**
* Returns the allele given its index.
@ -631,7 +737,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @throws IllegalArgumentException if {@code readIndex} is not a valid read index.
* @return never {@code null}.
*/
public GATKSAMRecord readAt(final int readIndex);
GATKSAMRecord readAt(final int readIndex);
/**
@ -640,7 +746,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @param dest the destination array.
* @param offset the copy offset within the destination allele
*/
public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset);
void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset);
}
/**
@ -905,7 +1011,6 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
*
* @throws IllegalArgumentException the location cannot be {@code null} nor unmapped.
*/
@SuppressWarnings("unused")
public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) {
if (location == null)
throw new IllegalArgumentException("the location cannot be null");
@ -921,7 +1026,6 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
final int alleleCount = alleles.alleleCount();
final IntArrayList removeIndices = new IntArrayList(10);
for (int s = 0; s < sampleCount; s++) {
int readRemoveCount = 0;
final GATKSAMRecord[] sampleReads = readsBySampleIndex[s];
final int sampleReadCount = sampleReads.length;
for (int r = 0; r < sampleReadCount; r++)
@ -932,28 +1036,6 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
}
}
// Compare the read coordinates to the location of interest.
// Returns true iff the read is mapped on the requested contig and its alignment
// span [start, end] intersects [locStart, locEnd].
private boolean readOverlapsLocation(final String contig, final int locStart,
                                     final int locEnd, final GATKSAMRecord read) {
    if (read.getReadUnmappedFlag()) {
        return false;
    }
    if (!read.getReferenceName().equals(contig)) {
        return false;
    }
    int start = read.getAlignmentStart();
    int stop = read.getAlignmentEnd();
    // Paranoia? based on GLP.createGenomeLoc(Read) the coordinates may come reversed; normalize them.
    if (start > stop) {
        final int tmp = start;
        start = stop;
        stop = tmp;
    }
    // Closed-interval intersection test (De Morgan of the "disjoint" condition).
    return stop >= locStart && start <= locEnd;
}
/**
* Removes those read that the best possible likelihood given any allele is just too low.
*
@ -991,7 +1073,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
}
// Check whether the read is poorly modelled.
protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) {
private boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) {
final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase));
final double log10QualPerBase = -4.0;
final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase;
@ -1089,37 +1171,25 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
throw new IllegalArgumentException("non-ref allele cannot be null");
if (!nonRefAllele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))
throw new IllegalArgumentException("the non-ref allele is not valid");
// Already present?
if (alleles.alleleIndex(nonRefAllele) != -1)
return;
final int oldAlleleCount = alleles.alleleCount();
final int newAlleleCount = oldAlleleCount + 1;
@SuppressWarnings("unchecked")
final A[] newAlleles = (A[]) new Allele[newAlleleCount];
for (int a = 0; a < oldAlleleCount; a++)
newAlleles[a] = alleles.alleleAt(a);
newAlleles[oldAlleleCount] = nonRefAllele;
alleles = new IndexedAlleleList<>(newAlleles);
alleleList = null; // remove the cached alleleList.
final int sampleCount = samples.sampleCount();
for (int s = 0; s < sampleCount; s++)
addNonReferenceAlleleLikelihoodsPerSample(oldAlleleCount, newAlleleCount, s);
addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY);
updateNonRefAlleleLikelihoods();
}
// Updates per-sample structures according to the addition of the NON_REF allele.
private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) {
final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount);
final int sampleReadCount = readsBySampleIndex[sampleIndex].length;
/**
 * Recomputes the NON_REF allele likelihood entries considering every allele
 * currently present in this collection.
 *
 * <p>Convenience overload of {@code updateNonRefAlleleLikelihoods(AlleleList)}
 * that passes the collection's own allele list.</p>
 */
public void updateNonRefAlleleLikelihoods() {
    updateNonRefAlleleLikelihoods(alleles);
}
final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount];
Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY);
for (int r = 0; r < sampleReadCount; r++) {
final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true);
final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood
: bestAllele.likelihood - bestAllele.confidence;
nonRefAlleleLikelihoods[r] = secondBestLikelihood;
/**
 * Recomputes the NON_REF allele likelihood entries, restricting the best-allele
 * search to the given allele list.
 *
 * <p>For every read, the NON_REF likelihood is set to the second-best likelihood
 * among {@code allelesToConsider} (or the best one when no finite confidence gap
 * exists). A no-op when this collection holds no NON_REF allele.</p>
 *
 * @param allelesToConsider alleles eligible as best/second-best candidates.
 */
public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider) {
    if (nonRefAlleleIndex < 0) {
        return; // no NON_REF allele in this collection; nothing to update.
    }
    final int sampleCount = samples.sampleCount();
    for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) {
        final double[][] likelihoodsByAllele = valuesBySampleIndex[sampleIndex];
        final double[] nonRefRow = likelihoodsByAllele[nonRefAlleleIndex];
        final int readCount = likelihoodsByAllele[0].length;
        for (int readIndex = 0; readIndex < readCount; readIndex++) {
            final BestAllele best = searchBestAllele(sampleIndex, readIndex, true, allelesToConsider);
            // Use the second-best likelihood (best minus the confidence gap) unless the gap is infinite.
            nonRefRow[readIndex] = Double.isInfinite(best.confidence)
                    ? best.likelihood
                    : best.likelihood - best.confidence;
        }
    }
}
@ -1170,9 +1240,9 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
*
* @return never {@code null}.
*/
public static ReadLikelihoods<Allele> fromPerAlleleReadLikelihoodsMap(final AlleleList<Allele> alleleList, final Map<String,PerReadAlleleLikelihoodMap> map) {
@VisibleForTesting
static ReadLikelihoods<Allele> fromPerAlleleReadLikelihoodsMap(final AlleleList<Allele> alleleList, final Map<String,PerReadAlleleLikelihoodMap> map) {
//TODO add test code for this method.
// First we need to create the read-likelihood collection with all required alleles, samples and reads.
final SampleList sampleList = new IndexedSampleList(map.keySet());
final int alleleCount = alleleList.alleleCount();
@ -1228,7 +1298,8 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* @param sampleIndex the target sample.
* @return never {@code null}, perhaps empty.
*/
public Map<A,List<GATKSAMRecord>> readsByBestAlleleMap(final int sampleIndex) {
@VisibleForTesting
Map<A,List<GATKSAMRecord>> readsByBestAlleleMap(final int sampleIndex) {
checkSampleIndex(sampleIndex);
final int alleleCount = alleles.alleleCount();
final int sampleReadCount = readsBySampleIndex[sampleIndex].length;