Merge pull request #1428 from broadinstitute/vrr_issue_1419_out_of_mem_genotype_gvcfs
RCM Variant sites merger won't output PL when there are too many alle…
This commit is contained in:
commit
9c3b928945
|
|
@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
|
|||
import org.broadinstitute.gatk.utils.MathUtils;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import org.broadinstitute.gatk.utils.collections.Pair;
|
||||
import org.broadinstitute.gatk.utils.exceptions.GATKException;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
|
||||
|
|
@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
|
||||
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
|
||||
|
||||
//TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419
|
||||
//TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will
|
||||
//TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs.
|
||||
//TODO However we need to change this with a more permanent solution.
|
||||
//TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs
|
||||
//TODO in order to avoid to add yet another limit .
|
||||
final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
|
||||
if (!shouldComputePLs) {
|
||||
logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
|
||||
}
|
||||
|
||||
for ( final Pair<VariantContext,List<Allele>> pair : vcAndNewAllelePairs ) {
|
||||
final VariantContext vc = pair.getFirst();
|
||||
final List<Allele> remappedAlleles = pair.getSecond();
|
||||
|
||||
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
|
||||
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs);
|
||||
|
||||
// special case DP (add it up) for all events
|
||||
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
|
||||
|
|
@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
* @param remappedAlleles the list of remapped alleles for the sample
|
||||
* @param targetAlleles the list of target alleles
|
||||
* @param samplesAreUniquified true if sample names have been uniquified
|
||||
* @param shouldComputePLs true if the PL can be computed in this merge.
|
||||
*/
|
||||
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
|
||||
final VariantContext vc,
|
||||
final List<Allele> remappedAlleles,
|
||||
final List<Allele> targetAlleles,
|
||||
final boolean samplesAreUniquified) {
|
||||
final boolean samplesAreUniquified,
|
||||
final boolean shouldComputePLs) {
|
||||
final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
|
||||
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
|
||||
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
|
||||
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
|
||||
int[] perSampleIndexesOfRelevantAlleles;
|
||||
|
||||
|
||||
for (final Genotype g : vc.getGenotypes()) {
|
||||
final String name;
|
||||
|
|
@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger {
|
|||
else
|
||||
name = g.getSampleName();
|
||||
final int ploidy = g.getPloidy();
|
||||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
|
||||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()))
|
||||
.noPL();
|
||||
genotypeBuilder.name(name);
|
||||
final boolean hasPL = g.hasPL();
|
||||
|
||||
final boolean doPLs = shouldComputePLs && g.hasPL();
|
||||
final boolean hasAD = g.hasAD();
|
||||
final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY);
|
||||
if (hasPL || hasSAC) {
|
||||
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
|
||||
if (g.hasPL()) {
|
||||
if (doPLs || hasSAC || hasAD) {
|
||||
final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
|
||||
if (doPLs) {
|
||||
// lazy initialization of the genotype index map by ploidy.
|
||||
|
||||
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
|
||||
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
|
||||
: genotypeIndexMapsByPloidy[ploidy];
|
||||
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
|
||||
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
|
||||
genotypeBuilder.PL(PLs).AD(AD);
|
||||
genotypeBuilder.PL(PLs);
|
||||
}
|
||||
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
|
||||
if (hasAD) {
|
||||
genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
|
||||
}
|
||||
if (hasSAC) {
|
||||
final List<Integer> sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles);
|
||||
final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse);
|
||||
genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs);
|
||||
|
|
|
|||
|
|
@ -51,13 +51,16 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.variantutils;
|
||||
|
||||
import htsjdk.samtools.SAMFileHeader;
|
||||
import htsjdk.variant.variantcontext.*;
|
||||
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.gatk.utils.*;
|
||||
import org.broadinstitute.gatk.utils.exceptions.UserException;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
|
@ -65,10 +68,7 @@ import org.testng.annotations.Test;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}.
|
||||
|
|
@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergeTooManyAlleles() {
|
||||
final int ALLELE_COUNT = 100;
|
||||
final int SAMPLE_COUNT = 200;
|
||||
final double MNP_PROB = 0.1;
|
||||
final double MNP_MUT_RATE = 0.1;
|
||||
final double NO_PL_PROB = 0.1;
|
||||
final Random rdn = new Random(13);
|
||||
final RandomDNA randomDNA = new RandomDNA(rdn);
|
||||
final Allele refAllele = Allele.create(randomDNA.nextBases(30), true);
|
||||
final Set<Allele> alleles = new LinkedHashSet<>(ALLELE_COUNT);
|
||||
alleles.add(refAllele);
|
||||
alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
|
||||
while (alleles.size() < ALLELE_COUNT) {
|
||||
if (rdn.nextDouble() <= MNP_PROB) {
|
||||
final byte[] bytes = refAllele.getBases().clone();
|
||||
for (int i = 0; i < bytes.length; i++) {
|
||||
bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i];
|
||||
}
|
||||
if (!Arrays.equals(bytes, refAllele.getBases())) {
|
||||
alleles.add(Allele.create(bytes, false));
|
||||
}
|
||||
} else {
|
||||
final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1;
|
||||
if (newLength < refAllele.getBases().length) {
|
||||
alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false));
|
||||
} else if (newLength > refAllele.getBases().length) {
|
||||
final byte[] bases = randomDNA.nextBases(newLength);
|
||||
System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length);
|
||||
} // else same length... we skip and try again.
|
||||
}
|
||||
}
|
||||
final List<Allele> alleleList = new ArrayList<>(alleles);
|
||||
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
|
||||
final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
final List<VariantContext> variantContexts = new ArrayList<>(SAMPLE_COUNT);
|
||||
for (int i = 0; i < SAMPLE_COUNT; i++) {
|
||||
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1));
|
||||
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2));
|
||||
final List<Allele> sampleAlleles = new ArrayList<>();
|
||||
sampleAlleles.add(refAllele);
|
||||
sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
|
||||
for (int j = 2; j < alleles.size(); j++) {
|
||||
if (rdn.nextDouble() <= 0.01) {
|
||||
sampleAlleles.add(alleleList.get(j));
|
||||
}
|
||||
}
|
||||
if (rdn.nextDouble() > NO_PL_PROB) {
|
||||
final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1];
|
||||
for (int j = 0; j < PLs.length; j++) {
|
||||
PLs[j] = rdn.nextInt(1000);
|
||||
}
|
||||
genotypeBuilder.PL(PLs);
|
||||
}
|
||||
final int[] AD = new int[sampleAlleles.size()];
|
||||
for (int j = 0; j < AD.length; j++) {
|
||||
AD[j] = rdn.nextInt(100);
|
||||
}
|
||||
genotypeBuilder.AD(AD);
|
||||
|
||||
variantContexts.add(new VariantContextBuilder()
|
||||
.loc("chr1", 10000, 10000 + refAllele.getBases().length - 1)
|
||||
.alleles(sampleAlleles)
|
||||
.genotypes(genotypeBuilder.make())
|
||||
.make());
|
||||
}
|
||||
final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts,
|
||||
genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null);
|
||||
Assert.assertNotNull(result);
|
||||
Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT);
|
||||
Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED,
|
||||
"Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold");
|
||||
System.err.println("Alleles: " + result.getNAlleles());
|
||||
for (int i = 0; i < SAMPLE_COUNT; i++) {
|
||||
Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "referenceConfidenceMergeData")
|
||||
public Object[][] makeReferenceConfidenceMergeData() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue