Merge pull request #1428 from broadinstitute/vrr_issue_1419_out_of_mem_genotype_gvcfs

RCM Variant sites merger won't output PL when there are too many alleles
This commit is contained in:
Valentin Ruano Rubio 2016-07-01 13:58:38 -04:00 committed by GitHub
commit 9c3b928945
2 changed files with 111 additions and 16 deletions

View File

@@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
@@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger {
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
//TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419
//TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will
//TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs.
//TODO However we need to change this with a more permanent solution.
//TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs
//TODO in order to avoid to add yet another limit .
final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
if (!shouldComputePLs) {
logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
}
for ( final Pair<VariantContext,List<Allele>> pair : vcAndNewAllelePairs ) {
final VariantContext vc = pair.getFirst();
final List<Allele> remappedAlleles = pair.getSecond();
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
@@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger {
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param samplesAreUniquified true if sample names have been uniquified
* @param shouldComputePLs true if the PL can be computed in this merge.
*/
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
final VariantContext vc,
final List<Allele> remappedAlleles,
final List<Allele> targetAlleles,
final boolean samplesAreUniquified) {
final boolean samplesAreUniquified,
final boolean shouldComputePLs) {
final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
int[] perSampleIndexesOfRelevantAlleles;
for (final Genotype g : vc.getGenotypes()) {
final String name;
@@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger {
else
name = g.getSampleName();
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()))
.noPL();
genotypeBuilder.name(name);
final boolean hasPL = g.hasPL();
final boolean doPLs = shouldComputePLs && g.hasPL();
final boolean hasAD = g.hasAD();
final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY);
if (hasPL || hasSAC) {
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (g.hasPL()) {
if (doPLs || hasSAC || hasAD) {
final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (doPLs) {
// lazy initialization of the genotype index map by ploidy.
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
: genotypeIndexMapsByPloidy[ploidy];
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
genotypeBuilder.PL(PLs).AD(AD);
genotypeBuilder.PL(PLs);
}
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
if (hasAD) {
genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}
if (hasSAC) {
final List<Integer> sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles);
final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse);
genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs);

View File

@@ -51,13 +51,16 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@@ -65,10 +68,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.*;
/**
* Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}.
@@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest {
}
}
@Test
public void testMergeTooManyAlleles() {
    // Regression test for https://github.com/broadinstitute/gsa-unstable/issues/1419:
    // when the merged site accumulates more alt alleles than
    // GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED, the merger
    // must drop PL annotations at the site instead of attempting the (combinatorially
    // huge, OOM-inducing) PL recomputation.
    final int ALLELE_COUNT = 100;     // distinct alleles to generate (incl. ref and <NON_REF>)
    final int SAMPLE_COUNT = 200;     // one single-genotype VariantContext per sample
    final double MNP_PROB = 0.1;      // chance a generated allele is a same-length MNP of the ref
    final double MNP_MUT_RATE = 0.1;  // per-base mutation rate when generating an MNP
    final double NO_PL_PROB = 0.1;    // chance a sample genotype is created without PLs
    final Random rdn = new Random(13);              // fixed seed for reproducibility
    final RandomDNA randomDNA = new RandomDNA(rdn);
    final Allele refAllele = Allele.create(randomDNA.nextBases(30), true);
    // LinkedHashSet: dedupes random alleles while keeping a deterministic order.
    final Set<Allele> alleles = new LinkedHashSet<>(ALLELE_COUNT);
    alleles.add(refAllele);
    alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    while (alleles.size() < ALLELE_COUNT) {
        if (rdn.nextDouble() <= MNP_PROB) {
            // Same-length allele: mutate a copy of the ref bases.
            final byte[] bytes = refAllele.getBases().clone();
            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i];
            }
            if (!Arrays.equals(bytes, refAllele.getBases())) {
                alleles.add(Allele.create(bytes, false));
            }
        } else {
            // Length-changing allele: truncation (deletion-like) or extension (insertion-like).
            final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1;
            if (newLength < refAllele.getBases().length) {
                alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false));
            } else if (newLength > refAllele.getBases().length) {
                final byte[] bases = randomDNA.nextBases(newLength);
                System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length);
                // BUGFIX: the generated extension allele was previously discarded,
                // making this branch dead code and skewing the allele distribution.
                alleles.add(Allele.create(bases, false));
            } // else same length... we skip and try again.
        }
    }
    final List<Allele> alleleList = new ArrayList<>(alleles);
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
    final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    // Build one single-sample VariantContext per sample, each carrying a random
    // subset of the alt alleles plus ref and <NON_REF>.
    final List<VariantContext> variantContexts = new ArrayList<>(SAMPLE_COUNT);
    for (int i = 0; i < SAMPLE_COUNT; i++) {
        final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1));
        genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2));
        final List<Allele> sampleAlleles = new ArrayList<>();
        sampleAlleles.add(refAllele);
        sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
        for (int j = 2; j < alleles.size(); j++) {
            if (rdn.nextDouble() <= 0.01) {
                sampleAlleles.add(alleleList.get(j));
            }
        }
        if (rdn.nextDouble() > NO_PL_PROB) {
            // Diploid PL array length is the triangular number of the allele count.
            final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1];
            for (int j = 0; j < PLs.length; j++) {
                PLs[j] = rdn.nextInt(1000);
            }
            genotypeBuilder.PL(PLs);
        }
        final int[] AD = new int[sampleAlleles.size()];
        for (int j = 0; j < AD.length; j++) {
            AD[j] = rdn.nextInt(100);
        }
        genotypeBuilder.AD(AD);
        variantContexts.add(new VariantContextBuilder()
                .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1)
                .alleles(sampleAlleles)
                .genotypes(genotypeBuilder.make())
                .make());
    }
    final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts,
            genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null);
    Assert.assertNotNull(result);
    Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT);
    // Sanity check on the generated data: the merged site must actually exceed
    // the genotyping threshold, otherwise the assertions below are vacuous.
    Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED,
            "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold");
    System.err.println("Alleles: " + result.getNAlleles());
    // The actual assertion under test: no sample genotype may carry PLs.
    for (int i = 0; i < SAMPLE_COUNT; i++) {
        Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i);
    }
}
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {