RCM Variant sites merger won't output PL when there are too many alleles, in order to avoid memory issues with large cohort runs.

Small additional "cosmetic" changes to the code
Addresses issue #1419.
This commit is contained in:
Valentin Ruano Rubio 2016-06-30 11:41:03 -04:00
parent 1ff234e7dd
commit 45607d1b30
2 changed files with 111 additions and 16 deletions

View File

@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.GATKException;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger {
final List<Allele> allelesList = new ArrayList<>(finalAlleleSet);
//TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419
//TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will
//TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs.
//TODO However, we need to replace this with a more permanent solution.
//TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs
//TODO in order to avoid adding yet another limit.
final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED;
if (!shouldComputePLs) {
logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
}
for ( final Pair<VariantContext,List<Allele>> pair : vcAndNewAllelePairs ) {
final VariantContext vc = pair.getFirst();
final List<Allele> remappedAlleles = pair.getSecond();
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified);
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) {
@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger {
* @param remappedAlleles the list of remapped alleles for the sample
* @param targetAlleles the list of target alleles
* @param samplesAreUniquified true if sample names have been uniquified
* @param shouldComputePLs true if the PL can be computed in this merge.
*/
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
final VariantContext vc,
final List<Allele> remappedAlleles,
final List<Allele> targetAlleles,
final boolean samplesAreUniquified) {
final boolean samplesAreUniquified,
final boolean shouldComputePLs) {
final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY);
// the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies)
// we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible.
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][];
final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size());
int[] perSampleIndexesOfRelevantAlleles;
for (final Genotype g : vc.getGenotypes()) {
final String name;
@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger {
else
name = g.getSampleName();
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()));
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy()))
.noPL();
genotypeBuilder.name(name);
final boolean hasPL = g.hasPL();
final boolean doPLs = shouldComputePLs && g.hasPL();
final boolean hasAD = g.hasAD();
final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY);
if (hasPL || hasSAC) {
perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (g.hasPL()) {
if (doPLs || hasSAC || hasAD) {
final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g);
if (doPLs) {
// lazy initialization of the genotype index map by ploidy.
final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null
? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles)
: genotypeIndexMapsByPloidy[ploidy];
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
genotypeBuilder.PL(PLs).AD(AD);
genotypeBuilder.PL(PLs);
}
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
if (hasAD) {
genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}
if (hasSAC) {
final List<Integer> sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles);
final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse);
genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs);

View File

@ -51,13 +51,16 @@
package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.utils.*;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@ -65,10 +68,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.*;
/**
* Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}.
@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest {
}
}
@Test
public void testMergeTooManyAlleles() {
    // Verifies the fix for https://github.com/broadinstitute/gsa-unstable/issues/1419:
    // when the merged site carries more alt alleles than
    // GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED, the merger
    // must drop PL genotype annotations instead of trying to compute them.
    final int ALLELE_COUNT = 100;      // total distinct alleles to generate (incl. ref and <NON_REF>)
    final int SAMPLE_COUNT = 200;      // one single-genotype VariantContext per sample
    final double MNP_PROB = 0.1;       // probability a generated alt allele is an MNP
    final double MNP_MUT_RATE = 0.1;   // per-base mutation rate when building an MNP
    final double NO_PL_PROB = 0.1;     // probability a sample genotype carries no PL
    final Random rdn = new Random(13); // fixed seed so the test data is reproducible
    final RandomDNA randomDNA = new RandomDNA(rdn);
    final Allele refAllele = Allele.create(randomDNA.nextBases(30), true);
    // LinkedHashSet keeps insertion order stable while de-duplicating alleles.
    final Set<Allele> alleles = new LinkedHashSet<>(ALLELE_COUNT);
    alleles.add(refAllele);
    alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    while (alleles.size() < ALLELE_COUNT) {
        if (rdn.nextDouble() <= MNP_PROB) {
            // MNP: mutate a random subset of the reference bases.
            final byte[] bytes = refAllele.getBases().clone();
            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i];
            }
            if (!Arrays.equals(bytes, refAllele.getBases())) {
                alleles.add(Allele.create(bytes, false));
            }
        } else {
            // Indel: pick a random new length relative to the reference allele.
            final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1;
            if (newLength < refAllele.getBases().length) {
                // Deletion: a strict prefix of the reference bases.
                alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false));
            } else if (newLength > refAllele.getBases().length) {
                // Insertion: the full reference followed by random extra bases.
                final byte[] bases = randomDNA.nextBases(newLength);
                System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length);
                // BUGFIX: the insertion allele was previously built but never added,
                // so this branch silently produced nothing.
                alleles.add(Allele.create(bases, false));
            } // else same length... we skip and try again.
        }
    }
    final List<Allele> alleleList = new ArrayList<>(alleles);
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
    final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
    final List<VariantContext> variantContexts = new ArrayList<>(SAMPLE_COUNT);
    for (int i = 0; i < SAMPLE_COUNT; i++) {
        final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i + 1));
        genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2));
        // Each sample sees ref + <NON_REF> plus a ~1% random subset of the alt alleles.
        final List<Allele> sampleAlleles = new ArrayList<>();
        sampleAlleles.add(refAllele);
        sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
        for (int j = 2; j < alleles.size(); j++) {
            if (rdn.nextDouble() <= 0.01) {
                sampleAlleles.add(alleleList.get(j));
            }
        }
        if (rdn.nextDouble() > NO_PL_PROB) {
            // Diploid genotype-likelihood count for A alleles is A*(A+1)/2.
            final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1];
            for (int j = 0; j < PLs.length; j++) {
                PLs[j] = rdn.nextInt(1000);
            }
            genotypeBuilder.PL(PLs);
        }
        final int[] AD = new int[sampleAlleles.size()];
        for (int j = 0; j < AD.length; j++) {
            AD[j] = rdn.nextInt(100);
        }
        genotypeBuilder.AD(AD);
        variantContexts.add(new VariantContextBuilder()
                .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1)
                .alleles(sampleAlleles)
                .genotypes(genotypeBuilder.make())
                .make());
    }
    final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts,
            genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null);
    Assert.assertNotNull(result);
    Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT);
    // Sanity-check the random generation actually exceeded the PL threshold;
    // otherwise the assertions below would pass vacuously.
    Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED,
            "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold");
    // Above the threshold no merged genotype may carry PLs.
    for (int i = 0; i < SAMPLE_COUNT; i++) {
        Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i);
    }
}
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {