From 45607d1b30c15013ddaa13a40ebc8bddd38c3f98 Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Thu, 30 Jun 2016 11:41:03 -0400 Subject: [PATCH] RCM Variant sites merger won't output PL when there are too many alleles in order to avoid memory issues with large cohort runs. Small additional "cosmetic" changes to the code Addresses issue #1419. --- ...ferenceConfidenceVariantContextMerger.java | 41 ++++++--- .../VariantContextMergerUnitTest.java | 86 ++++++++++++++++++- 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 5ad0c581c..05c0a930d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger { final List allelesList = new ArrayList<>(finalAlleleSet); + //TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419 + //TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs will + //TODO skip the site when the number of alleles is bigger than that limit so this change does not change the outputs. + //TODO However we need to change this with a more permanent solution. + //TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs + //TODO in order to avoid to add yet another limit . + final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED; + if (!shouldComputePLs) { + logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); + } + for ( final Pair> pair : vcAndNewAllelePairs ) { final VariantContext vc = pair.getFirst(); final List remappedAlleles = pair.getSecond(); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs); // special case DP (add it up) for all events if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { @@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger { * @param remappedAlleles the list of remapped alleles for the sample * @param targetAlleles the list of target alleles * @param samplesAreUniquified true if sample names have been uniquified + * @param shouldComputePLs true if the PL can be computed in this merge. */ private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, final VariantContext vc, final List remappedAlleles, final List targetAlleles, - final boolean samplesAreUniquified) { + final boolean samplesAreUniquified, + final boolean shouldComputePLs) { final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - int[] perSampleIndexesOfRelevantAlleles; + for (final Genotype g : vc.getGenotypes()) { final String name; @@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger { else name = g.getSampleName(); final int ploidy = g.getPloidy(); - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())) + .noPL(); genotypeBuilder.name(name); - final boolean hasPL = g.hasPL(); + + final boolean doPLs = shouldComputePLs && g.hasPL(); + final boolean hasAD = g.hasAD(); final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); - if (hasPL || hasSAC) { - perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); - if (g.hasPL()) { + if (doPLs || hasSAC || hasAD) { + final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); + if (doPLs) { // lazy initialization of the genotype index map by ploidy. final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) : genotypeIndexMapsByPloidy[ploidy]; final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD); + genotypeBuilder.PL(PLs); } - if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { + if (hasAD) { + genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles)); + } + if (hasSAC) { final List sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles); final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse); genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index dabfa27d5..1705ebe49 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -51,13 +51,16 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -65,10 +68,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}. @@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest { } } + @Test + public void testMergeTooManyAlleles() { + final int ALLELE_COUNT = 100; + final int SAMPLE_COUNT = 200; + final double MNP_PROB = 0.1; + final double MNP_MUT_RATE = 0.1; + final double NO_PL_PROB = 0.1; + final Random rdn = new Random(13); + final RandomDNA randomDNA = new RandomDNA(rdn); + final Allele refAllele = Allele.create(randomDNA.nextBases(30), true); + final Set alleles = new LinkedHashSet<>(ALLELE_COUNT); + alleles.add(refAllele); + alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + while (alleles.size() < ALLELE_COUNT) { + if (rdn.nextDouble() <= MNP_PROB) { + final byte[] bytes = refAllele.getBases().clone(); + for (int i = 0; i < bytes.length; i++) { + bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i]; + } + if (!Arrays.equals(bytes, refAllele.getBases())) { + alleles.add(Allele.create(bytes, false)); + } + } else { + final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1; + if (newLength < refAllele.getBases().length) { + alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false)); + } else if (newLength > refAllele.getBases().length) { + final byte[] bases = randomDNA.nextBases(newLength); + System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length); + } // else same length... we skip and try again. + } + } + final List alleleList = new ArrayList<>(alleles); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final List variantContexts = new ArrayList<>(SAMPLE_COUNT); + for (int i = 0; i < SAMPLE_COUNT; i++) { + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1)); + genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2)); + final List sampleAlleles = new ArrayList<>(); + sampleAlleles.add(refAllele); + sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + for (int j = 2; j < alleles.size(); j++) { + if (rdn.nextDouble() <= 0.01) { + sampleAlleles.add(alleleList.get(j)); + } + } + if (rdn.nextDouble() > NO_PL_PROB) { + final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1]; + for (int j = 0; j < PLs.length; j++) { + PLs[j] = rdn.nextInt(1000); + } + genotypeBuilder.PL(PLs); + } + final int[] AD = new int[sampleAlleles.size()]; + for (int j = 0; j < AD.length; j++) { + AD[j] = rdn.nextInt(100); + } + genotypeBuilder.AD(AD); + + variantContexts.add(new VariantContextBuilder() + .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1) + .alleles(sampleAlleles) + .genotypes(genotypeBuilder.make()) + .make()); + } + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts, + genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null); + Assert.assertNotNull(result); + Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT); + Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED, + "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold"); + System.err.println("Alleles: " + result.getNAlleles()); + for (int i = 0; i < SAMPLE_COUNT; i++) { + Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i); + } + } @DataProvider(name = "referenceConfidenceMergeData") public Object[][] makeReferenceConfidenceMergeData() {