diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index 5ad0c581c..05c0a930d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -63,7 +63,6 @@ import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -147,11 +146,22 @@ public class ReferenceConfidenceVariantContextMerger { final List allelesList = new ArrayList<>(finalAlleleSet); + //TODO quick fix patch to address memory issue described in https://github.com/broadinstitute/gsa-unstable/issues/1419 + //TODO The reason to impose this limit here is that in practice the tool that is affected by the mem issue, GenotypeGVCFs, will + //TODO skip the site when the number of alleles is bigger than that limit, so this change does not change the outputs. + //TODO However we need to replace this with a more permanent solution. + //TODO For example we could impose maxAltAlleles or maxGenotypes in the output at every step including CombineGVCFs and GenotypeGVCFs + //TODO in order to avoid adding yet another limit.
+ final boolean shouldComputePLs = allelesList.size() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED; + if (!shouldComputePLs) { + logger.debug(String.format("location %s:%d has too many alleles (%d) to compute PLs (maximum allowed %d). PL genotype annotations won't be produced at this site", loc.getContig(), loc.getStart(), allelesList.size(), GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)); + } + for ( final Pair> pair : vcAndNewAllelePairs ) { final VariantContext vc = pair.getFirst(); final List remappedAlleles = pair.getSecond(); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified, shouldComputePLs); // special case DP (add it up) for all events if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { @@ -413,18 +423,20 @@ public class ReferenceConfidenceVariantContextMerger { * @param remappedAlleles the list of remapped alleles for the sample * @param targetAlleles the list of target alleles * @param samplesAreUniquified true if sample names have been uniquified + * @param shouldComputePLs true if the PL can be computed in this merge. */ private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, final VariantContext vc, final List remappedAlleles, final List targetAlleles, - final boolean samplesAreUniquified) { + final boolean samplesAreUniquified, + final boolean shouldComputePLs) { final int maximumPloidy = vc.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. 
final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - int[] perSampleIndexesOfRelevantAlleles; + for (final Genotype g : vc.getGenotypes()) { final String name; @@ -433,23 +445,28 @@ public class ReferenceConfidenceVariantContextMerger { else name = g.getSampleName(); final int ploidy = g.getPloidy(); - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())) + .noPL(); genotypeBuilder.name(name); - final boolean hasPL = g.hasPL(); + + final boolean doPLs = shouldComputePLs && g.hasPL(); + final boolean hasAD = g.hasAD(); final boolean hasSAC = g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); - if (hasPL || hasSAC) { - perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); - if (g.hasPL()) { + if (doPLs || hasSAC || hasAD) { + final int[] perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, vc.getStart(), g); + if (doPLs) { // lazy initialization of the genotype index map by ploidy. final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) : genotypeIndexMapsByPloidy[ploidy]; final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? 
generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD); + genotypeBuilder.PL(PLs); } - if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { + if (hasAD) { + genotypeBuilder.AD(generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles)); + } + if (hasSAC) { final List sacIndexesToUse = adaptToSACIndexes(perSampleIndexesOfRelevantAlleles); final int[] SACs = GATKVariantContextUtils.makeNewSACs(g, sacIndexesToUse); genotypeBuilder.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, SACs); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index dabfa27d5..1705ebe49 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -51,13 +51,16 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -65,10 +68,7 @@ import org.testng.annotations.Test; import java.io.File; 
import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}. @@ -170,6 +170,84 @@ public class VariantContextMergerUnitTest extends BaseTest { } } + @Test + public void testMergeTooManyAlleles() { + final int ALLELE_COUNT = 100; + final int SAMPLE_COUNT = 200; + final double MNP_PROB = 0.1; + final double MNP_MUT_RATE = 0.1; + final double NO_PL_PROB = 0.1; + final Random rdn = new Random(13); + final RandomDNA randomDNA = new RandomDNA(rdn); + final Allele refAllele = Allele.create(randomDNA.nextBases(30), true); + final Set alleles = new LinkedHashSet<>(ALLELE_COUNT); + alleles.add(refAllele); + alleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + while (alleles.size() < ALLELE_COUNT) { + if (rdn.nextDouble() <= MNP_PROB) { + final byte[] bytes = refAllele.getBases().clone(); + for (int i = 0; i < bytes.length; i++) { + bytes[i] = rdn.nextDouble() <= MNP_MUT_RATE ? randomDNA.nextBase() : bytes[i]; + } + if (!Arrays.equals(bytes, refAllele.getBases())) { + alleles.add(Allele.create(bytes, false)); + } + } else { + final int newLength = rdn.nextInt(refAllele.getBases().length + 100) + 1; + if (newLength < refAllele.getBases().length) { + alleles.add(Allele.create(Arrays.copyOf(refAllele.getBases(), newLength), false)); + } else if (newLength > refAllele.getBases().length) { + final byte[] bases = randomDNA.nextBases(newLength); + System.arraycopy(refAllele.getBases(), 0, bases, 0, refAllele.getBases().length); + } // else same length... we skip and try again. 
+ } + } + final List alleleList = new ArrayList<>(alleles); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final List variantContexts = new ArrayList<>(SAMPLE_COUNT); + for (int i = 0; i < SAMPLE_COUNT; i++) { + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder("SAMPLE_" + (i+1)); + genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(2)); + final List sampleAlleles = new ArrayList<>(); + sampleAlleles.add(refAllele); + sampleAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + for (int j = 2; j < alleles.size(); j++) { + if (rdn.nextDouble() <= 0.01) { + sampleAlleles.add(alleleList.get(j)); + } + } + if (rdn.nextDouble() > NO_PL_PROB) { + final int[] PLs = new int[(sampleAlleles.size() * (sampleAlleles.size() + 1)) >> 1]; + for (int j = 0; j < PLs.length; j++) { + PLs[j] = rdn.nextInt(1000); + } + genotypeBuilder.PL(PLs); + } + final int[] AD = new int[sampleAlleles.size()]; + for (int j = 0; j < AD.length; j++) { + AD[j] = rdn.nextInt(100); + } + genotypeBuilder.AD(AD); + + variantContexts.add(new VariantContextBuilder() + .loc("chr1", 10000, 10000 + refAllele.getBases().length - 1) + .alleles(sampleAlleles) + .genotypes(genotypeBuilder.make()) + .make()); + } + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(variantContexts, + genomeLocParser.createGenomeLoc("chr1", 10000), refAllele.getBases()[0], false, true, null); + Assert.assertNotNull(result); + Assert.assertEquals(result.getGenotypes().size(), SAMPLE_COUNT); + Assert.assertTrue(result.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED, + "Not necessarily a bug, need to fix the random data generation to make sure there is enough alt-alleles to go beyond this threshold"); + System.err.println("Alleles: " + result.getNAlleles()); + for (int i = 0; i < SAMPLE_COUNT; i++) { + 
Assert.assertFalse(result.getGenotype(i).hasPL(), "" + i); + } + } @DataProvider(name = "referenceConfidenceMergeData") public Object[][] makeReferenceConfidenceMergeData() {