From c09667a20da47d9519bed8768a8eac2171b00146 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Wed, 22 Oct 2014 15:29:13 -0400 Subject: [PATCH] Fix bug in CombineGVCFs so now sample 2 variants occuring within sample 1 deletions get merged properly. CombineGVCFs now outputs ref conf for the duration of deletions so that SNPs occuring in other samples aligned with those deletions will be genotyped correctly --- .../walkers/variantutils/CombineGVCFs.java | 55 +++++++++++++++---- ...ferenceConfidenceVariantContextMerger.java | 47 +++++++++++++--- .../CombineGVCFsIntegrationTest.java | 2 +- .../VariantContextMergerUnitTest.java | 7 ++- 4 files changed, 88 insertions(+), 23 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index 51e3bb838..b2c33de27 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -112,10 +112,14 @@ public class CombineGVCFs extends RodWalker VCs; + final Set samples = new HashSet<>(); final byte[] refBases; final GenomeLoc loc; public PositionalState(final List VCs, final byte[] refBases, final GenomeLoc loc) { this.VCs = VCs; + for(final VariantContext vc : VCs){ + samples.addAll(vc.getSampleNames()); + } this.refBases = refBases; this.loc = loc; } @@ -123,6 +127,7 @@ public class CombineGVCFs extends RodWalker VCs = new LinkedList<>(); + final Set samples = new HashSet<>(); GenomeLoc prevPos = null; byte refAfterPrevPos; @@ -179,13 +184,17 @@ public class CombineGVCFs extends RodWalker 1 ? startingStates.refBases[1] : (byte)'N'); + endPreviousStates(previousState, currentPos, startingStates, true); } return previousState; @@ -194,12 +203,18 @@ public class CombineGVCFs extends RodWalker intersection = new HashSet(startingStates.samples); + intersection.retainAll(previousState.samples); + + //if there's a starting VC with a sample that's already in a current VC, don't skip this position + return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty(); } /** @@ -237,9 +252,15 @@ public class CombineGVCFs extends RodWalker 1 ? startingStates.refBases[1] : (byte)'N'; + else + refBase = startingStates.refBases[0]; final List stoppedVCs = new ArrayList<>(state.VCs.size()); @@ -250,12 +271,24 @@ public class CombineGVCFs extends RodWalker 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) { + state.samples.removeAll(vc.getSampleNames()); + state.VCs.remove(i); + } } } - if ( !stoppedVCs.isEmpty() ) { + //if we already output this state in BP resolution mode (reflected by state.prevPos) then skip output + GenomeLoc lastWriteLoc = state.prevPos; + final int lastWritePos = lastWriteLoc == null ? -1 : lastWriteLoc.getStart(); + if ( !stoppedVCs.isEmpty() && pos > lastWritePos) { final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos); // we need the specialized merge if the site contains anything other than ref blocks diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java index ab4ce349e..3c532bda2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -315,7 +315,7 @@ public class ReferenceConfidenceVariantContextMerger { // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); + int[] perSampleIndexesOfRelevantAlleles; for ( final Genotype g : VC.getGenotypes() ) { final String name = g.getSampleName(); @@ -325,11 +325,12 @@ public class ReferenceConfidenceVariantContextMerger { final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); if (g.hasPL()) { // lazy initialization of the genotype index map by ploidy. + perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g); final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null - ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles) + ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) : genotypeIndexMapsByPloidy[ploidy]; final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; + final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; genotypeBuilder.PL(PLs).AD(AD).noGQ(); } mergedGenotypes.add(genotypeBuilder.make()); @@ -364,27 +365,55 @@ public class ReferenceConfidenceVariantContextMerger { * * @param remappedAlleles the list of alleles to evaluate * @param targetAlleles the target list of alleles - * @param position position to use for error messages + * @param position position to output error info + * @param g genotype from which targetAlleles are derived * @return non-null array of ints representing indexes */ - protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position, final Genotype g) { if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); - final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + + final int indexOfNonRef = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + + //if the refs don't match then let the non-ref allele be the most likely of the alts + //TODO: eventually it would be nice to be able to trim alleles for spanning events to see if they really do have the same ref + final boolean refsMatch = targetAlleles.get(0).equals(remappedAlleles.get(0),false); + final int indexOfBestAlt; + if (!refsMatch && g.hasPL()) { + final int[] PLs = g.getPL().clone(); + PLs[0] = Integer.MAX_VALUE; //don't pick 0/0 + final int indexOfBestAltPL = MathUtils.minElementIndex(PLs); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(indexOfBestAltPL); + indexOfBestAlt = pair.alleleIndex2; + } + else + indexOfBestAlt = indexOfNonRef; final int[] indexMapping = new int[targetAlleles.size()]; - // the reference alleles always match up (even if they don't appear to) + // the reference likelihoods should always map to each other (even if the alleles don't) indexMapping[0] = 0; // create the index mapping, using the allele whenever such a mapping doesn't exist - for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int targetNonRef = targetAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final boolean targetHasNonRef = targetNonRef != -1; + final int lastConcreteAlt = targetHasNonRef ? targetAlleles.size()-2 : targetAlleles.size()-1; + for ( int i = 1; i <= lastConcreteAlt; i++ ) { final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); - indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele; + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; + } + if (targetHasNonRef) { + if (refsMatch) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(targetNonRef)); + indexMapping[targetNonRef] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; + } + else { + indexMapping[targetNonRef] = indexOfBestAlt; + } } return indexMapping; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index 8e176089d..b98711a92 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -206,7 +206,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testBasepairResolutionInput() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("eb56760419b295651b6d54ba3ad18f52")); spec.disableShadowBCF(); executeTest("testBasepairResolutionInput", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index 08bb0b4a2..f30bbae4a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -132,7 +132,8 @@ public class VariantContextMergerUnitTest extends BaseTest { alleles1.add(Allele.create("A", true)); final List alleles2 = new ArrayList<>(1); alleles2.add(Allele.create("A", true)); - ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); + GenotypeBuilder builder = new GenotypeBuilder(); + ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make()); Assert.fail("We should have thrown an exception because the allele was not present"); } @@ -147,7 +148,9 @@ public class VariantContextMergerUnitTest extends BaseTest { if ( allelesIndex > 0 ) myAlleles.add(allAlleles.get(allelesIndex)); - final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); + GenotypeBuilder builder = new GenotypeBuilder(); + + final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make()); Assert.assertEquals(indexes.length, allAlleles.size());