From 6ebcba523431415c3b8fade94775ee3b66fb08c1 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Wed, 24 Sep 2014 08:41:12 -0400 Subject: [PATCH 2/2] New walker to combine data for different formats of same sample that were called and VQSRed together; has functionality to combine only specified samples, omitting others (e.g. combine the uniquified NA12878s with -usn NA12878.variant51 -usn NA12878.variant102) GenotypeGVCFs now has the ability to unique-ify samples so I can genotype together two different datasets containing the same sample Modify InbreedingCoeff so that it works when genotyping uniquified samples --- .../walkers/annotator/InbreedingCoeff.java | 22 +++++++++ .../walkers/variantutils/CombineGVCFs.java | 2 +- .../walkers/variantutils/GenotypeGVCFs.java | 41 +++++++++++++++-- ...ferenceConfidenceVariantContextMerger.java | 27 ++++++----- .../GenotypeGVCFsIntegrationTest.java | 17 +++++++ .../VariantContextMergerUnitTest.java | 45 ++++++++++++++----- 6 files changed, 127 insertions(+), 27 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index ef541b023..7a9a123ed 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -92,6 +92,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private Set founderIds; private int sampleCount; private boolean pedigreeCheckWarningLogged = false; + private boolean didUniquifiedSampleNameCheck = false; @Override public Map annotate(final RefMetaDataTracker tracker, @@ -104,6 +105,11 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno if(founderIds == null && walker != null) { founderIds = ((Walker) walker).getSampleDB().getFounderIds(); } + //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders + if (!didUniquifiedSampleNameCheck) { + checkSampleNames(vc); + didUniquifiedSampleNameCheck = true; + } if ( founderIds == null || founderIds.isEmpty() ) { if ( !pedigreeCheckWarningLogged ) { logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line."); @@ -181,6 +187,22 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } + //this method is intended to reconcile uniquified sample names + // it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds + // is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different) + // sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated. + protected void checkSampleNames(final VariantContext vc) { + Set vcSamples = new HashSet<>(); + vcSamples.addAll(vc.getSampleNames()); + if (!vcSamples.isEmpty()) { + if (founderIds!=null) { + vcSamples.removeAll(founderIds); + if (vcSamples.equals(vc.getSampleNames())) + founderIds = vc.getSampleNames(); + } + } + } + @Override public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index 60d88739d..674006661 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -293,7 +293,7 @@ public class CombineGVCFs extends RodWalker variantCollection : variantCollections ) + for ( final RodBindingCollection variantCollection : variantCollections ) { variants.addAll(variantCollection.getRodBindings()); + if (uniquifySamples) { + for (final RodBinding rb : variantCollection.getRodBindings()) { + //are inputs passed in with -V:fileTag ? + if (!rb.getTags().isEmpty()) inputsAreTagged = true; + } + } + } + //RodBinding tags are used in sample uniquification + if (inputsAreTagged) + logger.warn("Output uniquified VCF may not be suitable for input to CombineSampleData because input VCF(s) contain tags."); final GenomeAnalysisEngine toolkit = getToolkit(); final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants); - final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + final GATKVariantContextUtils.GenotypeMergeType mergeType; + if(uniquifySamples) { + mergeType = GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY; + } + else + mergeType = GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE; + + final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, mergeType)); // create the genotyping engine genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), GeneralPloidyFailOverAFCalculatorProvider.createThreadSafeProvider(toolkit, genotypeArgs, logger), toolkit.getArguments().BAQMode); @@ -201,7 +230,7 @@ public class GenotypeGVCFs extends RodWalker allele from the merged VC + * @param samplesAreUniquified if true, sample names have been uniquified * @return new VariantContext representing the merge of all VCs or null if it not relevant */ - public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { + public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele, + final boolean samplesAreUniquified) { // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs if ( VCs == null || VCs.size() == 0 ) return null; @@ -133,7 +135,7 @@ public class ReferenceConfidenceVariantContextMerger { final VariantContext vc = pair.getFirst(); final List remappedAlleles = pair.getSecond(); - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified); // special case DP (add it up) for all events if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { @@ -303,15 +305,17 @@ public class ReferenceConfidenceVariantContextMerger { * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. * - * @param mergedGenotypes the genotypes context to add to - * @param VC the Variant Context for the sample - * @param remappedAlleles the list of remapped alleles for the sample - * @param targetAlleles the list of target alleles + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + * @param samplesAreUniquified true if sample names have been uniquified */ private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, final VariantContext VC, final List remappedAlleles, - final List targetAlleles) { + final List targetAlleles, + final boolean samplesAreUniquified) { final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. @@ -320,11 +324,14 @@ public class ReferenceConfidenceVariantContextMerger { int[] perSampleIndexesOfRelevantAlleles; for ( final Genotype g : VC.getGenotypes() ) { - final String name = g.getSampleName(); - if ( mergedGenotypes.containsSample(name) ) - continue; + final String name; + if (samplesAreUniquified) + name = g.getSampleName() + "." + VC.getSource(); + else + name = g.getSampleName(); final int ploidy = g.getPloidy(); final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + genotypeBuilder.name(name); if (g.hasPL()) { // lazy initialization of the genotype index map by ploidy. perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index e0ee04ab1..bdd7cdbf8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -261,6 +261,23 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { Assert.assertTrue(gVCFList.containsAll(outputVCFList)); } + @Test + public void testUniquifiedSamples() throws IOException { + //two copies of 5 samples; will also test InbreedingCoeff calculation for uniquified samples + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample1B " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample2B " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:combined1 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + + " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + + " --uniquifySamples", b37KGReference), + 1, + Arrays.asList("ef6a96d57434bb63935fa7d9f012da9f")); + executeTest("testUniquifiedSamples", spec); + + } + /** * Returns a list of attribute values from a VCF file * diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java index 9c75bcb68..578543652 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -99,8 +99,9 @@ public class VariantContextMergerUnitTest extends BaseTest { } @Test(dataProvider = "referenceConfidenceMergeData") - public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { - final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true); + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, + final boolean returnSiteEvenIfMonomorphic, final boolean uniquifySamples, final VariantContext expectedResult) { + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true, uniquifySamples); if ( result == null ) { Assert.assertTrue(expectedResult == null); return; @@ -171,6 +172,7 @@ public class VariantContextMergerUnitTest extends BaseTest { final int start = 10; final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCbase2 = new VariantContextBuilder("test2", "20", start, start, Arrays.asList(Aref)).make(); final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; @@ -183,26 +185,34 @@ public class VariantContextMergerUnitTest extends BaseTest { final List A_ALT = Arrays.asList(Aref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); final List AA_ALT = Arrays.asList(AAref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); final List A_C_ALT = Arrays.asList(Aref, C, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C = new VariantContextBuilder(VCbase2).alleles(A_C_ALT).genotypes(gA_C).make(); final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G = new VariantContextBuilder(VCbase2).alleles(A_C_G_ALT).genotypes(gA_C_G).make(); final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); final List AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); @@ -210,40 +220,40 @@ public class VariantContextMergerUnitTest extends BaseTest { // first test the case of a single record tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); // now, test pairs: // a SNP with another SNP tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); // a SNP with an indel tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); // a SNP with 2 SNPs tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); // a SNP with a ref record tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); // spanning records: // a SNP with a spanning ref record tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); // a SNP with a spanning deletion tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); // combination of all tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), - loc, false, + loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), @@ -255,12 +265,23 @@ public class VariantContextMergerUnitTest extends BaseTest { // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), - loc, false, + loc, false, false, null}); tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), - loc, true, + loc, true, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + // test uniquification of sample names + tests.add(new Object[]{"test10",Arrays.asList(vcA_C, vcA_C_ALT), loc, false, true, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes( + new GenotypeBuilder("A_C.test2").PL(new int[]{30, 20, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C.test").PL(new int[]{30, 20, 10}).alleles(noCalls).make()).make()}); + + tests.add(new Object[]{"test11",Arrays.asList(vcA_C_G, vcA_C_G_ALT), loc, false, true, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes( + new GenotypeBuilder("A_C_G.test2").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G.test").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make()).make()}); + final Object[][] result = tests.toArray(new Object[][]{}); return result; }