From 5d19fca6490db8e07a1634074d30094576019859 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 11 Sep 2012 23:01:00 -0400 Subject: [PATCH 1/2] A couple of bug-fixy changes. 1) SelectVariants could throw a ReviewedStingException (one of the nasty "Bug:") ones if the user requested a sample that wasn't present in the VCF. The walker now checks for this in the initialize() phase, and throws a more informative error if the situation is detected. If the user simply wants to subset the VCF to all the samples requested that are actually present in the VCF, the --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES flag changes this UserException to a Warning, and does the appropriate subsetting. Added integration tests for this. 2) GenotypeLikelihoods has an unsafe method getLog10GQ(GenotypeType), which is completely broken for multi-allelic sites. I marked that method as deprecated, and added methods that use the context of the allele ordering (either directly specified or as a VC) to retrieve the appropriate GQ, and added a unit test to cover this case. VariantsToBinaryPed needs to dynamically calculate the GQ field sometimes (because I have some VCFs with PLs but no GQ). --- .../walkers/variantutils/SelectVariants.java | 29 +++++++++++++++++-- .../variantutils/VariantsToBinaryPed.java | 15 +++++++++- .../variantcontext/GenotypeLikelihoods.java | 27 +++++++++++++++++ .../SelectVariantsIntegrationTest.java | 29 +++++++++++++++++++ .../GenotypeLikelihoodsUnitTest.java | 26 +++++++++++++++++ 5 files changed, 122 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 3d14308b6..7bad19775 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; @@ -325,6 +326,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(doc="indel size select",required=false,fullName="maxIndelSize") private int maxIndelSize = Integer.MAX_VALUE; + @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -386,10 +390,29 @@ public class SelectVariants extends RodWalker implements TreeR Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); - // first, add any requested samples - samples.addAll(samplesFromFile); - samples.addAll(samplesFromExpressions); + // first, check overlap between requested and present samples + Set commandLineUniqueSamples = new HashSet(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); + commandLineUniqueSamples.addAll(samplesFromFile); + commandLineUniqueSamples.addAll(samplesFromExpressions); + commandLineUniqueSamples.addAll(sampleNames); + commandLineUniqueSamples.removeAll(vcfSamples); + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { + logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); + samplesFromFile.removeAll(commandLineUniqueSamples); + samplesFromExpressions.retainAll(commandLineUniqueSamples); + } else if (commandLineUniqueSamples.size() > 0 ) { + throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", + "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", + "A list of these samples:", + Utils.join(",",commandLineUniqueSamples), + "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); + } + + // second, add the requested samples samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + samples.removeAll(commandLineUniqueSamples); // if none were requested, we want all of them if ( samples.isEmpty() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 2e6a80462..6bc6153df 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -278,7 +281,7 @@ public class VariantsToBinaryPed extends RodWalker { private byte getFlippedEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { + if ( ! checkGQIsGood(g) ) { b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_VAR; @@ -293,6 +296,16 @@ public class VariantsToBinaryPed extends RodWalker { return (byte) (b << (2*offset)); } + private boolean checkGQIsGood(Genotype genotype) { + if ( genotype.hasGQ() ) { + return genotype.getGQ() >= minGenotypeQuality; + } else if ( genotype.hasLikelihoods() ) { + return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + } + + return false; + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7b4256b70..641eb5449 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.EnumMap; +import java.util.List; public class GenotypeLikelihoods { private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; @@ -167,10 +168,36 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype + + /** + * This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context. + * Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List) in place of it. + * + * If you **know** you're biallelic, use getGQLog10FromLikelihoods directly. + * @param genotype - actually a genotype type (no call, hom ref, het, hom var) + * @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best). + */ + @Deprecated public double getLog10GQ(GenotypeType genotype){ return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); } + @Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"}) + private double getLog10GQ(List genotypeAlleles,List contextAlleles) { + int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0)); + int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1)); + int plIndex = calculatePLindex(allele1Index,allele2Index); + return getGQLog10FromLikelihoods(plIndex,getAsVector()); + } + + public double getLog10GQ(Genotype genotype, List vcAlleles ) { + return getLog10GQ(genotype.getAlleles(),vcAlleles); + } + + public double getLog10GQ(Genotype genotype, VariantContext context) { + return getLog10GQ(genotype,context.getAlleles()); + } + public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ if(likelihoods == null) return Double.NEGATIVE_INFINITY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 77e29f87b..ffd9c9b4a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -70,6 +70,20 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + ); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + @Test public void testNonExistingFieldSelection() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -98,6 +112,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleExclusion--" + testfile, spec); } + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + @Test public void testConcordance() { diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 69f42e1f9..4ce32cee7 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext; // the imports for unit testing. +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; +import java.util.Arrays; import java.util.EnumMap; +import java.util.List; /** @@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest { double [] v = new double[]{-10.5, -1.25, -5.11}; final static String vGLString = "-10.50,-1.25,-5.11"; final static String vPLString = "93,0,39"; + double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC @Test public void testFromVector2() { @@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest { } } + // this test is completely broken, the method is wrong. + public void testGetQualFromLikelihoodsMultiAllelicBroken() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + double actualGQ = gl.getLog10GQ(GenotypeType.HET); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + + public void testGetQualFromLikelihoodsMultiAllelic() { + GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); + Allele ref = Allele.create(BaseUtils.A,true); + Allele alt1 = Allele.create(BaseUtils.C); + Allele alt2 = Allele.create(BaseUtils.T); + List allAlleles = Arrays.asList(ref,alt1,alt2); + List gtAlleles = Arrays.asList(alt1,alt2); + GenotypeBuilder gtBuilder = new GenotypeBuilder(); + gtBuilder.alleles(gtAlleles); + double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles); + double expectedGQ = 1.6; + Assert.assertEquals(actualGQ,expectedGQ); + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) { From 96be1cbea9eefb1d6bd5797c7979aae84a26d04c Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 12 Sep 2012 10:11:06 -0400 Subject: [PATCH 2/2] My own integration test isn't passing with a clean checkout. This fix to the walker ought to do it. --- .../walkers/variantutils/SelectVariants.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 7bad19775..9664a5bde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -396,10 +396,17 @@ public class SelectVariants extends RodWalker implements TreeR commandLineUniqueSamples.addAll(samplesFromExpressions); commandLineUniqueSamples.addAll(sampleNames); commandLineUniqueSamples.removeAll(vcfSamples); + + // second, add the requested samples + samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + + logger.debug(Utils.join(",",commandLineUniqueSamples)); + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); - samplesFromFile.removeAll(commandLineUniqueSamples); - samplesFromExpressions.retainAll(commandLineUniqueSamples); + samples.removeAll(commandLineUniqueSamples); } else if (commandLineUniqueSamples.size() > 0 ) { throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", @@ -408,11 +415,6 @@ public class SelectVariants extends RodWalker implements TreeR "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); } - // second, add the requested samples - samples.addAll(sampleNames); - samples.addAll(samplesFromExpressions); - samples.addAll(samplesFromFile); - samples.removeAll(commandLineUniqueSamples); // if none were requested, we want all of them if ( samples.isEmpty() ) {