From 0f3af9555b3b939eb56cd875fc8e0c6a1231c5d2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 1 May 2012 14:58:06 -0400 Subject: [PATCH 1/5] Adding an option to SelectVariants which allows the user to re-genotype through the exact model (if PLs are present) the samples in order to recalculate the QUAL and genotypes. This is really the correct way to select a subset of samples, especially when originally called from low coverage data. Also added integration test to cover this case. --- .../walkers/variantutils/SelectVariants.java | 72 ++++++++++++++----- .../SelectVariantsIntegrationTest.java | 13 ++++ 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 42a40cde5..184dfc78b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -33,6 +33,9 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -235,6 +238,16 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false) protected boolean EXCLUDE_FILTERED = false; + /** + * This argument triggers re-genotyping of the selected samples through the Exact calculation model. Note that this is truly the + * mathematically correct way to select samples (especially when calls were generated from low coverage sequencing data); using the + * hard genotypes to select (i.e. the default mode of SelectVariants) can lead to false positives when errors are confused for variants + * in the original genotyping. We decided not to set the --regenotype option as the default though as the output can be unexpected if + * a user is strictly comparing against the original genotypes (GTs) in the file. + */ + @Argument(fullName="regenotype", shortName="regenotype", doc="re-genotype the selected samples based on their GLs (or PLs)", required=false) + protected Boolean REGENOTYPE = false; + private UnifiedGenotyperEngine UG_engine = null; /** * When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf. @@ -430,6 +443,13 @@ public class SelectVariants extends RodWalker implements TreeR SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); + if ( REGENOTYPE ) { + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; + UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + UAC.NO_SLOD = true; + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + } /** load in the IDs file to a hashset for matching */ if ( rsIDFile != null ) { @@ -509,7 +529,14 @@ public class SelectVariants extends RodWalker implements TreeR continue; VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS); - if ( (sub.isPolymorphicInSamples() || !EXCLUDE_NON_VARIANTS) && (!sub.isFiltered() || !EXCLUDE_FILTERED) ) { + + if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { + final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull()); + addAnnotations(builder, sub); + sub = builder.make(); + } + + if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { boolean failedJexlMatch = false; for ( VariantContextUtils.JexlVCMatchExp jexl : jexls ) { if ( !VariantContextUtils.match(sub, jexl) ) { @@ -531,6 +558,14 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private boolean hasPLs(final VariantContext vc) { + for ( Genotype g : vc.getGenotypes() ) { + if ( g.hasLikelihoods() ) + return true; + } + return false; + } + /** * Checks if vc has a variant call for (at least one of) the samples. * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap) @@ -682,9 +717,26 @@ public class SelectVariants extends RodWalker implements TreeR builder.genotypes(newGC); + addAnnotations(builder, sub); + + return builder.make(); + } + + private void addAnnotations(final VariantContextBuilder builder, final VariantContext originalVC) { + if (KEEP_ORIGINAL_CHR_COUNTS) { + if ( originalVC.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) + builder.attribute("AC_Orig", originalVC.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); + if ( originalVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) + builder.attribute("AF_Orig", originalVC.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); + if ( originalVC.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) + builder.attribute("AN_Orig", originalVC.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); + } + + VariantContextUtils.calculateChromosomeCounts(builder, false); + int depth = 0; - for (String sample : sub.getSampleNames()) { - Genotype g = sub.getGenotype(sample); + for (String sample : originalVC.getSampleNames()) { + Genotype g = originalVC.getGenotype(sample); if ( g.isNotFiltered() ) { @@ -694,21 +746,7 @@ public class SelectVariants extends RodWalker implements TreeR } } } - - - if (KEEP_ORIGINAL_CHR_COUNTS) { - if ( sub.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) - builder.attribute("AC_Orig", sub.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); - if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) - builder.attribute("AF_Orig", sub.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)); - if ( sub.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) - builder.attribute("AN_Orig", sub.getAttribute(VCFConstants.ALLELE_NUMBER_KEY)); - } - - VariantContextUtils.calculateChromosomeCounts(builder, false); builder.attribute("DP", depth); - - return builder.make(); } private void randomlyAddVariant(int rank, VariantContext vc) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 973588cf0..f969d8752 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -116,6 +116,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testUsingDbsnpName--" + testFile, spec); } + @Test + public void testRegenotype() { + String testFile = validationDataLocation + "combine.3.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("0fd8e52bdcd1f4b921d8fb5c689f196a") + ); + + executeTest("testRegenotype--" + testFile, spec); + } + @Test public void testMultipleRecordsAtOnePosition() { String testFile = validationDataLocation + "selectVariants.onePosition.vcf"; From 20a0078f23727079aa1d9bd5e6a1ea021a2f4531 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 1 May 2012 15:51:36 -0400 Subject: [PATCH 2/5] Merging active regions across shard boundries if they are contiguous, have the same active status and don't grow too big. --- .../gatk/traversals/TraverseActiveRegions.java | 17 +++++++++++++++-- .../gatk/walkers/annotator/HaplotypeScore.java | 2 +- .../utils/activeregion/ActivityProfile.java | 4 +++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 76c1ce8c5..6d0ec0e7c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -28,7 +28,7 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); @Override @@ -112,7 +112,20 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process - workQueue.addAll( activeRegions ); + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); // now go and process all of the active regions diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 6abfdc7d2..f2e919b92 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -376,7 +376,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } } - // indel likelihoods are stric log-probs, not phred scored + // indel likelihoods are strict log-probs, not phred scored double overallScore = 0.0; for (final double[] readHaplotypeScores : haplotypeScores) { overallScore += MathUtils.arrayMin(readHaplotypeScores); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 70593bbed..4333e471e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -158,7 +158,9 @@ public class ActivityProfile { // find the best place to break up the large active region Double minProb = Double.MAX_VALUE; int cutPoint = -1; - for( int iii = curStart + 50; iii < curEnd - 50; iii++ ) { // BUGBUG: assumes maxRegionSize >> 50 + + final int size = curEnd - curStart + 1; + for( int iii = curStart + (int)(size*0.25); iii < curEnd - (int)(size*0.25); iii++ ) { if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; } } final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); From fc55dcec3c0fe2ed72fcd924d9065202d56013fe Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 1 May 2012 16:02:36 -0400 Subject: [PATCH 4/5] Unfortunately the reverse trimming of alleles still doesn't work with mixed records in some corner cases. Turning it off for now. --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index caa3a6b6b..a1d02b27f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -419,7 +419,7 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). - if ( myAlleles.size() != vc.getAlleles().size() ) + if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); if ( annotationEngine != null && !limitedContext && rawContext.hasBasePileup() ) {