From 3060a4a15ebc340512144d45a27aa393b27282d7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 30 Nov 2011 17:05:16 -0500 Subject: [PATCH] Support for list of known CNVs in VariantEval -- VariantSummary now includes novelty of CNVs by reciprocal overlap detection using the standard variant eval -knownCNVs argument -- Genericizes loading for intervals into interval tree by chromosome -- GenomeLoc methods for reciprocal overlap detection, with unit tests --- .../varianteval/VariantEvalWalker.java | 44 +++++- .../evaluators/VariantSummary.java | 139 ++++++++++++------ .../IntervalStratification.java | 21 ++- .../broadinstitute/sting/utils/GenomeLoc.java | 25 ++++ .../org/broadinstitute/sting/BaseTest.java | 6 +- .../sting/utils/GenomeLocUnitTest.java | 61 ++++++++ 6 files changed, 233 insertions(+), 63 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 10d4651b7..1504f5baf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -1,10 +1,12 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.util.IntervalTree; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -30,6 +32,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; @@ -189,6 +192,13 @@ public class VariantEvalWalker extends RodWalker implements Tr @Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false) public IntervalBinding intervalsFile = null; + /** + * File containing tribble-readable features containing known CNVs. For use with VariantSummary table. + */ + @Input(fullName="knownCNVs", shortName="knownCNVs", doc="File containing tribble-readable features describing a known list of copy number variants", required=false) + public IntervalBinding knownCNVsFile = null; + Map> knownCNVsByContig = Collections.emptyMap(); + // Variables private Set jexlExpressions = new TreeSet(); @@ -295,6 +305,28 @@ public class VariantEvalWalker extends RodWalker implements Tr throw new ReviewedStingException(String.format("The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath())); } } + + + // initialize CNVs + if ( knownCNVsFile != null ) { + knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile); + } + } + + public final Map> createIntervalTreeByContig(final IntervalBinding intervals) { + final Map> byContig = new HashMap>(); + + final List locs = intervals.getIntervals(getToolkit()); + + // set up the map from contig -> interval tree + for ( final String contig : getContigNames() ) + byContig.put(contig, new IntervalTree()); + + for ( final GenomeLoc loc : locs ) { + byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc); + } + + return byContig; } /** @@ -549,14 +581,6 @@ public class VariantEvalWalker extends RodWalker implements Tr public Set getJexlExpressions() { return jexlExpressions; } - public List getIntervals() { - if ( intervalsFile == null ) - throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); - - return intervalsFile.getIntervals(getToolkit()); - } - - public Set getContigNames() { final TreeSet contigs = new TreeSet(); for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) { @@ -568,4 +592,8 @@ public class VariantEvalWalker extends RodWalker implements Tr public GenomeLocParser getGenomeLocParser() { return getToolkit().getGenomeLocParser(); } + + public GenomeAnalysisEngine getToolkit() { + return super.getToolkit(); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java index 503cb8ff4..b74af9f91 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -24,25 +24,38 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; +import net.sf.picard.util.IntervalTree; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.util.Collection; -import java.util.EnumMap; -import java.util.HashMap; -import java.util.Map; +import java.util.*; @Analysis(description = "1000 Genomes Phase I summary of variants table") public class VariantSummary extends VariantEvaluator implements StandardEval { + final protected static Logger logger = Logger.getLogger(VariantSummary.class); + + private final static int MAX_INDEL_LENGTH = 50; + private final static double MIN_CNV_OVERLAP = 0.5; + private VariantEvalWalker walker; + + public enum Type { + SNP, INDEL, CNV + } + + Map> knownCNVs = null; + // basic counts on various rates found @DataPoint(description = "Number of samples") public long nSamples = 0; @@ -86,10 +99,10 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { private final static String ALL = "ALL"; - private class TypeSampleMap extends EnumMap> { + private class TypeSampleMap extends EnumMap> { public TypeSampleMap(final Collection samples) { - super(VariantContext.Type.class); - for ( VariantContext.Type type : VariantContext.Type.values() ) { + super(Type.class); + for ( Type type : Type.values() ) { Map bySample = new HashMap(samples.size()); for ( final String sample : samples ) { bySample.put(sample, 0); @@ -99,16 +112,16 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { } } - public final void inc(final VariantContext.Type type, final String sample) { + public final void inc(final Type type, final String sample) { final int count = this.get(type).get(sample); get(type).put(sample, count + 1); } - public final int all(VariantContext.Type type) { + public final int all(Type type) { return get(type).get(ALL); } - public final int meanValue(VariantContext.Type type) { + public final int meanValue(Type type) { long sum = 0; int n = 0; for ( final Map.Entry pair : get(type).entrySet() ) { @@ -120,7 +133,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { return (int)(Math.round(sum / (1.0 * n))); } - public final double ratioValue(VariantContext.Type type, TypeSampleMap denoms, boolean allP) { + public final double ratioValue(Type type, TypeSampleMap denoms, boolean allP) { double sum = 0; int n = 0; for ( final String sample : get(type).keySet() ) { @@ -137,6 +150,8 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { public void initialize(VariantEvalWalker walker) { + this.walker = walker; + nSamples = walker.getSampleNamesForEvaluation().size(); countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); transitionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); @@ -144,6 +159,13 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { allVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); knownVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); depthPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + + if ( walker.knownCNVsFile != null ) { + knownCNVs = walker.createIntervalTreeByContig(walker.knownCNVsFile); + final List locs = walker.knownCNVsFile.getIntervals(walker.getToolkit()); + logger.info(String.format("Creating known CNV list %s containing %d intervals covering %d bp", + walker.knownCNVsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); + } } @Override public boolean enabled() { return true; } @@ -156,44 +178,77 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); } + private final Type getType(VariantContext vc) { + switch (vc.getType()) { + case SNP: + return Type.SNP; + case INDEL: + for ( int l : vc.getIndelLengths() ) + if ( l > MAX_INDEL_LENGTH ) + return Type.CNV; + return Type.INDEL; + case SYMBOLIC: + return Type.CNV; + default: + throw new UserException.BadInput("Unexpected variant context type: " + vc); + } + } + + private final boolean overlapsKnownCNV(VariantContext cnv) { + final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true); + IntervalTree intervalTree = knownCNVs.get(loc.getContig()); + + final Iterator> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop()); + while ( nodeIt.hasNext() ) { + final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue()); + if ( overlapP > MIN_CNV_OVERLAP ) + return true; + } + + return false; + } + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( eval == null || eval.isMonomorphicInSamples() ) return null; + final Type type = getType(eval); + TypeSampleMap titvTable = null; - switch (eval.getType()) { - case SNP: - titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample; - titvTable.inc(eval.getType(), ALL); - case INDEL: - case SYMBOLIC: - allVariantCounts.inc(eval.getType(), ALL); - if ( comp != null ) - knownVariantCounts.inc(eval.getType(), ALL); - if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) ) - depthPerSample.inc(eval.getType(), ALL); - break; - default: - throw new UserException.BadInput("Unexpected variant context type: " + eval); + // update DP, if possible + if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(type, ALL); + + // update counts + allVariantCounts.inc(type, ALL); + + // type specific calculations + if ( type == Type.SNP ) { + titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample; + titvTable.inc(type, ALL); } + // novelty calculation + if ( comp != null || (type == Type.CNV && overlapsKnownCNV(eval))) + knownVariantCounts.inc(type, ALL); + // per sample metrics for (final Genotype g : eval.getGenotypes()) { if ( ! g.isNoCall() && ! g.isHomRef() ) { - countsPerSample.inc(eval.getType(), g.getSampleName()); + countsPerSample.inc(type, g.getSampleName()); // update transition / transversion ratio - if ( titvTable != null ) titvTable.inc(eval.getType(), g.getSampleName()); + if ( titvTable != null ) titvTable.inc(type, g.getSampleName()); if ( g.hasAttribute(VCFConstants.DEPTH_KEY) ) - depthPerSample.inc(eval.getType(), g.getSampleName()); + depthPerSample.inc(type, g.getSampleName()); } } return null; // we don't capture any interesting sites } - private final String noveltyRate(VariantContext.Type type) { + private final String noveltyRate(Type type) { final int all = allVariantCounts.all(type); final int known = knownVariantCounts.all(type); final int novel = all - known; @@ -202,22 +257,22 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { } public void finalizeEvaluation() { - nSNPs = allVariantCounts.all(VariantContext.Type.SNP); - nIndels = allVariantCounts.all(VariantContext.Type.INDEL); - nSVs = allVariantCounts.all(VariantContext.Type.SYMBOLIC); + nSNPs = allVariantCounts.all(Type.SNP); + nIndels = allVariantCounts.all(Type.INDEL); + nSVs = allVariantCounts.all(Type.CNV); - TiTvRatio = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, true); - TiTvRatioPerSample = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, false); + TiTvRatio = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, true); + TiTvRatioPerSample = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, false); - nSNPsPerSample = countsPerSample.meanValue(VariantContext.Type.SNP); - nIndelsPerSample = countsPerSample.meanValue(VariantContext.Type.INDEL); - nSVsPerSample = countsPerSample.meanValue(VariantContext.Type.SYMBOLIC); + nSNPsPerSample = countsPerSample.meanValue(Type.SNP); + nIndelsPerSample = countsPerSample.meanValue(Type.INDEL); + nSVsPerSample = countsPerSample.meanValue(Type.CNV); - SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); - IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); - SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); + SNPNoveltyRate = noveltyRate(Type.SNP); + IndelNoveltyRate = noveltyRate(Type.INDEL); + SVNoveltyRate = noveltyRate(Type.CNV); - SNPDPPerSample = depthPerSample.meanValue(VariantContext.Type.SNP); - IndelDPPerSample = depthPerSample.meanValue(VariantContext.Type.INDEL); + SNPDPPerSample = depthPerSample.meanValue(Type.SNP); + IndelDPPerSample = depthPerSample.meanValue(Type.INDEL); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java index 00a656cc6..d91422a7e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IntervalStratification.java @@ -54,26 +54,23 @@ import java.util.*; */ public class IntervalStratification extends VariantStratifier { final protected static Logger logger = Logger.getLogger(IntervalStratification.class); - final Map> intervalTreeByContig = new HashMap>(); + Map> intervalTreeByContig = null; @Override public void initialize() { - final List locs = getVariantEvalWalker().getIntervals(); + if ( getVariantEvalWalker().intervalsFile == null ) + throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); + + final List locs = getVariantEvalWalker().intervalsFile.getIntervals(getVariantEvalWalker().getToolkit()); if ( locs.isEmpty() ) throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?"); + intervalTreeByContig = getVariantEvalWalker().createIntervalTreeByContig(getVariantEvalWalker().intervalsFile); + logger.info(String.format("Creating IntervalStratification %s containing %d intervals covering %d bp", getVariantEvalWalker().intervalsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); - // set up the map from contig -> interval tree - for ( final String contig : getVariantEvalWalker().getContigNames() ) - intervalTreeByContig.put(contig, new IntervalTree()); - - for ( final GenomeLoc loc : locs ) { - intervalTreeByContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), true); - } - states = new ArrayList(Arrays.asList("all", "overlaps.intervals", "outside.intervals")); } @@ -82,8 +79,8 @@ public class IntervalStratification extends VariantStratifier { if (eval != null) { final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true); - IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); - IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); + IntervalTree intervalTree = intervalTreeByContig.get(loc.getContig()); + IntervalTree.Node node = intervalTree.minOverlapper(loc.getStart(), loc.getStop()); //logger.info(String.format("Overlap %s found %s", loc, node)); relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index c1479bc69..345161416 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -440,4 +440,29 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return stop - start + 1; } + /** + * reciprocialOverlap: what is the min. percent of gl1 and gl2 covered by both + * + * gl1.s ---------- gk1.e + * gl2.s ---------- gl2.e + * 100% + * + * gl1.s ---------- gk1.e + * gl2.s ---------- gl2.e + * 50% + * + * gl1.s ---------- gk1.e + * gl2.s -------------------- gl2.e + * 25% (50% for gl1 but only 25% for gl2) + */ + public final double reciprocialOverlapFraction(final GenomeLoc o) { + if ( overlapsP(o) ) + return Math.min(overlapPercent(this, o), overlapPercent(o, this)); + else + return 0.0; + } + + private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) { + return (1.0 * gl1.intersect(gl2).size()) / gl1.size(); + } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index f99a105ae..5032a7810 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -134,7 +134,7 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map> tests = new HashMap>(); - private final String name; + private String name; /** * Create a new TestDataProvider instance bound to the class variable C @@ -151,6 +151,10 @@ public abstract class BaseTest { this(c, ""); } + public void setName(final String name) { + this.name = name; + } + /** * Return all of the data providers in the form expected by TestNG of type class C * @param c diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 29c085b70..49778a4d8 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; @@ -150,4 +151,64 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(twoUnmappedMixed.size(),2,"Wrong number of elements in list."); Assert.assertEquals(twoUnmappedMixed,Arrays.asList(chr1,unmapped),"List sorted in wrong order"); } + + // ------------------------------------------------------------------------------------- + // + // testing overlap detection + // + // ------------------------------------------------------------------------------------- + + private class ReciprocalOverlapProvider extends TestDataProvider { + GenomeLoc gl1, gl2; + int overlapSize; + double overlapFraction; + + private ReciprocalOverlapProvider(int start1, int stop1, int start2, int stop2) { + super(ReciprocalOverlapProvider.class); + gl1 = genomeLocParser.createGenomeLoc("chr1", start1, stop1); + gl2 = genomeLocParser.createGenomeLoc("chr1", start2, stop2); + + int shared = 0; + for ( int i = start1; i <= stop1; i++ ) { + if ( i >= start2 && i <= stop2 ) + shared++; + } + + this.overlapSize = shared; + this.overlapFraction = Math.min((1.0*shared)/gl1.size(), (1.0*shared)/gl2.size()); + super.setName(String.format("%d-%d / %d-%d overlap=%d / %.2f", start1, stop1, start2, stop2, overlapSize, overlapFraction)); + } + } + + @DataProvider(name = "ReciprocalOverlapProvider") + public Object[][] makeReciprocalOverlapProvider() { + for ( int start1 = 1; start1 <= 10; start1++ ) { + for ( int stop1 = start1; stop1 <= 10; stop1++ ) { + new ReciprocalOverlapProvider(start1, stop1, 1, 10); + new ReciprocalOverlapProvider(start1, stop1, 5, 10); + new ReciprocalOverlapProvider(start1, stop1, 5, 7); + new ReciprocalOverlapProvider(start1, stop1, 5, 15); + new ReciprocalOverlapProvider(start1, stop1, 11, 20); + + new ReciprocalOverlapProvider(1, 10, start1, stop1); + new ReciprocalOverlapProvider(5, 10, start1, stop1); + new ReciprocalOverlapProvider(5, 7, start1, stop1); + new ReciprocalOverlapProvider(5, 15, start1, stop1); + new ReciprocalOverlapProvider(11, 20, start1, stop1); + } + } + + return ReciprocalOverlapProvider.getTests(ReciprocalOverlapProvider.class); + } + + @Test(dataProvider = "ReciprocalOverlapProvider") + public void testReciprocalOverlapProvider(ReciprocalOverlapProvider cfg) { + if ( cfg.overlapSize == 0 ) { + Assert.assertFalse(cfg.gl1.overlapsP(cfg.gl2)); + } else { + Assert.assertTrue(cfg.gl1.overlapsP(cfg.gl2)); + Assert.assertEquals(cfg.gl1.intersect(cfg.gl2).size(), cfg.overlapSize); + Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); + } + } }