Support for list of known CNVs in VariantEval

-- VariantSummary now includes novelty of CNVs by reciprocal overlap detection using the standard variant eval -knownCNVs argument
-- Genericizes loading for intervals into interval tree by chromosome
-- GenomeLoc methods for reciprocal overlap detection, with unit tests
This commit is contained in:
Mark DePristo 2011-11-30 17:05:16 -05:00
parent 28b286ad39
commit 3060a4a15e
6 changed files with 233 additions and 63 deletions

View File

@ -1,10 +1,12 @@
package org.broadinstitute.sting.gatk.walkers.varianteval;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.util.IntervalTree;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -30,6 +32,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
@ -189,6 +192,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Input(fullName="stratIntervals", shortName="stratIntervals", doc="File containing tribble-readable features for the IntervalStratificiation", required=false)
public IntervalBinding<Feature> intervalsFile = null;
/**
* File containing tribble-readable features containing known CNVs. For use with VariantSummary table.
*/
@Input(fullName="knownCNVs", shortName="knownCNVs", doc="File containing tribble-readable features describing a known list of copy number variants", required=false)
public IntervalBinding<Feature> knownCNVsFile = null;
Map<String, IntervalTree<GenomeLoc>> knownCNVsByContig = Collections.emptyMap();
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
@ -295,6 +305,28 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
throw new ReviewedStingException(String.format("The ancestral alignments file, '%s', could not be found", ancestralAlignmentsFile.getAbsolutePath()));
}
}
// initialize CNVs
if ( knownCNVsFile != null ) {
knownCNVsByContig = createIntervalTreeByContig(knownCNVsFile);
}
}
public final Map<String, IntervalTree<GenomeLoc>> createIntervalTreeByContig(final IntervalBinding<Feature> intervals) {
final Map<String, IntervalTree<GenomeLoc>> byContig = new HashMap<String, IntervalTree<GenomeLoc>>();
final List<GenomeLoc> locs = intervals.getIntervals(getToolkit());
// set up the map from contig -> interval tree
for ( final String contig : getContigNames() )
byContig.put(contig, new IntervalTree<GenomeLoc>());
for ( final GenomeLoc loc : locs ) {
byContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), loc);
}
return byContig;
}
/**
@ -549,14 +581,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; }
public List<GenomeLoc> getIntervals() {
if ( intervalsFile == null )
throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled");
return intervalsFile.getIntervals(getToolkit());
}
public Set<String> getContigNames() {
final TreeSet<String> contigs = new TreeSet<String>();
for( final SAMSequenceRecord r : getToolkit().getReferenceDataSource().getReference().getSequenceDictionary().getSequences()) {
@ -568,4 +592,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public GenomeLocParser getGenomeLocParser() {
return getToolkit().getGenomeLocParser();
}
public GenomeAnalysisEngine getToolkit() {
return super.getToolkit();
}
}

View File

@ -24,25 +24,38 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import net.sf.picard.util.IntervalTree;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Collection;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
@Analysis(description = "1000 Genomes Phase I summary of variants table")
public class VariantSummary extends VariantEvaluator implements StandardEval {
final protected static Logger logger = Logger.getLogger(VariantSummary.class);
private final static int MAX_INDEL_LENGTH = 50;
private final static double MIN_CNV_OVERLAP = 0.5;
private VariantEvalWalker walker;
public enum Type {
SNP, INDEL, CNV
}
Map<String, IntervalTree<GenomeLoc>> knownCNVs = null;
// basic counts on various rates found
@DataPoint(description = "Number of samples")
public long nSamples = 0;
@ -86,10 +99,10 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
private final static String ALL = "ALL";
private class TypeSampleMap extends EnumMap<VariantContext.Type, Map<String, Integer>> {
private class TypeSampleMap extends EnumMap<Type, Map<String, Integer>> {
public TypeSampleMap(final Collection<String> samples) {
super(VariantContext.Type.class);
for ( VariantContext.Type type : VariantContext.Type.values() ) {
super(Type.class);
for ( Type type : Type.values() ) {
Map<String, Integer> bySample = new HashMap<String, Integer>(samples.size());
for ( final String sample : samples ) {
bySample.put(sample, 0);
@ -99,16 +112,16 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
}
}
public final void inc(final VariantContext.Type type, final String sample) {
public final void inc(final Type type, final String sample) {
final int count = this.get(type).get(sample);
get(type).put(sample, count + 1);
}
public final int all(VariantContext.Type type) {
public final int all(Type type) {
return get(type).get(ALL);
}
public final int meanValue(VariantContext.Type type) {
public final int meanValue(Type type) {
long sum = 0;
int n = 0;
for ( final Map.Entry<String, Integer> pair : get(type).entrySet() ) {
@ -120,7 +133,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
return (int)(Math.round(sum / (1.0 * n)));
}
public final double ratioValue(VariantContext.Type type, TypeSampleMap denoms, boolean allP) {
public final double ratioValue(Type type, TypeSampleMap denoms, boolean allP) {
double sum = 0;
int n = 0;
for ( final String sample : get(type).keySet() ) {
@ -137,6 +150,8 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
public void initialize(VariantEvalWalker walker) {
this.walker = walker;
nSamples = walker.getSampleNamesForEvaluation().size();
countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation());
transitionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation());
@ -144,6 +159,13 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
allVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation());
knownVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation());
depthPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation());
if ( walker.knownCNVsFile != null ) {
knownCNVs = walker.createIntervalTreeByContig(walker.knownCNVsFile);
final List<GenomeLoc> locs = walker.knownCNVsFile.getIntervals(walker.getToolkit());
logger.info(String.format("Creating known CNV list %s containing %d intervals covering %d bp",
walker.knownCNVsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs)));
}
}
@Override public boolean enabled() { return true; }
@ -156,44 +178,77 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1);
}
private final Type getType(VariantContext vc) {
switch (vc.getType()) {
case SNP:
return Type.SNP;
case INDEL:
for ( int l : vc.getIndelLengths() )
if ( l > MAX_INDEL_LENGTH )
return Type.CNV;
return Type.INDEL;
case SYMBOLIC:
return Type.CNV;
default:
throw new UserException.BadInput("Unexpected variant context type: " + vc);
}
}
private final boolean overlapsKnownCNV(VariantContext cnv) {
final GenomeLoc loc = walker.getGenomeLocParser().createGenomeLoc(cnv, true);
IntervalTree<GenomeLoc> intervalTree = knownCNVs.get(loc.getContig());
final Iterator<IntervalTree.Node<GenomeLoc>> nodeIt = intervalTree.overlappers(loc.getStart(), loc.getStop());
while ( nodeIt.hasNext() ) {
final double overlapP = loc.reciprocialOverlapFraction(nodeIt.next().getValue());
if ( overlapP > MIN_CNV_OVERLAP )
return true;
}
return false;
}
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( eval == null || eval.isMonomorphicInSamples() ) return null;
final Type type = getType(eval);
TypeSampleMap titvTable = null;
switch (eval.getType()) {
case SNP:
titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample;
titvTable.inc(eval.getType(), ALL);
case INDEL:
case SYMBOLIC:
allVariantCounts.inc(eval.getType(), ALL);
if ( comp != null )
knownVariantCounts.inc(eval.getType(), ALL);
if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) )
depthPerSample.inc(eval.getType(), ALL);
break;
default:
throw new UserException.BadInput("Unexpected variant context type: " + eval);
// update DP, if possible
if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) )
depthPerSample.inc(type, ALL);
// update counts
allVariantCounts.inc(type, ALL);
// type specific calculations
if ( type == Type.SNP ) {
titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample;
titvTable.inc(type, ALL);
}
// novelty calculation
if ( comp != null || (type == Type.CNV && overlapsKnownCNV(eval)))
knownVariantCounts.inc(type, ALL);
// per sample metrics
for (final Genotype g : eval.getGenotypes()) {
if ( ! g.isNoCall() && ! g.isHomRef() ) {
countsPerSample.inc(eval.getType(), g.getSampleName());
countsPerSample.inc(type, g.getSampleName());
// update transition / transversion ratio
if ( titvTable != null ) titvTable.inc(eval.getType(), g.getSampleName());
if ( titvTable != null ) titvTable.inc(type, g.getSampleName());
if ( g.hasAttribute(VCFConstants.DEPTH_KEY) )
depthPerSample.inc(eval.getType(), g.getSampleName());
depthPerSample.inc(type, g.getSampleName());
}
}
return null; // we don't capture any interesting sites
}
private final String noveltyRate(VariantContext.Type type) {
private final String noveltyRate(Type type) {
final int all = allVariantCounts.all(type);
final int known = knownVariantCounts.all(type);
final int novel = all - known;
@ -202,22 +257,22 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
}
public void finalizeEvaluation() {
nSNPs = allVariantCounts.all(VariantContext.Type.SNP);
nIndels = allVariantCounts.all(VariantContext.Type.INDEL);
nSVs = allVariantCounts.all(VariantContext.Type.SYMBOLIC);
nSNPs = allVariantCounts.all(Type.SNP);
nIndels = allVariantCounts.all(Type.INDEL);
nSVs = allVariantCounts.all(Type.CNV);
TiTvRatio = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, true);
TiTvRatioPerSample = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, false);
TiTvRatio = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, true);
TiTvRatioPerSample = transitionsPerSample.ratioValue(Type.SNP, transversionsPerSample, false);
nSNPsPerSample = countsPerSample.meanValue(VariantContext.Type.SNP);
nIndelsPerSample = countsPerSample.meanValue(VariantContext.Type.INDEL);
nSVsPerSample = countsPerSample.meanValue(VariantContext.Type.SYMBOLIC);
nSNPsPerSample = countsPerSample.meanValue(Type.SNP);
nIndelsPerSample = countsPerSample.meanValue(Type.INDEL);
nSVsPerSample = countsPerSample.meanValue(Type.CNV);
SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP);
IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL);
SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC);
SNPNoveltyRate = noveltyRate(Type.SNP);
IndelNoveltyRate = noveltyRate(Type.INDEL);
SVNoveltyRate = noveltyRate(Type.CNV);
SNPDPPerSample = depthPerSample.meanValue(VariantContext.Type.SNP);
IndelDPPerSample = depthPerSample.meanValue(VariantContext.Type.INDEL);
SNPDPPerSample = depthPerSample.meanValue(Type.SNP);
IndelDPPerSample = depthPerSample.meanValue(Type.INDEL);
}
}

View File

@ -54,26 +54,23 @@ import java.util.*;
*/
public class IntervalStratification extends VariantStratifier {
final protected static Logger logger = Logger.getLogger(IntervalStratification.class);
final Map<String, IntervalTree<Boolean>> intervalTreeByContig = new HashMap<String, IntervalTree<Boolean>>();
Map<String, IntervalTree<GenomeLoc>> intervalTreeByContig = null;
@Override
public void initialize() {
final List<GenomeLoc> locs = getVariantEvalWalker().getIntervals();
if ( getVariantEvalWalker().intervalsFile == null )
throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled");
final List<GenomeLoc> locs = getVariantEvalWalker().intervalsFile.getIntervals(getVariantEvalWalker().getToolkit());
if ( locs.isEmpty() )
throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?");
intervalTreeByContig = getVariantEvalWalker().createIntervalTreeByContig(getVariantEvalWalker().intervalsFile);
logger.info(String.format("Creating IntervalStratification %s containing %d intervals covering %d bp",
getVariantEvalWalker().intervalsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs)));
// set up the map from contig -> interval tree
for ( final String contig : getVariantEvalWalker().getContigNames() )
intervalTreeByContig.put(contig, new IntervalTree<Boolean>());
for ( final GenomeLoc loc : locs ) {
intervalTreeByContig.get(loc.getContig()).put(loc.getStart(), loc.getStop(), true);
}
states = new ArrayList<String>(Arrays.asList("all", "overlaps.intervals", "outside.intervals"));
}
@ -82,8 +79,8 @@ public class IntervalStratification extends VariantStratifier {
if (eval != null) {
final GenomeLoc loc = getVariantEvalWalker().getGenomeLocParser().createGenomeLoc(eval, true);
IntervalTree<Boolean> intervalTree = intervalTreeByContig.get(loc.getContig());
IntervalTree.Node<Boolean> node = intervalTree.minOverlapper(loc.getStart(), loc.getStop());
IntervalTree<GenomeLoc> intervalTree = intervalTreeByContig.get(loc.getContig());
IntervalTree.Node<GenomeLoc> node = intervalTree.minOverlapper(loc.getStart(), loc.getStop());
//logger.info(String.format("Overlap %s found %s", loc, node));
relevantStates.add( node != null ? "overlaps.intervals" : "outside.intervals");
}

View File

@ -440,4 +440,29 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
return stop - start + 1;
}
/**
* reciprocialOverlap: what is the min. percent of gl1 and gl2 covered by both
*
* gl1.s ---------- gk1.e
* gl2.s ---------- gl2.e
* 100%
*
* gl1.s ---------- gk1.e
* gl2.s ---------- gl2.e
* 50%
*
* gl1.s ---------- gk1.e
* gl2.s -------------------- gl2.e
* 25% (50% for gl1 but only 25% for gl2)
*/
public final double reciprocialOverlapFraction(final GenomeLoc o) {
if ( overlapsP(o) )
return Math.min(overlapPercent(this, o), overlapPercent(o, this));
else
return 0.0;
}
private final static double overlapPercent(final GenomeLoc gl1, final GenomeLoc gl2) {
return (1.0 * gl1.intersect(gl2).size()) / gl1.size();
}
}

View File

@ -134,7 +134,7 @@ public abstract class BaseTest {
*/
public static class TestDataProvider {
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
private final String name;
private String name;
/**
* Create a new TestDataProvider instance bound to the class variable C
@ -151,6 +151,10 @@ public abstract class BaseTest {
this(c, "");
}
public void setName(final String name) {
this.name = name;
}
/**
* Return all of the data providers in the form expected by TestNG of type class C
* @param c

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
@ -150,4 +151,64 @@ public class GenomeLocUnitTest extends BaseTest {
Assert.assertEquals(twoUnmappedMixed.size(),2,"Wrong number of elements in list.");
Assert.assertEquals(twoUnmappedMixed,Arrays.asList(chr1,unmapped),"List sorted in wrong order");
}
// -------------------------------------------------------------------------------------
//
// testing overlap detection
//
// -------------------------------------------------------------------------------------
private class ReciprocalOverlapProvider extends TestDataProvider {
GenomeLoc gl1, gl2;
int overlapSize;
double overlapFraction;
private ReciprocalOverlapProvider(int start1, int stop1, int start2, int stop2) {
super(ReciprocalOverlapProvider.class);
gl1 = genomeLocParser.createGenomeLoc("chr1", start1, stop1);
gl2 = genomeLocParser.createGenomeLoc("chr1", start2, stop2);
int shared = 0;
for ( int i = start1; i <= stop1; i++ ) {
if ( i >= start2 && i <= stop2 )
shared++;
}
this.overlapSize = shared;
this.overlapFraction = Math.min((1.0*shared)/gl1.size(), (1.0*shared)/gl2.size());
super.setName(String.format("%d-%d / %d-%d overlap=%d / %.2f", start1, stop1, start2, stop2, overlapSize, overlapFraction));
}
}
@DataProvider(name = "ReciprocalOverlapProvider")
public Object[][] makeReciprocalOverlapProvider() {
for ( int start1 = 1; start1 <= 10; start1++ ) {
for ( int stop1 = start1; stop1 <= 10; stop1++ ) {
new ReciprocalOverlapProvider(start1, stop1, 1, 10);
new ReciprocalOverlapProvider(start1, stop1, 5, 10);
new ReciprocalOverlapProvider(start1, stop1, 5, 7);
new ReciprocalOverlapProvider(start1, stop1, 5, 15);
new ReciprocalOverlapProvider(start1, stop1, 11, 20);
new ReciprocalOverlapProvider(1, 10, start1, stop1);
new ReciprocalOverlapProvider(5, 10, start1, stop1);
new ReciprocalOverlapProvider(5, 7, start1, stop1);
new ReciprocalOverlapProvider(5, 15, start1, stop1);
new ReciprocalOverlapProvider(11, 20, start1, stop1);
}
}
return ReciprocalOverlapProvider.getTests(ReciprocalOverlapProvider.class);
}
@Test(dataProvider = "ReciprocalOverlapProvider")
public void testReciprocalOverlapProvider(ReciprocalOverlapProvider cfg) {
if ( cfg.overlapSize == 0 ) {
Assert.assertFalse(cfg.gl1.overlapsP(cfg.gl2));
} else {
Assert.assertTrue(cfg.gl1.overlapsP(cfg.gl2));
Assert.assertEquals(cfg.gl1.intersect(cfg.gl2).size(), cfg.overlapSize);
Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction);
}
}
}