From 4dd7c5972c402ca77b802fe6a42f1399c9e87e5a Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 14 Mar 2010 21:08:14 +0000 Subject: [PATCH] Unit tests for -XL arguments; expt. annotation calculating the GC content within 100 bp of the current SNP git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2997 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 46 ++++++----- .../gatk/walkers/annotator/GCContent.java | 43 ++++++++++ .../walkers/annotator/VariantAnnotator.java | 2 +- .../broadinstitute/sting/utils/BaseUtils.java | 6 ++ .../sting/utils/GenomeLocSortedSet.java | 68 ++++++++-------- .../sting/utils/GenomeLocSortedSetTest.java | 80 ++++++++++++++++++- 6 files changed, 189 insertions(+), 56 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 0874e9f5e..b4c0febec 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -158,7 +158,22 @@ public class GenomeAnalysisEngine { // create the output streams initializeOutputStreams(my_walker, microScheduler.getOutputTracker()); - // todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided + initializeIntervals(); + + ShardStrategy shardStrategy = getShardStrategy(my_walker, + microScheduler.getReference(), + intervals, + argCollection.maximumEngineIterations, + readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + + // execute the microscheduler, storing the results + return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations); + } + + /** + * Setup the intervals to be processed + */ + private void initializeIntervals() { GenomeLocSortedSet excludeIntervals = null; if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) { List rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL); @@ -171,17 +186,18 @@ public class GenomeAnalysisEngine { if ( excludeIntervals != null ) { GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals; - intervals = pruneIntervals( toPrune, excludeIntervals ); + long toPruneSize = toPrune.coveredSize(); + long toExcludeSize = excludeIntervals.coveredSize(); + logger.info(String.format("Initial include intervals cover %d bases", toPruneSize)); + logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize)); + intervals = toPrune.substractRegions( excludeIntervals ); + long intervalSize = intervals.coveredSize(); + logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); } - ShardStrategy shardStrategy = getShardStrategy(my_walker, - microScheduler.getReference(), - intervals, - argCollection.maximumEngineIterations, - readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); - - // execute the microscheduler, storing the results - return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations); + if ( intervals != null ) + logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize())); } /** @@ -306,16 +322,6 @@ public class GenomeAnalysisEngine { return microScheduler; } - private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) { - logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size())); - //for ( GenomeLoc exclude : toExclude ) - // toPrune.removeRegion(exclude); - GenomeLocSortedSet x = toPrune.substractRegions(toExclude); - logger.info(String.format("done pruning intervals == now have %d", x.size())); - - return x; - } - /** * setup the interval regions, from either the interval file of the genome region string * diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java new file mode 100755 index 000000000..fd0d70e59 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; + +import java.util.Map; + + +public class GCContent implements VariantAnnotation { + + public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + double content = computeGCContent(ref); + return String.format("%.2f", content); + } + + public String getKeyName() { return "GC"; } + + public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GC", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "GC content within 20 bp +/- the variant"); } + + public boolean useZeroQualityReads() { return false; } + + private static double computeGCContent(ReferenceContext ref) { + int gc = 0, at = 0; + + for ( char base : ref.getBases() ) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex ) + gc++; + else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex ) + at++; + else + ; // ignore + } + + int sum = gc + at; + return (100.0*gc) / (sum == 0 ? 1 : sum); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index af94b0aab..56a055938 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -20,7 +20,7 @@ import java.io.*; */ //@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) -@Reference(window=@Window(start=-20,stop=20)) +@Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) public class VariantAnnotator extends LocusWalker { @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true) diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 4d7f1ac41..6b9ff4284 100644 --- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -14,6 +14,12 @@ public class BaseUtils { public static final byte DELETION_INDEX = 4; public static final byte NO_CALL_INDEX = 5; // (this is 'N') + public static int gIndex = BaseUtils.simpleBaseToBaseIndex('G'); + public static int cIndex = BaseUtils.simpleBaseToBaseIndex('C'); + public static int aIndex = BaseUtils.simpleBaseToBaseIndex('A'); + public static int tIndex = BaseUtils.simpleBaseToBaseIndex('T'); + + /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions. diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index d45e5e693..6369f5d81 100755 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -44,6 +44,18 @@ public class GenomeLocSortedSet extends AbstractSet { public GenomeLocSortedSet() { } + public GenomeLocSortedSet(GenomeLoc e) { + this(); + add(e); + } + + public GenomeLocSortedSet(Collection l) { + this(); + + for ( GenomeLoc e : l ) + add(e); + } + /** * get an iterator over this collection * @@ -62,6 +74,17 @@ public class GenomeLocSortedSet extends AbstractSet { return mArray.size(); } + /** + * Return the size, in bp, of the genomic regions by all of the regions in this set + * @return size in bp of the covered regions + */ + public long coveredSize() { + long s = 0; + for ( GenomeLoc e : this ) + s += e.size(); + return s; + } + /** * determine if the collection is empty * @@ -180,7 +203,6 @@ public class GenomeLocSortedSet extends AbstractSet { return GenomeLocSortedSet.createSetFromList(good); } - private static final List EMPTY_LIST = new ArrayList(); private List subtractRegion(GenomeLoc g, GenomeLoc e) { if (g.equals(e)) { @@ -243,39 +265,6 @@ public class GenomeLocSortedSet extends AbstractSet { } -// public boolean removeRegions(GenomeLocSortedSet toRemove) { -// int i = 0, j = 0; -// -// while ( true ) { -// if ( i >= mArray.size() || j >= toRemove.mArray.size() ) -// return true; -// -// GenomeLoc g = mArray.get(i); -// GenomeLoc e = toRemove.mArray.get(j); -// -// if (g.overlapsP(e)) { -// boolean finishEarly = removeOverlappingRegion(g, e); -// if ( finishEarly ) { -// i++; -// } -// } else if ( g.compareContigs(e) < 0 ) { -// i++; // g is on an earlier contig, move g forward -// } else if ( g.compareContigs(e) > 0 ) { -// j++; // g is on a later contig, move e forward -// } else if ( g.getStop() < e.getStart() ) { -// i++; // g stops before e starts, move g forward -// } else if ( e.getStop() < g.getStart() ) { -// j++; // g starts after e stops, move e forward -// } else { -// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e); -// } -// -// if ( (i + j) % 10000 == 0 ) -// logger.debug("removeRegions operation: i + j = " + ( i + j )); -// } -// } -// - /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) * @param location the GenomeLoc to remove @@ -373,4 +362,15 @@ public class GenomeLocSortedSet extends AbstractSet { return this.mArray; } + public String toString() { + StringBuilder s = new StringBuilder(); + s.append("["); + for ( GenomeLoc e : this ) { + s.append(" "); + s.append(e.toString()); + } + s.append("]"); + + return s.toString(); + } } diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java index 7b61c419a..f7805be19 100755 --- a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java +++ b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java @@ -9,6 +9,7 @@ import org.junit.Before; import org.junit.Test; import java.util.Iterator; +import java.util.Arrays; /** * @@ -120,10 +121,87 @@ public class GenomeLocSortedSetTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void deleteAllByRegion() { + GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100); + mSortedSet.add(e); + for (int x = 1; x < 101; x++) { + GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x); + mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del)); + } + assertTrue(mSortedSet.isEmpty()); + } + @Test + public void deleteSomeByRegion() { + GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100); + mSortedSet.add(e); + for (int x = 1; x < 50; x++) { + GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x); + mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del)); + } + assertTrue(!mSortedSet.isEmpty()); + assertTrue(mSortedSet.size() == 1); + GenomeLoc loc = mSortedSet.iterator().next(); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getStart() == 50); + } - + @Test + public void deleteSuperRegion() { + GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 10, 20); + GenomeLoc g = GenomeLocParser.createGenomeLoc(1, 70, 100); + mSortedSet.add(g); + mSortedSet.addRegion(e); + assertTrue(mSortedSet.size() == 2); + // now delete a region + GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75); + mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d)); + Iterator iter = mSortedSet.iterator(); + GenomeLoc loc = iter.next(); + assertTrue(loc.getStart() == 10); + assertTrue(loc.getStop() == 14); + assertTrue(loc.getContigIndex() == 1); + + loc = iter.next(); + assertTrue(loc.getStart() == 76); + assertTrue(loc.getStop() == 100); + assertTrue(loc.getContigIndex() == 1); + } + + @Test + public void substractComplexExample() { + GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 20); + mSortedSet.add(e); + + GenomeLoc r1 = GenomeLocParser.createGenomeLoc(1, 3, 5); + GenomeLoc r2 = GenomeLocParser.createGenomeLoc(1, 10, 12); + GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18); + GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3)); + + GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude); +// logger.debug("Initial " + mSortedSet); +// logger.debug("Exclude " + toExclude); +// logger.debug("Remaining " + remaining); + + assertEquals(20, mSortedSet.coveredSize()); + assertEquals(9, toExclude.coveredSize()); + assertEquals(11, remaining.coveredSize()); + + Iterator it = remaining.iterator(); + GenomeLoc p1 = it.next(); + GenomeLoc p2 = it.next(); + GenomeLoc p3 = it.next(); + GenomeLoc p4 = it.next(); + + assertEquals(GenomeLocParser.createGenomeLoc(1, 1, 2), p1); + assertEquals(GenomeLocParser.createGenomeLoc(1, 6, 9), p2); + assertEquals(GenomeLocParser.createGenomeLoc(1, 13, 15), p3); + assertEquals(GenomeLocParser.createGenomeLoc(1, 19, 20), p4); + } + + @Test public void fromSequenceDictionary() {