Unit tests for -XL arguments; expt. annotation calculating the GC content within 100 bp of the current SNP

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2997 348d0f76-0448-11de-a6fe-93d51630548a
2010-03-14 21:08:14 +00:00 · 2010-03-14 21:08:14 +00:00 · 4dd7c5972c
parent e367a50e9b
commit 4dd7c5972c
6 changed files with 189 additions and 56 deletions
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@ -158,7 +158,22 @@ public class GenomeAnalysisEngine {
        // create the output streams
        initializeOutputStreams(my_walker, microScheduler.getOutputTracker());

-        // todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
+        initializeIntervals();
+
+        ShardStrategy shardStrategy = getShardStrategy(my_walker,
+                                                       microScheduler.getReference(),
+                                                       intervals,
+                                                       argCollection.maximumEngineIterations,
+                                                       readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
+
+        // execute the microscheduler, storing the results
+        return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
+    }
+
+    /**
+     * Setup the intervals to be processed
+     */
+    private void initializeIntervals() {
        GenomeLocSortedSet excludeIntervals = null;
        if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
            List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
@ -171,17 +186,18 @@ public class GenomeAnalysisEngine {

        if ( excludeIntervals != null ) {
            GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
-            intervals = pruneIntervals( toPrune, excludeIntervals );
+            long toPruneSize = toPrune.coveredSize();
+            long toExcludeSize = excludeIntervals.coveredSize();
+            logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
+            logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
+            intervals = toPrune.substractRegions( excludeIntervals );
+            long intervalSize = intervals.coveredSize();
+            logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
+                    toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
        }

-        ShardStrategy shardStrategy = getShardStrategy(my_walker,
-                                                       microScheduler.getReference(),
-                                                       intervals,
-                                                       argCollection.maximumEngineIterations,
-                                                       readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
-
-        // execute the microscheduler, storing the results
-        return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
+        if ( intervals != null )
+            logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
    }

    /**
@ -306,16 +322,6 @@ public class GenomeAnalysisEngine {
        return microScheduler;
    }

-    private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
-        logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
-        //for ( GenomeLoc exclude : toExclude )
-        //    toPrune.removeRegion(exclude);
-        GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
-        logger.info(String.format("done pruning intervals == now have %d", x.size()));
-
-        return x;
-    }
-
    /**
     * setup the interval regions, from either the interval file of the genome region string
     *
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
@ -0,0 +1,43 @@
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
+
+import java.util.Map;
+
+
+public class GCContent implements VariantAnnotation {
+
+    public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
+        double content = computeGCContent(ref);
+        return String.format("%.2f", content);
+    }
+
+    public String getKeyName() { return "GC"; }
+
+    public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GC", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "GC content within 20 bp +/- the variant"); }
+
+    public boolean useZeroQualityReads() { return false; }
+
+    private static double computeGCContent(ReferenceContext ref) {
+        int gc = 0, at = 0;
+
+        for ( char base : ref.getBases() ) {
+            int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
+            if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex )
+                gc++;
+            else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex )
+                at++;
+            else
+                ; // ignore
+        }
+
+        int sum = gc + at;
+        return (100.0*gc) / (sum == 0 ? 1 : sum);
+     }
+}
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@ -20,7 +20,7 @@ import java.io.*;
 */
 //@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
@Allows(value={DataSource.READS, DataSource.REFERENCE})
-@Reference(window=@Window(start=-20,stop=20))
+@Reference(window=@Window(start=-50,stop=50))
@By(DataSource.REFERENCE)
 public class VariantAnnotator extends LocusWalker<Integer, Integer> {
    @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
--- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@ -14,6 +14,12 @@ public class BaseUtils {
    public static final byte DELETION_INDEX = 4;
    public static final byte NO_CALL_INDEX = 5; // (this is 'N')

+    public static int gIndex = BaseUtils.simpleBaseToBaseIndex('G');
+    public static int cIndex = BaseUtils.simpleBaseToBaseIndex('C');
+    public static int aIndex = BaseUtils.simpleBaseToBaseIndex('A');
+    public static int tIndex = BaseUtils.simpleBaseToBaseIndex('T');
+
+
    /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
    // a pyrimidine to another pyrimidine nucleotide (C <-> T).
    // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
--- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
+++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
@ -44,6 +44,18 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
    public GenomeLocSortedSet() {
    }

+    public GenomeLocSortedSet(GenomeLoc e) {
+        this();
+        add(e);
+    }
+
+    public GenomeLocSortedSet(Collection<GenomeLoc> l) {
+        this();
+
+        for ( GenomeLoc e : l )
+            add(e);
+    }
+
    /**
     * get an iterator over this collection
     *
@ -62,6 +74,17 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
        return mArray.size();
    }

+    /**
+     * Return the size, in bp, of the genomic regions by all of the regions in this set
+     * @return size in bp of the covered regions
+     */
+    public long coveredSize() {
+        long s = 0;
+        for ( GenomeLoc e : this )
+            s += e.size();
+        return s;
+    }
+
    /**
     * determine if the collection is empty
     *
@ -180,7 +203,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
        return GenomeLocSortedSet.createSetFromList(good);
    }

-
    private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
    private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
        if (g.equals(e)) {
@ -243,39 +265,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
    }


-//    public boolean removeRegions(GenomeLocSortedSet toRemove) {
-//        int i = 0, j = 0;
-//
-//        while ( true ) {
-//            if ( i >= mArray.size() || j >= toRemove.mArray.size() )
-//                return true;
-//
-//            GenomeLoc g = mArray.get(i);
-//            GenomeLoc e = toRemove.mArray.get(j);
-//
-//            if (g.overlapsP(e)) {
-//                boolean finishEarly = removeOverlappingRegion(g, e);
-//                if ( finishEarly ) {
-//                    i++;
-//                }
-//            } else if ( g.compareContigs(e) < 0 ) {
-//                i++;        // g is on an earlier contig, move g forward
-//            } else if ( g.compareContigs(e) > 0 ) {
-//                j++;        // g is on a later contig, move e forward
-//            } else if ( g.getStop() < e.getStart() ) {
-//                i++;        // g stops before e starts, move g forward
-//            } else if ( e.getStop() < g.getStart() ) {
-//                j++;        // g starts after e stops, move e forward
-//            } else {
-//                throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
-//            }
-//
-//            if ( (i + j) % 10000 == 0 )
-//                logger.debug("removeRegions operation: i + j = " + ( i + j ));
-//        }
-//    }
-//
-
    /**
     * a simple removal of an interval contained in this list.  The interval must be identical to one in the list (no partial locations or overlapping)
     * @param location the GenomeLoc to remove
@ -373,4 +362,15 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
        return this.mArray;
    }

+    public String toString() {
+        StringBuilder s = new StringBuilder();
+        s.append("[");
+        for ( GenomeLoc e : this ) {
+            s.append(" ");
+            s.append(e.toString());
+        }
+        s.append("]");
+
+        return s.toString();
+    }
 }
--- a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java
+++ b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java
@ -9,6 +9,7 @@ import org.junit.Before;
 import org.junit.Test;

 import java.util.Iterator;
+import java.util.Arrays;

 /**
 *
@ -120,10 +121,87 @@ public class GenomeLocSortedSetTest extends BaseTest {
        assertTrue(loc.getContigIndex() == 1);
    }

+    @Test
+    public void deleteAllByRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
+        mSortedSet.add(e);
+        for (int x = 1; x < 101; x++) {
+            GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
+            mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
+        }
+        assertTrue(mSortedSet.isEmpty());
+    }

+    @Test
+    public void deleteSomeByRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
+        mSortedSet.add(e);
+        for (int x = 1; x < 50; x++) {
+            GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
+            mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
+        }
+        assertTrue(!mSortedSet.isEmpty());
+        assertTrue(mSortedSet.size() == 1);
+        GenomeLoc loc = mSortedSet.iterator().next();        
+        assertTrue(loc.getStop() == 100);
+        assertTrue(loc.getStart() == 50);

+    }

-   
+    @Test
+    public void deleteSuperRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 10, 20);
+        GenomeLoc g = GenomeLocParser.createGenomeLoc(1, 70, 100);
+        mSortedSet.add(g);
+        mSortedSet.addRegion(e);
+        assertTrue(mSortedSet.size() == 2);
+        // now delete a region
+        GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75);
+        mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d));
+        Iterator<GenomeLoc> iter = mSortedSet.iterator();
+        GenomeLoc loc = iter.next();
+        assertTrue(loc.getStart() == 10);
+        assertTrue(loc.getStop() == 14);
+        assertTrue(loc.getContigIndex() == 1);
+
+        loc = iter.next();
+        assertTrue(loc.getStart() == 76);
+        assertTrue(loc.getStop() == 100);
+        assertTrue(loc.getContigIndex() == 1);
+    }
+
+    @Test
+    public void substractComplexExample() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 20);
+        mSortedSet.add(e);
+
+        GenomeLoc r1 = GenomeLocParser.createGenomeLoc(1, 3, 5);
+        GenomeLoc r2 = GenomeLocParser.createGenomeLoc(1, 10, 12);
+        GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18);
+        GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3));
+
+        GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude);
+//        logger.debug("Initial   " + mSortedSet);
+//        logger.debug("Exclude   " + toExclude);
+//        logger.debug("Remaining " + remaining);
+
+        assertEquals(20, mSortedSet.coveredSize());
+        assertEquals(9, toExclude.coveredSize());
+        assertEquals(11, remaining.coveredSize());
+
+        Iterator<GenomeLoc> it = remaining.iterator();
+        GenomeLoc p1 = it.next();
+        GenomeLoc p2 = it.next();
+        GenomeLoc p3 = it.next();
+        GenomeLoc p4 = it.next();
+
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 1, 2), p1);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 6, 9), p2);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 13, 15), p3);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 19, 20), p4);
+    }
+
+    

    @Test
    public void fromSequenceDictionary() {