From 4dd7c5972c402ca77b802fe6a42f1399c9e87e5a Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 14 Mar 2010 21:08:14 +0000
Subject: [PATCH] Unit tests for -XL arguments; expt. annotation calculating
 the GC content within 100 bp of the current SNP

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2997 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../sting/gatk/GenomeAnalysisEngine.java      | 46 ++++++-----
 .../gatk/walkers/annotator/GCContent.java     | 43 ++++++++++
 .../walkers/annotator/VariantAnnotator.java   |  2 +-
 .../broadinstitute/sting/utils/BaseUtils.java |  6 ++
 .../sting/utils/GenomeLocSortedSet.java       | 68 ++++++++--------
 .../sting/utils/GenomeLocSortedSetTest.java   | 80 ++++++++++++++++++-
 6 files changed, 189 insertions(+), 56 deletions(-)
 create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 0874e9f5e..b4c0febec 100755
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -158,7 +158,22 @@ public class GenomeAnalysisEngine {
         // create the output streams
         initializeOutputStreams(my_walker, microScheduler.getOutputTracker());
 
-        // todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
+        initializeIntervals();
+
+        ShardStrategy shardStrategy = getShardStrategy(my_walker,
+                                                       microScheduler.getReference(),
+                                                       intervals,
+                                                       argCollection.maximumEngineIterations,
+                                                       readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
+
+        // execute the microscheduler, storing the results
+        return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
+    }
+
+    /**
+     * Setup the intervals to be processed
+     */
+    private void initializeIntervals() {
         GenomeLocSortedSet excludeIntervals = null;
         if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
             List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
@@ -171,17 +186,18 @@ public class GenomeAnalysisEngine {
 
         if ( excludeIntervals != null ) {
             GenomeLocSortedSet toPrune = intervals == null ? GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : intervals;
-            intervals = pruneIntervals( toPrune, excludeIntervals );
+            long toPruneSize = toPrune.coveredSize();
+            long toExcludeSize = excludeIntervals.coveredSize();
+            logger.info(String.format("Initial include intervals cover %d bases", toPruneSize));
+            logger.info(String.format("Initial exclude intervals cover %d bases", toExcludeSize));
+            intervals = toPrune.substractRegions( excludeIntervals );
+            long intervalSize = intervals.coveredSize();
+            logger.info(String.format("Excluding %d bases from original intervals (%.2f%% reduction)",
+                    toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
         }
 
-        ShardStrategy shardStrategy = getShardStrategy(my_walker,
-                                                       microScheduler.getReference(),
-                                                       intervals,
-                                                       argCollection.maximumEngineIterations,
-                                                       readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
-
-        // execute the microscheduler, storing the results
-        return microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
+        if ( intervals != null )
+            logger.info(String.format("Processing %d bases in intervals", intervals.coveredSize()));
     }
 
     /**
@@ -306,16 +322,6 @@ public class GenomeAnalysisEngine {
         return microScheduler;
     }
 
-    private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
-        logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
-        //for ( GenomeLoc exclude : toExclude )
-        //    toPrune.removeRegion(exclude);
-        GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
-        logger.info(String.format("done pruning intervals == now have %d", x.size()));
-
-        return x;
-    }
-
     /**
      * setup the interval regions, from either the interval file of the genome region string
      *
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
new file mode 100755
index 000000000..fd0d70e59
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
@@ -0,0 +1,43 @@
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
+
+import java.util.Map;
+
+
+public class GCContent implements VariantAnnotation {
+
+    public String annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
+        double content = computeGCContent(ref);
+        return String.format("%.2f", content);
+    }
+
+    public String getKeyName() { return "GC"; }
+
+    public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("GC", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "GC content within 20 bp +/- the variant"); }
+
+    public boolean useZeroQualityReads() { return false; }
+
+    private static double computeGCContent(ReferenceContext ref) {
+        int gc = 0, at = 0;
+
+        for ( char base : ref.getBases() ) {
+            int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
+            if ( baseIndex == BaseUtils.gIndex || baseIndex == BaseUtils.cIndex )
+                gc++;
+            else if ( baseIndex == BaseUtils.aIndex || baseIndex == BaseUtils.tIndex )
+                at++;
+            else
+                ; // ignore
+        }
+
+        int sum = gc + at;
+        return (100.0*gc) / (sum == 0 ? 1 : sum);
+     }
+}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index af94b0aab..56a055938 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -20,7 +20,7 @@ import java.io.*;
  */
 //@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariationRod.class))
 @Allows(value={DataSource.READS, DataSource.REFERENCE})
-@Reference(window=@Window(start=-20,stop=20))
+@Reference(window=@Window(start=-50,stop=50))
 @By(DataSource.REFERENCE)
 public class VariantAnnotator extends LocusWalker<Integer, Integer> {
     @Argument(fullName="vcfOutput", shortName="vcf", doc="VCF file to which all variants should be written with annotations", required=true)
diff --git a/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/java/src/org/broadinstitute/sting/utils/BaseUtils.java
index 4d7f1ac41..6b9ff4284 100644
--- a/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@@ -14,6 +14,12 @@ public class BaseUtils {
     public static final byte DELETION_INDEX = 4;
     public static final byte NO_CALL_INDEX = 5; // (this is 'N')
 
+    public static int gIndex = BaseUtils.simpleBaseToBaseIndex('G');
+    public static int cIndex = BaseUtils.simpleBaseToBaseIndex('C');
+    public static int aIndex = BaseUtils.simpleBaseToBaseIndex('A');
+    public static int tIndex = BaseUtils.simpleBaseToBaseIndex('T');
+
+
     /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
     // a pyrimidine to another pyrimidine nucleotide (C <-> T).
     // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions.
diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
index d45e5e693..6369f5d81 100755
--- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
+++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
@@ -44,6 +44,18 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
     public GenomeLocSortedSet() {
     }
 
+    public GenomeLocSortedSet(GenomeLoc e) {
+        this();
+        add(e);
+    }
+
+    public GenomeLocSortedSet(Collection<GenomeLoc> l) {
+        this();
+
+        for ( GenomeLoc e : l )
+            add(e);
+    }
+
     /**
      * get an iterator over this collection
      *
@@ -62,6 +74,17 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
         return mArray.size();
     }
 
+    /**
+     * Return the size, in bp, of the genomic regions by all of the regions in this set
+     * @return size in bp of the covered regions
+     */
+    public long coveredSize() {
+        long s = 0;
+        for ( GenomeLoc e : this )
+            s += e.size();
+        return s;
+    }
+
     /**
      * determine if the collection is empty
      *
@@ -180,7 +203,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
         return GenomeLocSortedSet.createSetFromList(good);
     }
 
-
     private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
     private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
         if (g.equals(e)) {
@@ -243,39 +265,6 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
     }
 
 
-//    public boolean removeRegions(GenomeLocSortedSet toRemove) {
-//        int i = 0, j = 0;
-//
-//        while ( true ) {
-//            if ( i >= mArray.size() || j >= toRemove.mArray.size() )
-//                return true;
-//
-//            GenomeLoc g = mArray.get(i);
-//            GenomeLoc e = toRemove.mArray.get(j);
-//
-//            if (g.overlapsP(e)) {
-//                boolean finishEarly = removeOverlappingRegion(g, e);
-//                if ( finishEarly ) {
-//                    i++;
-//                }
-//            } else if ( g.compareContigs(e) < 0 ) {
-//                i++;        // g is on an earlier contig, move g forward
-//            } else if ( g.compareContigs(e) > 0 ) {
-//                j++;        // g is on a later contig, move e forward
-//            } else if ( g.getStop() < e.getStart() ) {
-//                i++;        // g stops before e starts, move g forward
-//            } else if ( e.getStop() < g.getStart() ) {
-//                j++;        // g starts after e stops, move e forward
-//            } else {
-//                throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
-//            }
-//
-//            if ( (i + j) % 10000 == 0 )
-//                logger.debug("removeRegions operation: i + j = " + ( i + j ));
-//        }
-//    }
-//
-
     /**
      * a simple removal of an interval contained in this list.  The interval must be identical to one in the list (no partial locations or overlapping)
      * @param location the GenomeLoc to remove
@@ -373,4 +362,15 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
         return this.mArray;
     }
 
+    public String toString() {
+        StringBuilder s = new StringBuilder();
+        s.append("[");
+        for ( GenomeLoc e : this ) {
+            s.append(" ");
+            s.append(e.toString());
+        }
+        s.append("]");
+
+        return s.toString();
+    }
 }
diff --git a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java
index 7b61c419a..f7805be19 100755
--- a/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java
+++ b/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetTest.java
@@ -9,6 +9,7 @@ import org.junit.Before;
 import org.junit.Test;
 
 import java.util.Iterator;
+import java.util.Arrays;
 
 /**
  *
@@ -120,10 +121,87 @@ public class GenomeLocSortedSetTest extends BaseTest {
         assertTrue(loc.getContigIndex() == 1);
     }
 
+    @Test
+    public void deleteAllByRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
+        mSortedSet.add(e);
+        for (int x = 1; x < 101; x++) {
+            GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
+            mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
+        }
+        assertTrue(mSortedSet.isEmpty());
+    }
 
+    @Test
+    public void deleteSomeByRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 100);
+        mSortedSet.add(e);
+        for (int x = 1; x < 50; x++) {
+            GenomeLoc del = GenomeLocParser.createGenomeLoc(1,x,x);
+            mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(del));
+        }
+        assertTrue(!mSortedSet.isEmpty());
+        assertTrue(mSortedSet.size() == 1);
+        GenomeLoc loc = mSortedSet.iterator().next();        
+        assertTrue(loc.getStop() == 100);
+        assertTrue(loc.getStart() == 50);
 
+    }
 
-   
+    @Test
+    public void deleteSuperRegion() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 10, 20);
+        GenomeLoc g = GenomeLocParser.createGenomeLoc(1, 70, 100);
+        mSortedSet.add(g);
+        mSortedSet.addRegion(e);
+        assertTrue(mSortedSet.size() == 2);
+        // now delete a region
+        GenomeLoc d = GenomeLocParser.createGenomeLoc(1, 15, 75);
+        mSortedSet = mSortedSet.substractRegions(new GenomeLocSortedSet(d));
+        Iterator<GenomeLoc> iter = mSortedSet.iterator();
+        GenomeLoc loc = iter.next();
+        assertTrue(loc.getStart() == 10);
+        assertTrue(loc.getStop() == 14);
+        assertTrue(loc.getContigIndex() == 1);
+
+        loc = iter.next();
+        assertTrue(loc.getStart() == 76);
+        assertTrue(loc.getStop() == 100);
+        assertTrue(loc.getContigIndex() == 1);
+    }
+
+    @Test
+    public void substractComplexExample() {
+        GenomeLoc e = GenomeLocParser.createGenomeLoc(1, 1, 20);
+        mSortedSet.add(e);
+
+        GenomeLoc r1 = GenomeLocParser.createGenomeLoc(1, 3, 5);
+        GenomeLoc r2 = GenomeLocParser.createGenomeLoc(1, 10, 12);
+        GenomeLoc r3 = GenomeLocParser.createGenomeLoc(1, 16, 18);
+        GenomeLocSortedSet toExclude = new GenomeLocSortedSet(Arrays.asList(r1, r2, r3));
+
+        GenomeLocSortedSet remaining = mSortedSet.substractRegions(toExclude);
+//        logger.debug("Initial   " + mSortedSet);
+//        logger.debug("Exclude   " + toExclude);
+//        logger.debug("Remaining " + remaining);
+
+        assertEquals(20, mSortedSet.coveredSize());
+        assertEquals(9, toExclude.coveredSize());
+        assertEquals(11, remaining.coveredSize());
+
+        Iterator<GenomeLoc> it = remaining.iterator();
+        GenomeLoc p1 = it.next();
+        GenomeLoc p2 = it.next();
+        GenomeLoc p3 = it.next();
+        GenomeLoc p4 = it.next();
+
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 1, 2), p1);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 6, 9), p2);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 13, 15), p3);
+        assertEquals(GenomeLocParser.createGenomeLoc(1, 19, 20), p4);
+    }
+
+    
 
     @Test
     public void fromSequenceDictionary() {