Merge branch 'master' of ssh://gsa1.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-09-19 12:24:12 -04:00 · 2011-09-19 12:24:12 -04:00 · 095f75ff7d
parent 85626e7a5d 034b868588
commit 095f75ff7d
12 changed files with 245 additions and 169 deletions
--- a/public/R/queueJobReport.R
+++ b/public/R/queueJobReport.R
@ -12,14 +12,14 @@ if ( onCMDLine ) {
  inputFileName = args[1]
  outputPDF = args[2]
 } else {
-  #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
-  inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
+  inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
+  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
  outputPDF = NA
 }

-RUNTIME_UNITS = "(sec)"
-ORIGINAL_UNITS_TO_SECONDS = 1/1000
+RUNTIME_UNITS = "(hours)"
+ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60

 # 
 # Helper function to aggregate all of the jobs in the report across all tables
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
 #
 # Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
 #
-plotJobsGantt <- function(gatkReport, sortOverall) {
+plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
  allJobs = allJobsFromReport(gatkReport)
  if ( sortOverall ) {
    title = "All jobs, by analysis, by start time"
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
  }
  allJobs$index = 1:nrow(allJobs)
  minTime = min(allJobs$startTime)
-  allJobs$relStartTime = allJobs$startTime - minTime
-  allJobs$relDoneTime = allJobs$doneTime - minTime
+  allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
+  allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
  allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
  maxRelTime = max(allJobs$relDoneTime)
  p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
-  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
-  p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
+  p <- p + theme_bw()
+  p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
+  if ( includeText )
+    p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
  p <- p + xlim(0, maxRelTime * 1.1)
  p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
-  p <- p + ylab("Job")
+  p <- p + ylab("Job number")
  p <- p + opts(title=title)
  print(p)
 }
@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
  pdf(outputPDF, height=8.5, width=11)
 } 

-plotJobsGantt(gatkReportData, T)
-plotJobsGantt(gatkReportData, F)
+plotJobsGantt(gatkReportData, T, F)
+plotJobsGantt(gatkReportData, F, F)
 plotProgressByTime(gatkReportData)
 for ( group in gatkReportData ) {
 plotGroup(group)
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@ -293,15 +293,16 @@ public class GATKRunReport {
     * That is, postReport() is guarenteed not to fail for any reason.
     */
    private File postReportToLocalDisk(File rootDir) {
+        String filename = getID() + ".report.xml.gz";
+        File file = new File(rootDir, filename);
        try {
-            String filename = getID() + ".report.xml.gz";
-            File file = new File(rootDir, filename);
            postReportToFile(file);
            logger.debug("Wrote report to " + file);
            return file;
        } catch ( Exception e ) {
            // we catch everything, and no matter what eat the error
            exceptDuringRunReport("Couldn't read report file", e);
+            file.delete();
            return null;
        }
    }
@ -312,6 +313,7 @@ public class GATKRunReport {
        File localFile = postReportToLocalDisk(new File("./"));
        logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
        if ( localFile != null ) { // we succeeded in creating the local file
+            localFile.deleteOnExit();
            try {
                // stop us from printing the annoying, and meaningless, mime types warning
                Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@ -336,14 +338,13 @@ public class GATKRunReport {
                //logger.info("Uploading " + localFile + " to AWS bucket");
                S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
                logger.debug("Uploaded to AWS: " + s3Object);
+                logger.info("Uploaded run statistics report to AWS S3");
            } catch ( S3ServiceException e ) {
                exceptDuringRunReport("S3 exception occurred", e);
            } catch ( NoSuchAlgorithmException e ) {
                exceptDuringRunReport("Couldn't calculate MD5", e);
            } catch ( IOException e ) {
                exceptDuringRunReport("Couldn't read report file", e);
-            } finally {
-                localFile.delete();
            }
        }
    }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java
@ -63,12 +63,9 @@ import java.util.*;
 * <h2>Input</h2>
 * <p>
 * One or more bam files (with proper headers) to be analyzed for coverage statistics
+ * (Optional) A REFSEQ Rod to aggregate coverage to the gene level
 * </p>
- * <p>
- *(Optional) A REFSEQ Rod to aggregate coverage to the gene level
- * <p>
- * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
- *</p></p>
+ *
 * <h2>Output</h2>
 * <p>
 * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
@ -96,7 +93,7 @@ import java.util.*;
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
- *   -T DepthOfCoverage \
+ *   -T VariantEval \
 *   -o file_name_base \
 *   -I input_bams.list
 *   [-geneList refSeq.sorted.txt] \
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
        getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
        getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
        getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
+        getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
        getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
        getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
        getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
    
    @Override
    public int hashCode() {
-        return (int)( start << 16 + stop << 4 + contigIndex );
+        return start << 16 | stop << 4 | contigIndex;
    }


--- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
@ -190,11 +190,21 @@ public class SampleUtils {

    }

-    public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
+    /**
+     * Returns a new set of samples, containing a final list of samples expanded from sampleArgs
+     *
+     * Each element E of sampleArgs can either be a literal sample name or a file.  For each E,
+     * we try to read a file named E from disk, and if possible all lines from that file are expanded
+     * into unique sample names.
+     *
+     * @param sampleArgs
+     * @return
+     */
+    public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
        if (sampleArgs != null) {
            // Let's first go through the list and see if we were given any files.  We'll add every entry in the file to our
            // sample list set, and treat the entries as if they had been specified on the command line.
-            List<String> samplesFromFiles = new ArrayList<String>();
+            Set<String> samplesFromFiles = new HashSet<String>();
            for (String SAMPLE_EXPRESSION : sampleArgs) {
                File sampleFile = new File(SAMPLE_EXPRESSION);

@ -203,7 +213,7 @@ public class SampleUtils {

                    List<String> lines = reader.readLines();
                    for (String line : lines) {
-                        samplesFromFiles.add(line);
+                        samplesFromFiles.add(line.trim());
                    }
                } catch (FileNotFoundException e) {
                    samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
@ -212,7 +222,8 @@ public class SampleUtils {

            return samplesFromFiles;
        }
-        return new ArrayList<String>();
+
+        return new HashSet<String>();
    }

    public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@ -12,35 +12,19 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import java.util.ArrayList;

 /**
- * Allows for reading in RefSeq information
+ * TODO FOR CHRIS HARTL
 *
 * <p>
- * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
- * strandedness of transcription. 
+ * Codec Description
 * </p>
 *
 * <p>
- * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here
- * <a href="http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq">http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq</a>
+ * See also: link to file specification
 * </p>
- * <h2> Usage </h2>
- * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
- * <pre>
- * -refSeqBinding:REFSEQ /path/to/refSeq.txt
- * </pre>
- *
- * You will need to consult individual walkers for the binding name ("refSeqBinding", above)
 *
 * <h2>File format example</h2>
- * If you want to define your own file for use, the format is (tab delimited):
- * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
- * and exon frames, for example:
- * <pre>
- * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
- * </pre>
- * for more information see <a href="http://skip.ucsc.edu/cgi-bin/hgTables?hgsid=5651&hgta_doSchemaDb=mm8&hgta_doSchemaTable=refGene">here</a>
 * <p>
- *     
+ *     A BAM file containing <b>exactly one sample</b>.
 * </p>
 *
 * @author Mark DePristo
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@ -334,24 +334,44 @@ public class IntervalUtils {
    }

    /**
-     * Splits an interval list into multiple files.
-     * @param fileHeader The sam file header.
+     * Splits an interval list into multiple sublists.
     * @param locs The genome locs to split.
     * @param splits The stop points for the genome locs returned by splitFixedIntervals.
-     * @param scatterParts The output interval lists to write to.
+     * @return A list of lists of genome locs, split according to splits
     */
-    public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
-        if (splits.size() != scatterParts.size())
-            throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
-        int fileIndex = 0;
+    public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
        int locIndex = 1;
        int start = 0;
+        List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
        for (Integer stop: splits) {
-            IntervalList intervalList = new IntervalList(fileHeader);
+            List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
            for (int i = start; i < stop; i++)
-                intervalList.add(toInterval(locs.get(i), locIndex++));
-            intervalList.write(scatterParts.get(fileIndex++));
+                curList.add(locs.get(i));
            start = stop;
+            sublists.add(curList);
+        }
+
+        return sublists;
+    }
+
+
+    /**
+     * Splits an interval list into multiple files.
+     * @param fileHeader The sam file header.
+     * @param splits Pre-divided genome locs returned by splitFixedIntervals.
+     * @param scatterParts The output interval lists to write to.
+     */
+    public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
+        if (splits.size() != scatterParts.size())
+            throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
+
+        int fileIndex = 0;
+        int locIndex = 1;
+        for (final List<GenomeLoc> split : splits) {
+            IntervalList intervalList = new IntervalList(fileHeader);
+            for (final GenomeLoc loc : split)
+                intervalList.add(toInterval(loc, locIndex++));
+            intervalList.write(scatterParts.get(fileIndex++));
        }
    }

@ -361,17 +381,15 @@ public class IntervalUtils {
     * @param numParts Number of parts to split the locs into.
     * @return The stop points to split the genome locs.
     */
-    public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
+    public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
        if (locs.size() < numParts)
            throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
-        long locsSize = 0;
-        for (GenomeLoc loc: locs)
-            locsSize += loc.size();
-        List<Integer> splitPoints = new ArrayList<Integer>();
+        final long locsSize = intervalSize(locs);
+        final List<Integer> splitPoints = new ArrayList<Integer>();
        addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
        Collections.sort(splitPoints);
        splitPoints.add(locs.size());
-        return splitPoints;
+        return splitIntervalsToSubLists(locs, splitPoints);
    }

    private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
@ -441,4 +459,11 @@ public class IntervalUtils {
            return merged;
        }
    }
+
+    public static final long intervalSize(final List<GenomeLoc> locs) {
+        long size = 0;
+        for ( final GenomeLoc loc : locs )
+            size += loc.size();
+        return size;
+    }
 }
--- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java
@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest {
    private SAMFileHeader hg19Header;
    private GenomeLocParser hg19GenomeLocParser;
    private List<GenomeLoc> hg19ReferenceLocs;
+    private List<GenomeLoc> hg19exomeIntervals;
+
+    private List<GenomeLoc> getLocs(String... intervals) {
+        return getLocs(Arrays.asList(intervals));
+    }
+
+    private List<GenomeLoc> getLocs(List<String> intervals) {
+        if (intervals.size() == 0)
+            return hg18ReferenceLocs;
+        List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
+        for (String interval: intervals)
+            locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
+        return locs;
+    }

    @BeforeClass
    public void init() {
@ -54,12 +68,69 @@ public class IntervalUtilsUnitTest extends BaseTest {
            ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
            hg19GenomeLocParser = new GenomeLocParser(seq);
            hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
+
+            hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false));
        }
        catch(FileNotFoundException ex) {
            throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
        }
    }

+    // -------------------------------------------------------------------------------------
+    //
+    // tests to ensure the quality of the interval cuts of the interval cutting functions
+    //
+    // -------------------------------------------------------------------------------------
+
+    private class IntervalSlicingTest extends TestDataProvider {
+        public int parts;
+        public double maxAllowableVariance;
+
+        private IntervalSlicingTest(final int parts, final double maxAllowableVariance) {
+            super(IntervalSlicingTest.class);
+            this.parts = parts;
+            this.maxAllowableVariance = maxAllowableVariance;
+        }
+
+        public String toString() {
+            return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance);
+        }
+    }
+
+    @DataProvider(name = "intervalslicingdata")
+    public Object[][] createTrees() {
+        new IntervalSlicingTest(1, 0);
+        new IntervalSlicingTest(2, 1);
+        new IntervalSlicingTest(5, 1);
+        new IntervalSlicingTest(10, 1);
+        new IntervalSlicingTest(67, 1);
+        new IntervalSlicingTest(100, 1);
+        new IntervalSlicingTest(500, 1);
+        new IntervalSlicingTest(1000, 1);
+        return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
+    }
+
+    @Test(enabled = true, dataProvider = "intervalslicingdata")
+    public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
+
+        long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
+        long idealSplitSize = totalSize / test.parts;
+
+        long sumOfSplitSizes = 0;
+        int counter = 0;
+        for ( final List<GenomeLoc> split : splits ) {
+            long splitSize = IntervalUtils.intervalSize(split);
+            double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize);
+            //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma));
+            counter++;
+            sumOfSplitSizes += splitSize;
+            Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
+        }
+
+        Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
+    }
+
    @Test(expectedExceptions=UserException.class)
    public void testMergeListsBySetOperatorNoOverlap() {
        // a couple of lists we'll use for the testing
@ -129,19 +200,6 @@ public class IntervalUtilsUnitTest extends BaseTest {
        Assert.assertEquals((long)lengths.get("chrX"), 154913754);
    }

-    private List<GenomeLoc> getLocs(String... intervals) {
-        return getLocs(Arrays.asList(intervals));
-    }
-
-    private List<GenomeLoc> getLocs(List<String> intervals) {
-        if (intervals.size() == 0)
-            return hg18ReferenceLocs;
-        List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
-        for (String interval: intervals)
-            locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
-        return locs;
-    }
-
    @Test
    public void testParseIntervalArguments() {
        Assert.assertEquals(getLocs().size(), 45);
@ -174,8 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
        List<File> files = testFiles("basic.", 3, ".intervals");

        List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
        List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -200,8 +258,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
        List<File> files = testFiles("less.", 3, ".intervals");

        List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
        List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -228,8 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
    public void testScatterFixedIntervalsMoreFiles() {
        List<File> files = testFiles("more.", 3, ".intervals");
        List<GenomeLoc> locs = getLocs("chr1", "chr2");
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
    }
    @Test
    public void testScatterFixedIntervalsStart() {
@ -242,8 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
        List<File> files = testFiles("split.", 3, ".intervals");

        List<GenomeLoc> locs = getLocs(intervals);
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
        List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -270,8 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
        List<File> files = testFiles("split.", 3, ".intervals");

        List<GenomeLoc> locs = getLocs(intervals);
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
        List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -298,8 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
        List<File> files = testFiles("split.", 3, ".intervals");

        List<GenomeLoc> locs = getLocs(intervals);
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
        List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -319,7 +377,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
    public void testScatterFixedIntervalsFile() {
        List<File> files = testFiles("sg.", 20, ".intervals");
        List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());

        int[] counts = {
                125, 138, 287, 291, 312, 105, 155, 324,
@ -332,16 +390,13 @@ public class IntervalUtilsUnitTest extends BaseTest {
        };

        //String splitCounts = "";
-        for (int lastIndex = 0, i = 0; i < splits.size(); i++) {
-            int splitIndex = splits.get(i);
-            int splitCount = (splitIndex - lastIndex);
-            //splitCounts += ", " + splitCount;
-            lastIndex = splitIndex;
+        for (int i = 0; i < splits.size(); i++) {
+            int splitCount = splits.get(i).size();
            Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
        }
        //System.out.println(splitCounts.substring(2));

-        IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
+        IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);

        int locIndex = 0;
        for (int i = 0; i < files.size(); i++) {
@ -357,8 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
    @Test
    public void testScatterFixedIntervalsMax() {
        List<File> files = testFiles("sg.", 85, ".intervals");
-        List<Integer> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
-        IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files);
+        List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
+        IntervalUtils.scatterFixedIntervals(hg19Header, splits, files);

        for (int i = 0; i < files.size(); i++) {
            String file = files.get(i).toString();
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala
@ -1,65 +1,65 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.queue.extensions.gatk
-
-import java.io.File
-import collection.JavaConversions._
-import org.broadinstitute.sting.utils.interval.IntervalUtils
-import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
-import net.sf.samtools.SAMFileHeader
-import java.util.Collections
-import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
-
-case class GATKIntervals(reference: File, intervals: List[String]) {
-  private lazy val referenceDataSource = new ReferenceDataSource(reference)
-  private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
-
-  lazy val samFileHeader = {
-    val header = new SAMFileHeader
-    header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
-    header
-  }
-
-  lazy val locs: java.util.List[GenomeLoc] = {
-    val parser = new GenomeLocParser(referenceDataSource.getReference)
-    val parsedLocs =
-      if (intervals.isEmpty)
-        GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
-      else
-        IntervalUtils.parseIntervalArguments(parser, intervals, false)
-    Collections.sort(parsedLocs)
-    Collections.unmodifiableList(parsedLocs)
-  }
-
-  lazy val contigs = locs.map(_.getContig).distinct.toList
-
-  def getSplits(size: Int) = {
-    splitsBySize.getOrElse(size, {
-      val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
-      splitsBySize += size -> splits
-      splits
-    })
-  }
-}
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.queue.extensions.gatk
+
+import java.io.File
+import collection.JavaConversions._
+import org.broadinstitute.sting.utils.interval.IntervalUtils
+import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
+import net.sf.samtools.SAMFileHeader
+import java.util.Collections
+import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
+
+case class GATKIntervals(reference: File, intervals: List[String]) {
+  private lazy val referenceDataSource = new ReferenceDataSource(reference)
+//  private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
+
+  lazy val samFileHeader = {
+    val header = new SAMFileHeader
+    header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
+    header
+  }
+
+  lazy val locs: java.util.List[GenomeLoc] = {
+    val parser = new GenomeLocParser(referenceDataSource.getReference)
+    val parsedLocs =
+      if (intervals.isEmpty)
+        GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
+      else
+        IntervalUtils.parseIntervalArguments(parser, intervals, false)
+    Collections.sort(parsedLocs)
+    Collections.unmodifiableList(parsedLocs)
+  }
+
+  lazy val contigs = locs.map(_.getContig).distinct.toList
+
+//  def getSplits(size: Int) = {
+//    splitsBySize.getOrElse(size, {
+//      val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
+//      splitsBySize += size -> splits
+//      splits
+//    })
+//  }
+}
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/IntervalScatterFunction.scala
@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction

  def run() {
    val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
-    IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs,
-      gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles)
+    val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)
+    IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
  }
 }
--- a/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala
+++ b/public/scala/test/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervalsUnitTest.scala
@ -53,8 +53,8 @@ class GATKIntervalsUnitTest {
    val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
    Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
    Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
-    Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
-    Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
+//    Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
+//    Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
  }

  @Test(timeOut = 30000)
@ -65,7 +65,7 @@ class GATKIntervalsUnitTest {
    //   for(Item item: javaConvertedScalaList)
    // This for loop is actually an O(N^2) operation as the iterator calls the
    // O(N) javaConvertedScalaList.size() for each iteration of the loop.
-    Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
+    //Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
    Assert.assertEquals(gi.contigs.size, 24)
  }

@ -74,8 +74,8 @@ class GATKIntervalsUnitTest {
    val gi = new GATKIntervals(hg18Reference, Nil)
    Assert.assertEquals(gi.locs, hg18ReferenceLocs)
    Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
-    Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
-    Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
+//    Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
+//    Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
  }

  @Test