Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-07-18 17:59:26 -04:00 · 2011-07-18 17:59:26 -04:00 · 1837da37f6
parent 916c0c9489 4e78f0b064
commit 1837da37f6
10 changed files with 124 additions and 58 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java
@ -100,7 +100,11 @@ public class GATKReport {
     * @param tableDescription  the description of the table
     */
    public void addTable(String tableName, String tableDescription) {
-        GATKReportTable table = new GATKReportTable(tableName, tableDescription);
+        addTable(tableName, tableDescription, true);
+    }
+
+    public void addTable(String tableName, String tableDescription, boolean sortByPrimaryKey) {
+        GATKReportTable table = new GATKReportTable(tableName, tableDescription, sortByPrimaryKey);
        tables.put(tableName, table);
    }

--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java
@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.report;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

 import java.io.PrintStream;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.TreeSet;
+import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -96,8 +94,9 @@ public class GATKReportTable {
    private String tableDescription;

    private String primaryKeyName;
-    private TreeSet<Object> primaryKeyColumn;
+    private Collection<Object> primaryKeyColumn;
    private boolean primaryKeyDisplay;
+    boolean sortByPrimaryKey = true;

    private LinkedHashMap<String, GATKReportColumn> columns;

@ -121,12 +120,17 @@ public class GATKReportTable {
     * @param tableDescription  the description of the table
     */
    public GATKReportTable(String tableName, String tableDescription) {
-        if (!isValidName(tableName)) {
+        this(tableName, tableDescription, true);
+    }
+
+    public GATKReportTable(String tableName, String tableDescription, boolean sortByPrimaryKey) {
+         if (!isValidName(tableName)) {
            throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'.  GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed.");
        }

        this.tableName = tableName;
        this.tableDescription = tableDescription;
+        this.sortByPrimaryKey = sortByPrimaryKey;

        columns = new LinkedHashMap<String, GATKReportColumn>();
    }
@ -137,20 +141,14 @@ public class GATKReportTable {
     * @param primaryKeyName  the name of the primary key column
     */
    public void addPrimaryKey(String primaryKeyName) {
-        if (!isValidName(primaryKeyName)) {
-            throw new ReviewedStingException("Attempted to set a GATKReportTable primary key name of '" + primaryKeyName + "'.  GATKReportTable primary key names must be purely alphanumeric - no spaces or special characters are allowed.");
-        }
-
-        this.primaryKeyName = primaryKeyName;
-
-        primaryKeyColumn = new TreeSet<Object>();
-        primaryKeyDisplay = true;
+        addPrimaryKey(primaryKeyName, true);
    }

    /**
     * Add an optionally visible primary key column.  This becomes the unique identifier for every column in the table, and will always be printed as the first column.
     *
     * @param primaryKeyName  the name of the primary key column
+     * @param display should this primary key be displayed?
     */
    public void addPrimaryKey(String primaryKeyName, boolean display) {
        if (!isValidName(primaryKeyName)) {
@ -159,7 +157,7 @@ public class GATKReportTable {

        this.primaryKeyName = primaryKeyName;

-        primaryKeyColumn = new TreeSet<Object>();
+        primaryKeyColumn = sortByPrimaryKey ? new TreeSet<Object>() : new LinkedList<Object>();
        primaryKeyDisplay = display;
    }

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java
@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMRecordIterator;
 import net.sf.samtools.util.BlockCompressedInputStream;

-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
+import java.io.*;
 import java.util.Arrays;


@ -102,8 +100,10 @@ public class BAMDiffableReader implements DiffableReader {
        final byte[] BAM_MAGIC = "BAM\1".getBytes();
        final byte[] buffer = new byte[BAM_MAGIC.length];
        try {
-            FileInputStream fstream = new FileInputStream(file);
-            new BlockCompressedInputStream(fstream).read(buffer,0,BAM_MAGIC.length);
+            InputStream fstream = new BufferedInputStream(new FileInputStream(file));
+            if ( !BlockCompressedInputStream.isValidFile(fstream) )
+                return false;
+            new BlockCompressedInputStream(fstream).read(buffer, 0, BAM_MAGIC.length);
            return Arrays.equals(buffer, BAM_MAGIC);
        } catch ( IOException e ) {
            return false;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java
@ -143,7 +143,7 @@ public class DiffEngine {
     * Note that only pairs of the same length are considered as potentially equivalent
     *
     * @param params determines how we display the items
-     * @param diffs
+     * @param diffs the list of differences to summarize
     */
    public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
        printSummaryReport(summarizeDifferences(diffs), params );
@ -207,14 +207,7 @@ public class DiffEngine {
    }

    protected void printSummaryReport(List<Difference> sortedSummaries, SummaryReportParams params ) {
-        GATKReport report = new GATKReport();
-        final String tableName = "diffences";
-        report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information");
-        GATKReportTable table = report.getTable(tableName);
-        table.addPrimaryKey("Difference", true);
-        table.addColumn("NumberOfOccurrences", 0);
-        table.addColumn("SpecificDifference", 0);
-
+        List<Difference> toShow = new ArrayList<Difference>();
        int count = 0, count1 = 0;
        for ( Difference diff : sortedSummaries ) {
            if ( diff.getCount() < params.minSumDiffToShow )
@ -230,10 +223,26 @@ public class DiffEngine {
                    break;
            }

-            table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
-            table.set(diff.getPath(), "SpecificDifference", diff.valueDiffString());
+            toShow.add(diff);
        }

+        // if descending order was not requested, reverse the list
+        if ( ! params.descending ) {
+            Collections.reverse(toShow);
+        }
+
+        // now that we have a specific list of values we want to show, display them
+        GATKReport report = new GATKReport();
+        final String tableName = "diffences";
+        report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false);
+        GATKReportTable table = report.getTable(tableName);
+        table.addPrimaryKey("Difference", true);
+        table.addColumn("NumberOfOccurrences", 0);
+        table.addColumn("ExampleDifference", 0);
+        for ( Difference diff : toShow ) {
+            table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount());
+            table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString());
+        }
        table.write(params.out);
    }

@ -252,7 +261,7 @@ public class DiffEngine {
     * commonPostfixLength: how many parts are shared at the end, suppose its 2
     * We want to create a string *.*.C.D
     *
-     * @param parts
+     * @param parts the separated path values [above without .]
     * @param commonPostfixLength
     * @return
     */
@ -351,6 +360,7 @@ public class DiffEngine {
        int maxItemsToDisplay = 0;
        int maxCountOneItems = 0;
        int minSumDiffToShow = 0;
+        boolean descending = true;

        public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) {
            this.out = out;
@ -358,5 +368,9 @@ public class DiffEngine {
            this.maxCountOneItems = maxCountOneItems;
            this.minSumDiffToShow = minSumDiffToShow;
        }
+
+        public void setDescending(boolean descending) {
+            this.descending = descending;
+        }
    }
 }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java
@ -112,6 +112,7 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
        }

        DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff);
+        params.setDescending(false);
        diffEngine.reportSummarizedDifferences(diffs, params);
    }
 }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java
@ -72,13 +72,19 @@ public class VCFDiffableReader implements DiffableReader {
            }

            String line = lineReader.readLine();
-            int count = 0;
+            int count = 0, nRecordsAtPos = 1;
+            String prevName = "";
            while ( line != null ) {
                if ( count++ > maxElementsToRead && maxElementsToRead != -1)
                    break;

                VariantContext vc = (VariantContext)vcfCodec.decode(line);
                String name = vc.getChr() + ":" + vc.getStart();
+                if ( name.equals(prevName) ) {
+                    name += "_" + ++nRecordsAtPos;
+                } else {
+                    prevName = name;
+                }
                DiffNode vcRoot = DiffNode.empty(name, root);

                // add fields
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@ -25,6 +25,7 @@

 package org.broadinstitute.sting.gatk.walkers.variantutils;

+import org.apache.poi.hpsf.Variant;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Hidden;
 import org.broadinstitute.sting.commandline.Output;
@ -149,7 +150,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {

        // get all of the vcf rods at this locus
        // Need to provide reference bases to simpleMerge starting at current locus
-        Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false);
+        Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false);

        if ( sitesOnlyVCF ) {
            vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
@ -172,17 +173,25 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
        if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
            return 0;
        
-        VariantContext mergedVC;
+        List<VariantContext> mergedVCs = new ArrayList<VariantContext>();
        if ( master ) {
-             mergedVC = VariantContextUtils.masterMerge(vcs, "master");
+            mergedVCs.add(VariantContextUtils.masterMerge(vcs, "master"));
        } else {
-            mergedVC = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(),vcs, priority, filteredRecordsMergeType,
-                    genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC);
+            Map<VariantContext.Type, List<VariantContext>> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs);
+            // iterate over the types so that it's deterministic
+            for ( VariantContext.Type type : VariantContext.Type.values() ) {
+                if ( VCsByType.containsKey(type) )
+                    mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type),
+                            priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
+                            ref.getBase(), SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
+            }
        }

-        //out.printf("   merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC);
+        for ( VariantContext mergedVC : mergedVCs ) {
+            // only operate at the start of events
+            if ( mergedVC == null )
+                continue;

-        if ( mergedVC != null ) { // only operate at the start of events
            HashMap<String, Object> attributes = new HashMap<String, Object>(mergedVC.getAttributes());
            // re-compute chromosome counts
            VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false);
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@ -289,8 +289,8 @@ public class VariantContextUtils {

    /**
     * Returns a newly allocated VC that is the same as VC, but without genotypes
-     * @param vc
-     * @return
+     * @param vc  variant context
+     * @return  new VC without genotypes
     */
    @Requires("vc != null")
    @Ensures("result != null")
@ -303,8 +303,8 @@ public class VariantContextUtils {

    /**
     * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
-     * @param vcs
-     * @return
+     * @param vcs  collection of VCs
+     * @return new VCs without genotypes
     */
    @Requires("vcs != null")
    @Ensures("result != null")
@ -362,9 +362,9 @@ public class VariantContextUtils {
     * information per genotype.  The master merge will add the PQ information from each genotype record, where
     * appropriate, to the master VC.
     *
-     * @param unsortedVCs
-     * @param masterName
-     * @return
+     * @param unsortedVCs   collection of VCs
+     * @param masterName    name of master VC
+     * @return  master-merged VC
     */
    public static VariantContext masterMerge(Collection<VariantContext> unsortedVCs, String masterName) {
        VariantContext master = findMaster(unsortedVCs, masterName);
@ -435,11 +435,15 @@ public class VariantContextUtils {
     * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
     * the sample name
     *
-     * @param unsortedVCs
-     * @param priorityListOfVCs
-     * @param filteredRecordMergeType
-     * @param genotypeMergeOptions
-     * @return
+     * @param genomeLocParser           loc parser
+     * @param unsortedVCs               collection of unsorted VCs
+     * @param priorityListOfVCs         priority list detailing the order in which we should grab the VCs
+     * @param filteredRecordMergeType   merge type for filtered records
+     * @param genotypeMergeOptions      merge option for genotypes
+     * @param annotateOrigin            should we annotate the set it came from?
+     * @param printMessages             should we print messages?
+     * @param inputRefBase              the ref base
+     * @return new VariantContext
     */
    public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
                                             FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions,
@ -448,6 +452,24 @@ public class VariantContextUtils {
        return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false, false);
    }

+    /**
+     * Merges VariantContexts into a single hybrid.  Takes genotypes for common samples in priority order, if provided.
+     * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
+     * the sample name
+     *
+     * @param genomeLocParser           loc parser
+     * @param unsortedVCs               collection of unsorted VCs
+     * @param priorityListOfVCs         priority list detailing the order in which we should grab the VCs
+     * @param filteredRecordMergeType   merge type for filtered records
+     * @param genotypeMergeOptions      merge option for genotypes
+     * @param annotateOrigin            should we annotate the set it came from?
+     * @param printMessages             should we print messages?
+     * @param inputRefBase              the ref base
+     * @param setKey                    the key name of the set
+     * @param filteredAreUncalled       are filtered records uncalled?
+     * @param mergeInfoWithMaxAC        should we merge in info from the VC with maximum allele count?
+     * @return new VariantContext
+     */
    public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs,
                                             FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions,
                                             boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey,
@ -470,7 +492,7 @@ public class VariantContextUtils {
            if ( ! filteredAreUncalled || vc.isNotFiltered() )
                VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc,inputRefBase,false));
        }
-        if ( VCs.size() == 0 ) // everything is filtered out and we're filteredareUncalled
+        if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
            return null;

        // establish the baseline info from the first VC
@ -615,6 +637,17 @@ public class VariantContextUtils {
        return merged;
    }

+    public static Map<VariantContext.Type, List<VariantContext>> separateVariantContextsByType(Collection<VariantContext> VCs) {
+        HashMap<VariantContext.Type, List<VariantContext>> mappedVCs = new HashMap<VariantContext.Type, List<VariantContext>>();
+        for ( VariantContext vc : VCs ) {
+            if ( !mappedVCs.containsKey(vc.getType()) )
+                mappedVCs.put(vc.getType(), new ArrayList<VariantContext>());
+            mappedVCs.get(vc.getType()).add(vc);
+        }
+
+        return mappedVCs;
+    }
+
    private static class AlleleMapper {
        private VariantContext vc = null;
        private Map<Allele, Allele> map = null;
@ -834,6 +867,7 @@ public class VariantContextUtils {

    /**
     * create a genome location, given a variant context
+     * @param genomeLocParser parser
     * @param vc the variant context
     * @return the genomeLoc
     */
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java
@ -52,8 +52,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest {

    @DataProvider(name = "data")
    public Object[][] createData() {
-        new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "fb7f4e011487ca56bce865ae5468cdc5");
-        new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "423cec3befbf0a72d8bc3757ee628fc4");
+        new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "4d9f4636de05b93c354d05011264546e");
+        new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "37e6efd833b5cd6d860a9df3df9713fc");
        return TestParams.getTests(TestParams.class);
    }

--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java
@ -80,9 +80,9 @@ public class CombineVariantsIntegrationTest extends WalkerTest {

    @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format
    @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format
-    @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "5b82f37df1f5ba40f0474d71c94142ec", false); }
+    @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "f1cf095c2fe9641b7ca1f8ee2c46fd4a", false); }

-    @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "c58dca482bf97069eac6d9f1a07a2cba", false); }
+    @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083", false); }

    @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c", true); }

@ -100,7 +100,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
                        " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
                        " -genotypeMergeOptions UNIQUIFY -L 1"),
                1,
-                Arrays.asList("8b78339ccf7a5a5a837f79e88a3a38e5"));
+                Arrays.asList("1de95f91ca15d2a8856de35dee0ce33e"));
        executeTest("threeWayWithRefs", spec);
    }