diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
new file mode 100644
index 000000000..85f587aaf
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
+
+import net.sf.samtools.SAMReadGroupRecord;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.report.GATKReport;
+import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.Median;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
+ * median read length for each read group in every BAM file.
+ *
+ * Note that this walker stops when all read groups have been observed at least a few thousand times so that
+ * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate
+ * timeframe.
+ *
+ *
Input
+ *
+ * Any number of BAM files
+ *
+ *
+ * Output
+ *
+ * GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
+ *
+ * For example, running this tool on the NA12878 data sets:
+ *
+ *
+ * ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
+ * readgroup sample library platform center median.read.length median.insert.size
+ * 20FUK.1 NA12878 Solexa-18483 illumina BI 101 387
+ * 20FUK.2 NA12878 Solexa-18484 illumina BI 101 415
+ * 20FUK.3 NA12878 Solexa-18483 illumina BI 101 388
+ * 20FUK.4 NA12878 Solexa-18484 illumina BI 101 415
+ * 20FUK.5 NA12878 Solexa-18483 illumina BI 101 387
+ * 20FUK.6 NA12878 Solexa-18484 illumina BI 101 415
+ * 20FUK.7 NA12878 Solexa-18483 illumina BI 101 388
+ * 20FUK.8 NA12878 Solexa-18484 illumina BI 101 415
+ * 20GAV.1 NA12878 Solexa-18483 illumina BI 101 388
+ * 20GAV.2 NA12878 Solexa-18484 illumina BI 101 415
+ * 20GAV.3 NA12878 Solexa-18483 illumina BI 101 388
+ * 20GAV.4 NA12878 Solexa-18484 illumina BI 101 416
+ * 20GAV.5 NA12878 Solexa-18483 illumina BI 101 388
+ * 20GAV.6 NA12878 Solexa-18484 illumina BI 101 415
+ * 20GAV.7 NA12878 Solexa-18483 illumina BI 101 387
+ * 20GAV.8 NA12878 Solexa-18484 illumina BI 101 414
+ *
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T ReadGroupProperties
+ * -I example1.bam -I example2.bam etc
+ * -R reference.fasta
+ * -o example.gatkreport.txt
+ *
+ *
+ * @author Mark DePristo
+ */
+
+
+
+public class ReadGroupProperties extends ReadWalker {
+ @Output
+ public PrintStream out;
+
+ @Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
+ public int MAX_VALUES_FOR_MEDIAN = 10000;
+
+ private final static String TABLE_NAME = "ReadGroupProperties";
+ private final Map> readLengths = new HashMap>();
+ private final Map> insertSizes = new HashMap>();
+
+ @Override
+ public void initialize() {
+ for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+ readLengths.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN));
+ insertSizes.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN));
+ }
+ }
+
+ @Override
+ public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
+ return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
+ }
+
+ @Override
+ public boolean isDone() {
+ // TODO -- this is far too slow!
+ return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes));
+ }
+
+ private final boolean anyMedianNeedsData(Map> medianMap) {
+ for ( Median median : medianMap.values() ) {
+ if ( ! median.isFull() )
+ return true;
+ }
+
+ return false;
+ }
+
+ private final void updateMedian(final Median median, final int value) {
+ median.add(value);
+ }
+
+ @Override
+ public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
+ final String rg = read.getReadGroup().getId();
+
+ updateMedian(readLengths.get(rg), read.getReadLength());
+ if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) {
+ //logger.info(rg + " => " + Math.abs(read.getInferredInsertSize()));
+ updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize()));
+ }
+
+ return null;
+ }
+
+ @Override
+ public Integer reduceInit() {
+ return null;
+ }
+
+ @Override
+ public Integer reduce(Integer integer, Integer integer1) {
+ return null;
+ }
+
+ @Override
+ public void onTraversalDone(Integer sum) {
+ final GATKReport report = new GATKReport();
+ report.addTable(TABLE_NAME, "Table of read group properties");
+ GATKReportTable table = report.getTable(TABLE_NAME);
+
+ table.addPrimaryKey("readgroup");
+ //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
+ //* median read length for each read group in every BAM file.
+ table.addColumn("sample", "NA");
+ table.addColumn("library", "NA");
+ table.addColumn("platform", "NA");
+ table.addColumn("center", "NA");
+ table.addColumn("median.read.length", Integer.valueOf(0));
+ table.addColumn("median.insert.size", Integer.valueOf(0));
+
+ for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+ final String rgID = rg.getId();
+ table.set(rgID, "sample", rg.getSample());
+ table.set(rgID, "library", rg.getLibrary());
+ table.set(rgID, "platform", rg.getPlatform());
+ table.set(rgID, "center", rg.getSequencingCenter());
+ table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0));
+ table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0));
+ }
+
+ report.print(out);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/java/src/org/broadinstitute/sting/utils/Median.java
new file mode 100644
index 000000000..7ebe8d2d7
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/Median.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils;
+
+import java.util.*;
+
+/**
+ * Utility class for calculating median from a data set, potentially limiting the size of data to a
+ * fixed amount
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class Median {
+ final List values;
+ final int maxValuesToKeep;
+ boolean sorted = false;
+
+ public Median() {
+ this(Integer.MAX_VALUE);
+ }
+
+ public Median(final int maxValuesToKeep) {
+ this.maxValuesToKeep = maxValuesToKeep;
+ this.values = new ArrayList();
+ }
+
+ public boolean isFull() {
+ return values.size() >= maxValuesToKeep;
+ }
+
+ public int size() {
+ return values.size();
+ }
+
+ public boolean isEmpty() {
+ return values.isEmpty();
+ }
+
+ public T getMedian() {
+ if ( isEmpty() )
+ throw new IllegalStateException("Cannot get median value from empty array");
+ return getMedian(null); // note that value null will never be used
+ }
+
+ /**
+ * Returns the floor(n + 1 / 2) item from the list of values if the list
+ * has values, or defaultValue is the list is empty.
+ */
+ public T getMedian(final T defaultValue) {
+ if ( isEmpty() )
+ return defaultValue;
+
+ if ( ! sorted ) {
+ sorted = true;
+ Collections.sort(values);
+ }
+
+ final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1;
+ return values.get(offset);
+ }
+
+ public boolean add(final T value) {
+ if ( ! isFull() ) {
+ sorted = false;
+ return values.add(value);
+ }
+ else
+ return false;
+ }
+}
diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java
index 626b91cbf..ac3a970f9 100755
--- a/public/java/test/org/broadinstitute/sting/BaseTest.java
+++ b/public/java/test/org/broadinstitute/sting/BaseTest.java
@@ -141,7 +141,7 @@ public abstract class BaseTest {
*/
public static class TestDataProvider {
private static final Map> tests = new HashMap>();
- private String name;
+ protected String name;
/**
* Create a new TestDataProvider instance bound to the class variable C
diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java
new file mode 100644
index 000000000..c12db9b77
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// our package
+package org.broadinstitute.sting;
+
+
+// the imports for unit testing.
+
+
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
+import org.broadinstitute.sting.utils.Median;
+import org.testng.Assert;
+import org.testng.annotations.BeforeSuite;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+
+public class MedianUnitTest extends BaseTest {
+ LikelihoodCalculationEngine engine;
+
+ @BeforeSuite
+ public void before() {
+ engine = new LikelihoodCalculationEngine(0, 0, false);
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // Provider
+ //
+ // --------------------------------------------------------------------------------
+
+ private class MedianTestProvider extends TestDataProvider {
+ final List values = new ArrayList();
+ final int cap;
+ final Integer expected;
+
+ public MedianTestProvider(int expected, int cap, Integer ... values) {
+ super(MedianTestProvider.class);
+ this.expected = expected;
+ this.cap = cap;
+ this.values.addAll(Arrays.asList(values));
+ this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap);
+ }
+ }
+
+ @DataProvider(name = "MedianTestProvider")
+ public Object[][] makeMedianTestProvider() {
+ new MedianTestProvider(1, 1000, 0, 1, 2);
+ new MedianTestProvider(1, 1000, 1, 0, 1, 2);
+ new MedianTestProvider(1, 1000, 0, 1, 2, 3);
+ new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4);
+ new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0);
+ new MedianTestProvider(1, 1000, 1);
+ new MedianTestProvider(2, 1000, 2);
+ new MedianTestProvider(1, 1000, 1, 2);
+
+ new MedianTestProvider(1, 3, 1);
+ new MedianTestProvider(1, 3, 1, 2);
+ new MedianTestProvider(2, 3, 1, 2, 3);
+ new MedianTestProvider(2, 3, 1, 2, 3, 4);
+ new MedianTestProvider(2, 3, 1, 2, 3, 4, 5);
+
+ new MedianTestProvider(1, 3, 1);
+ new MedianTestProvider(1, 3, 1, 2);
+ new MedianTestProvider(2, 3, 3, 2, 1);
+ new MedianTestProvider(3, 3, 4, 3, 2, 1);
+ new MedianTestProvider(4, 3, 5, 4, 3, 2, 1);
+
+ return MedianTestProvider.getTests(MedianTestProvider.class);
+ }
+
+ @Test(dataProvider = "MedianTestProvider")
+ public void testBasicLikelihoods(MedianTestProvider cfg) {
+ final Median median = new Median(cfg.cap);
+
+ int nAdded = 0;
+ for ( final int value : cfg.values )
+ if ( median.add(value) )
+ nAdded++;
+
+ Assert.assertEquals(nAdded, median.size());
+
+ Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty());
+ Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull());
+ Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString());
+ }
+
+ @Test(expectedExceptions = IllegalStateException.class)
+ public void testEmptyMedian() {
+ final Median median = new Median();
+ Assert.assertTrue(median.isEmpty());
+ final Integer d = 100;
+ Assert.assertEquals(median.getMedian(d), d);
+ median.getMedian();
+ }
+
+}
\ No newline at end of file
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java
new file mode 100644
index 000000000..84f7fa363
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+
+/**
+ * Tests ReadGroupProperties
+ */
+public class ReadGroupPropertiesIntegrationTest extends WalkerTest {
+ @Test
+ public void basicTest() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s",
+ 1,
+ Arrays.asList("1795e3157ab23e7e597acec334e29525"));
+ executeTest("ReadGroupProperties:", spec);
+ }
+}
\ No newline at end of file