From aff508e09137318579887ca757c638dc78211b17 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 1 Mar 2012 15:01:11 -0500 Subject: [PATCH] ReadGroupProperties walker and associated infrastructure -- ReadGroupProperties: Emits a GATKReport containing read group, sample, library, platform, center, median insert size and median read length for each read group in every BAM file. -- Median tool that collects up to a given maximum number of elements and returns the median of the elements. -- Unit and integration tests for everything. -- Making name of TestProvider protected so subclasses and override name more easily --- .../diagnostics/ReadGroupProperties.java | 192 ++++++++++++++++++ .../broadinstitute/sting/utils/Median.java | 93 +++++++++ .../org/broadinstitute/sting/BaseTest.java | 2 +- .../broadinstitute/sting/MedianUnitTest.java | 123 +++++++++++ .../ReadGroupPropertiesIntegrationTest.java | 44 ++++ 5 files changed, 453 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/Median.java create mode 100644 public/java/test/org/broadinstitute/sting/MedianUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java new file mode 100644 index 000000000..85f587aaf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.Median; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.HashMap; +import java.util.Map; + +/** + * Emits a GATKReport containing read group, sample, library, platform, center, median insert size and + * median read length for each read group in every BAM file. + * + * Note that this walker stops when all read groups have been observed at least a few thousand times so that + * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate + * timeframe. + * + *

Input

+ *

+ * Any number of BAM files + *

+ * + *

Output

+ *

+ * GATKReport containing read group, sample, library, platform, center, median insert size and median read length. + * + * For example, running this tool on the NA12878 data sets: + * + *

+ *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
+ *      readgroup  sample   library       platform  center  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI                     101                 416
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI                     101                 414
+ *      
+ *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadGroupProperties
+ *      -I example1.bam -I example2.bam etc
+ *      -R reference.fasta
+ *      -o example.gatkreport.txt
+ *  
+ * + * @author Mark DePristo + */ + + + +public class ReadGroupProperties extends ReadWalker { + @Output + public PrintStream out; + + @Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false) + public int MAX_VALUES_FOR_MEDIAN = 10000; + + private final static String TABLE_NAME = "ReadGroupProperties"; + private final Map> readLengths = new HashMap>(); + private final Map> insertSizes = new HashMap>(); + + @Override + public void initialize() { + for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + readLengths.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); + insertSizes.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); + } + } + + @Override + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag()); + } + + @Override + public boolean isDone() { + // TODO -- this is far too slow! + return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes)); + } + + private final boolean anyMedianNeedsData(Map> medianMap) { + for ( Median median : medianMap.values() ) { + if ( ! median.isFull() ) + return true; + } + + return false; + } + + private final void updateMedian(final Median median, final int value) { + median.add(value); + } + + @Override + public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { + final String rg = read.getReadGroup().getId(); + + updateMedian(readLengths.get(rg), read.getReadLength()); + if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) { + //logger.info(rg + " => " + Math.abs(read.getInferredInsertSize())); + updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize())); + } + + return null; + } + + @Override + public Integer reduceInit() { + return null; + } + + @Override + public Integer reduce(Integer integer, Integer integer1) { + return null; + } + + @Override + public void onTraversalDone(Integer sum) { + final GATKReport report = new GATKReport(); + report.addTable(TABLE_NAME, "Table of read group properties"); + GATKReportTable table = report.getTable(TABLE_NAME); + + table.addPrimaryKey("readgroup"); + //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and + //* median read length for each read group in every BAM file. + table.addColumn("sample", "NA"); + table.addColumn("library", "NA"); + table.addColumn("platform", "NA"); + table.addColumn("center", "NA"); + table.addColumn("median.read.length", Integer.valueOf(0)); + table.addColumn("median.insert.size", Integer.valueOf(0)); + + for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + final String rgID = rg.getId(); + table.set(rgID, "sample", rg.getSample()); + table.set(rgID, "library", rg.getLibrary()); + table.set(rgID, "platform", rg.getPlatform()); + table.set(rgID, "center", rg.getSequencingCenter()); + table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0)); + table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0)); + } + + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/java/src/org/broadinstitute/sting/utils/Median.java new file mode 100644 index 000000000..7ebe8d2d7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/Median.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import java.util.*; + +/** + * Utility class for calculating median from a data set, potentially limiting the size of data to a + * fixed amount + * + * @author Your Name + * @since Date created + */ +public class Median { + final List values; + final int maxValuesToKeep; + boolean sorted = false; + + public Median() { + this(Integer.MAX_VALUE); + } + + public Median(final int maxValuesToKeep) { + this.maxValuesToKeep = maxValuesToKeep; + this.values = new ArrayList(); + } + + public boolean isFull() { + return values.size() >= maxValuesToKeep; + } + + public int size() { + return values.size(); + } + + public boolean isEmpty() { + return values.isEmpty(); + } + + public T getMedian() { + if ( isEmpty() ) + throw new IllegalStateException("Cannot get median value from empty array"); + return getMedian(null); // note that value null will never be used + } + + /** + * Returns the floor(n + 1 / 2) item from the list of values if the list + * has values, or defaultValue is the list is empty. + */ + public T getMedian(final T defaultValue) { + if ( isEmpty() ) + return defaultValue; + + if ( ! sorted ) { + sorted = true; + Collections.sort(values); + } + + final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1; + return values.get(offset); + } + + public boolean add(final T value) { + if ( ! isFull() ) { + sorted = false; + return values.add(value); + } + else + return false; + } +} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 626b91cbf..ac3a970f9 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -141,7 +141,7 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map> tests = new HashMap>(); - private String name; + protected String name; /** * Create a new TestDataProvider instance bound to the class variable C diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java new file mode 100644 index 000000000..c12db9b77 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.Median; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class MedianUnitTest extends BaseTest { + LikelihoodCalculationEngine engine; + + @BeforeSuite + public void before() { + engine = new LikelihoodCalculationEngine(0, 0, false); + } + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class MedianTestProvider extends TestDataProvider { + final List values = new ArrayList(); + final int cap; + final Integer expected; + + public MedianTestProvider(int expected, int cap, Integer ... values) { + super(MedianTestProvider.class); + this.expected = expected; + this.cap = cap; + this.values.addAll(Arrays.asList(values)); + this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap); + } + } + + @DataProvider(name = "MedianTestProvider") + public Object[][] makeMedianTestProvider() { + new MedianTestProvider(1, 1000, 0, 1, 2); + new MedianTestProvider(1, 1000, 1, 0, 1, 2); + new MedianTestProvider(1, 1000, 0, 1, 2, 3); + new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4); + new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0); + new MedianTestProvider(1, 1000, 1); + new MedianTestProvider(2, 1000, 2); + new MedianTestProvider(1, 1000, 1, 2); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 1, 2, 3); + new MedianTestProvider(2, 3, 1, 2, 3, 4); + new MedianTestProvider(2, 3, 1, 2, 3, 4, 5); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 3, 2, 1); + new MedianTestProvider(3, 3, 4, 3, 2, 1); + new MedianTestProvider(4, 3, 5, 4, 3, 2, 1); + + return MedianTestProvider.getTests(MedianTestProvider.class); + } + + @Test(dataProvider = "MedianTestProvider") + public void testBasicLikelihoods(MedianTestProvider cfg) { + final Median median = new Median(cfg.cap); + + int nAdded = 0; + for ( final int value : cfg.values ) + if ( median.add(value) ) + nAdded++; + + Assert.assertEquals(nAdded, median.size()); + + Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty()); + Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull()); + Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString()); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testEmptyMedian() { + final Median median = new Median(); + Assert.assertTrue(median.isEmpty()); + final Integer d = 100; + Assert.assertEquals(median.getMedian(d), d); + median.getMedian(); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java new file mode 100644 index 000000000..84f7fa363 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests ReadGroupProperties + */ +public class ReadGroupPropertiesIntegrationTest extends WalkerTest { + @Test + public void basicTest() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", + 1, + Arrays.asList("1795e3157ab23e7e597acec334e29525")); + executeTest("ReadGroupProperties:", spec); + } +} \ No newline at end of file