ReadGroupProperties walker and associated infrastructure

-- ReadGroupProperties: Emits a GATKReport containing read group, sample, library, platform, center, median insert size and median read length for each read group in every BAM file.
-- Median tool that collects up to a given maximum number of elements and returns the median of the elements.
-- Unit and integration tests for everything.
-- Making name of TestProvider protected so subclasses and override name more easily
This commit is contained in:
Mark DePristo 2012-03-01 15:01:11 -05:00
parent 88e060e1e9
commit aff508e091
5 changed files with 453 additions and 1 deletions

View File

@ -0,0 +1,192 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.Median;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
/**
* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
* median read length for each read group in every BAM file.
*
* Note that this walker stops when all read groups have been observed at least a few thousand times so that
* the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate
* timeframe.
*
* <h2>Input</h2>
* <p>
* Any number of BAM files
* </p>
*
* <h2>Output</h2>
* <p>
* GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
*
* For example, running this tool on the NA12878 data sets:
*
* <pre>
* ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
* readgroup sample library platform center median.read.length median.insert.size
* 20FUK.1 NA12878 Solexa-18483 illumina BI 101 387
* 20FUK.2 NA12878 Solexa-18484 illumina BI 101 415
* 20FUK.3 NA12878 Solexa-18483 illumina BI 101 388
* 20FUK.4 NA12878 Solexa-18484 illumina BI 101 415
* 20FUK.5 NA12878 Solexa-18483 illumina BI 101 387
* 20FUK.6 NA12878 Solexa-18484 illumina BI 101 415
* 20FUK.7 NA12878 Solexa-18483 illumina BI 101 388
* 20FUK.8 NA12878 Solexa-18484 illumina BI 101 415
* 20GAV.1 NA12878 Solexa-18483 illumina BI 101 388
* 20GAV.2 NA12878 Solexa-18484 illumina BI 101 415
* 20GAV.3 NA12878 Solexa-18483 illumina BI 101 388
* 20GAV.4 NA12878 Solexa-18484 illumina BI 101 416
* 20GAV.5 NA12878 Solexa-18483 illumina BI 101 388
* 20GAV.6 NA12878 Solexa-18484 illumina BI 101 415
* 20GAV.7 NA12878 Solexa-18483 illumina BI 101 387
* 20GAV.8 NA12878 Solexa-18484 illumina BI 101 414
* </pre>
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ReadGroupProperties
* -I example1.bam -I example2.bam etc
* -R reference.fasta
* -o example.gatkreport.txt
* </pre>
*
* @author Mark DePristo
*/
public class ReadGroupProperties extends ReadWalker<Integer, Integer> {
@Output
public PrintStream out;
@Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
public int MAX_VALUES_FOR_MEDIAN = 10000;
private final static String TABLE_NAME = "ReadGroupProperties";
private final Map<String, Median<Integer>> readLengths = new HashMap<String, Median<Integer>>();
private final Map<String, Median<Integer>> insertSizes = new HashMap<String, Median<Integer>>();
@Override
public void initialize() {
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
readLengths.put(rg.getId(), new Median<Integer>(MAX_VALUES_FOR_MEDIAN));
insertSizes.put(rg.getId(), new Median<Integer>(MAX_VALUES_FOR_MEDIAN));
}
}
@Override
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
}
@Override
public boolean isDone() {
// TODO -- this is far too slow!
return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes));
}
private final boolean anyMedianNeedsData(Map<String, Median<Integer>> medianMap) {
for ( Median<Integer> median : medianMap.values() ) {
if ( ! median.isFull() )
return true;
}
return false;
}
private final void updateMedian(final Median<Integer> median, final int value) {
median.add(value);
}
@Override
public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
final String rg = read.getReadGroup().getId();
updateMedian(readLengths.get(rg), read.getReadLength());
if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) {
//logger.info(rg + " => " + Math.abs(read.getInferredInsertSize()));
updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize()));
}
return null;
}
@Override
public Integer reduceInit() {
return null;
}
@Override
public Integer reduce(Integer integer, Integer integer1) {
return null;
}
@Override
public void onTraversalDone(Integer sum) {
final GATKReport report = new GATKReport();
report.addTable(TABLE_NAME, "Table of read group properties");
GATKReportTable table = report.getTable(TABLE_NAME);
table.addPrimaryKey("readgroup");
//* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
//* median read length for each read group in every BAM file.
table.addColumn("sample", "NA");
table.addColumn("library", "NA");
table.addColumn("platform", "NA");
table.addColumn("center", "NA");
table.addColumn("median.read.length", Integer.valueOf(0));
table.addColumn("median.insert.size", Integer.valueOf(0));
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
final String rgID = rg.getId();
table.set(rgID, "sample", rg.getSample());
table.set(rgID, "library", rg.getLibrary());
table.set(rgID, "platform", rg.getPlatform());
table.set(rgID, "center", rg.getSequencingCenter());
table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0));
table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0));
}
report.print(out);
}
}

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import java.util.*;
/**
* Utility class for calculating median from a data set, potentially limiting the size of data to a
* fixed amount
*
* @author Your Name
* @since Date created
*/
public class Median<T extends Comparable> {
final List<T> values;
final int maxValuesToKeep;
boolean sorted = false;
public Median() {
this(Integer.MAX_VALUE);
}
public Median(final int maxValuesToKeep) {
this.maxValuesToKeep = maxValuesToKeep;
this.values = new ArrayList<T>();
}
public boolean isFull() {
return values.size() >= maxValuesToKeep;
}
public int size() {
return values.size();
}
public boolean isEmpty() {
return values.isEmpty();
}
public T getMedian() {
if ( isEmpty() )
throw new IllegalStateException("Cannot get median value from empty array");
return getMedian(null); // note that value null will never be used
}
/**
* Returns the floor(n + 1 / 2) item from the list of values if the list
* has values, or defaultValue is the list is empty.
*/
public T getMedian(final T defaultValue) {
if ( isEmpty() )
return defaultValue;
if ( ! sorted ) {
sorted = true;
Collections.sort(values);
}
final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1;
return values.get(offset);
}
public boolean add(final T value) {
if ( ! isFull() ) {
sorted = false;
return values.add(value);
}
else
return false;
}
}

View File

@ -141,7 +141,7 @@ public abstract class BaseTest {
*/
public static class TestDataProvider {
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
private String name;
protected String name;
/**
* Create a new TestDataProvider instance bound to the class variable C

View File

@ -0,0 +1,123 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// our package
package org.broadinstitute.sting;
// the imports for unit testing.
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
import org.broadinstitute.sting.utils.Median;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class MedianUnitTest extends BaseTest {
LikelihoodCalculationEngine engine;
@BeforeSuite
public void before() {
engine = new LikelihoodCalculationEngine(0, 0, false);
}
// --------------------------------------------------------------------------------
//
// Provider
//
// --------------------------------------------------------------------------------
private class MedianTestProvider extends TestDataProvider {
final List<Integer> values = new ArrayList<Integer>();
final int cap;
final Integer expected;
public MedianTestProvider(int expected, int cap, Integer ... values) {
super(MedianTestProvider.class);
this.expected = expected;
this.cap = cap;
this.values.addAll(Arrays.asList(values));
this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap);
}
}
@DataProvider(name = "MedianTestProvider")
public Object[][] makeMedianTestProvider() {
new MedianTestProvider(1, 1000, 0, 1, 2);
new MedianTestProvider(1, 1000, 1, 0, 1, 2);
new MedianTestProvider(1, 1000, 0, 1, 2, 3);
new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4);
new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0);
new MedianTestProvider(1, 1000, 1);
new MedianTestProvider(2, 1000, 2);
new MedianTestProvider(1, 1000, 1, 2);
new MedianTestProvider(1, 3, 1);
new MedianTestProvider(1, 3, 1, 2);
new MedianTestProvider(2, 3, 1, 2, 3);
new MedianTestProvider(2, 3, 1, 2, 3, 4);
new MedianTestProvider(2, 3, 1, 2, 3, 4, 5);
new MedianTestProvider(1, 3, 1);
new MedianTestProvider(1, 3, 1, 2);
new MedianTestProvider(2, 3, 3, 2, 1);
new MedianTestProvider(3, 3, 4, 3, 2, 1);
new MedianTestProvider(4, 3, 5, 4, 3, 2, 1);
return MedianTestProvider.getTests(MedianTestProvider.class);
}
@Test(dataProvider = "MedianTestProvider")
public void testBasicLikelihoods(MedianTestProvider cfg) {
final Median<Integer> median = new Median<Integer>(cfg.cap);
int nAdded = 0;
for ( final int value : cfg.values )
if ( median.add(value) )
nAdded++;
Assert.assertEquals(nAdded, median.size());
Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty());
Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull());
Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString());
}
@Test(expectedExceptions = IllegalStateException.class)
public void testEmptyMedian() {
final Median<Integer> median = new Median<Integer>();
Assert.assertTrue(median.isEmpty());
final Integer d = 100;
Assert.assertEquals(median.getMedian(d), d);
median.getMedian();
}
}

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Tests ReadGroupProperties
*/
public class ReadGroupPropertiesIntegrationTest extends WalkerTest {
@Test
public void basicTest() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s",
1,
Arrays.asList("1795e3157ab23e7e597acec334e29525"));
executeTest("ReadGroupProperties:", spec);
}
}