ReadGroupProperties walker and associated infrastructure
-- ReadGroupProperties: Emits a GATKReport containing read group, sample, library, platform, center, median insert size and median read length for each read group in every BAM file. -- Median tool that collects up to a given maximum number of elements and returns the median of the elements. -- Unit and integration tests for everything. -- Making name of TestProvider protected so subclasses and override name more easily
This commit is contained in:
parent
88e060e1e9
commit
aff508e091
|
|
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.Median;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
|
||||
* median read length for each read group in every BAM file.
|
||||
*
|
||||
* Note that this walker stops when all read groups have been observed at least a few thousand times so that
|
||||
* the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate
|
||||
* timeframe.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Any number of BAM files
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
|
||||
*
|
||||
* For example, running this tool on the NA12878 data sets:
|
||||
*
|
||||
* <pre>
|
||||
* ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
|
||||
* readgroup sample library platform center median.read.length median.insert.size
|
||||
* 20FUK.1 NA12878 Solexa-18483 illumina BI 101 387
|
||||
* 20FUK.2 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20FUK.3 NA12878 Solexa-18483 illumina BI 101 388
|
||||
* 20FUK.4 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20FUK.5 NA12878 Solexa-18483 illumina BI 101 387
|
||||
* 20FUK.6 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20FUK.7 NA12878 Solexa-18483 illumina BI 101 388
|
||||
* 20FUK.8 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20GAV.1 NA12878 Solexa-18483 illumina BI 101 388
|
||||
* 20GAV.2 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20GAV.3 NA12878 Solexa-18483 illumina BI 101 388
|
||||
* 20GAV.4 NA12878 Solexa-18484 illumina BI 101 416
|
||||
* 20GAV.5 NA12878 Solexa-18483 illumina BI 101 388
|
||||
* 20GAV.6 NA12878 Solexa-18484 illumina BI 101 415
|
||||
* 20GAV.7 NA12878 Solexa-18483 illumina BI 101 387
|
||||
* 20GAV.8 NA12878 Solexa-18484 illumina BI 101 414
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ReadGroupProperties
|
||||
* -I example1.bam -I example2.bam etc
|
||||
* -R reference.fasta
|
||||
* -o example.gatkreport.txt
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
|
||||
|
||||
|
||||
public class ReadGroupProperties extends ReadWalker<Integer, Integer> {
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
|
||||
public int MAX_VALUES_FOR_MEDIAN = 10000;
|
||||
|
||||
private final static String TABLE_NAME = "ReadGroupProperties";
|
||||
private final Map<String, Median<Integer>> readLengths = new HashMap<String, Median<Integer>>();
|
||||
private final Map<String, Median<Integer>> insertSizes = new HashMap<String, Median<Integer>>();
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
readLengths.put(rg.getId(), new Median<Integer>(MAX_VALUES_FOR_MEDIAN));
|
||||
insertSizes.put(rg.getId(), new Median<Integer>(MAX_VALUES_FOR_MEDIAN));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
|
||||
return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isDone() {
|
||||
// TODO -- this is far too slow!
|
||||
return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes));
|
||||
}
|
||||
|
||||
private final boolean anyMedianNeedsData(Map<String, Median<Integer>> medianMap) {
|
||||
for ( Median<Integer> median : medianMap.values() ) {
|
||||
if ( ! median.isFull() )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private final void updateMedian(final Median<Integer> median, final int value) {
|
||||
median.add(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
|
||||
final String rg = read.getReadGroup().getId();
|
||||
|
||||
updateMedian(readLengths.get(rg), read.getReadLength());
|
||||
if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) {
|
||||
//logger.info(rg + " => " + Math.abs(read.getInferredInsertSize()));
|
||||
updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize()));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer integer, Integer integer1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer sum) {
|
||||
final GATKReport report = new GATKReport();
|
||||
report.addTable(TABLE_NAME, "Table of read group properties");
|
||||
GATKReportTable table = report.getTable(TABLE_NAME);
|
||||
|
||||
table.addPrimaryKey("readgroup");
|
||||
//* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
|
||||
//* median read length for each read group in every BAM file.
|
||||
table.addColumn("sample", "NA");
|
||||
table.addColumn("library", "NA");
|
||||
table.addColumn("platform", "NA");
|
||||
table.addColumn("center", "NA");
|
||||
table.addColumn("median.read.length", Integer.valueOf(0));
|
||||
table.addColumn("median.insert.size", Integer.valueOf(0));
|
||||
|
||||
for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
|
||||
final String rgID = rg.getId();
|
||||
table.set(rgID, "sample", rg.getSample());
|
||||
table.set(rgID, "library", rg.getLibrary());
|
||||
table.set(rgID, "platform", rg.getPlatform());
|
||||
table.set(rgID, "center", rg.getSequencingCenter());
|
||||
table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0));
|
||||
table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0));
|
||||
}
|
||||
|
||||
report.print(out);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utility class for calculating median from a data set, potentially limiting the size of data to a
|
||||
* fixed amount
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class Median<T extends Comparable> {
|
||||
final List<T> values;
|
||||
final int maxValuesToKeep;
|
||||
boolean sorted = false;
|
||||
|
||||
public Median() {
|
||||
this(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
public Median(final int maxValuesToKeep) {
|
||||
this.maxValuesToKeep = maxValuesToKeep;
|
||||
this.values = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public boolean isFull() {
|
||||
return values.size() >= maxValuesToKeep;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return values.size();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return values.isEmpty();
|
||||
}
|
||||
|
||||
public T getMedian() {
|
||||
if ( isEmpty() )
|
||||
throw new IllegalStateException("Cannot get median value from empty array");
|
||||
return getMedian(null); // note that value null will never be used
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the floor(n + 1 / 2) item from the list of values if the list
|
||||
* has values, or defaultValue is the list is empty.
|
||||
*/
|
||||
public T getMedian(final T defaultValue) {
|
||||
if ( isEmpty() )
|
||||
return defaultValue;
|
||||
|
||||
if ( ! sorted ) {
|
||||
sorted = true;
|
||||
Collections.sort(values);
|
||||
}
|
||||
|
||||
final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1;
|
||||
return values.get(offset);
|
||||
}
|
||||
|
||||
public boolean add(final T value) {
|
||||
if ( ! isFull() ) {
|
||||
sorted = false;
|
||||
return values.add(value);
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -141,7 +141,7 @@ public abstract class BaseTest {
|
|||
*/
|
||||
public static class TestDataProvider {
|
||||
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
|
||||
private String name;
|
||||
protected String name;
|
||||
|
||||
/**
|
||||
* Create a new TestDataProvider instance bound to the class variable C
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
|
||||
import org.broadinstitute.sting.utils.Median;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class MedianUnitTest extends BaseTest {
|
||||
LikelihoodCalculationEngine engine;
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
engine = new LikelihoodCalculationEngine(0, 0, false);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Provider
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private class MedianTestProvider extends TestDataProvider {
|
||||
final List<Integer> values = new ArrayList<Integer>();
|
||||
final int cap;
|
||||
final Integer expected;
|
||||
|
||||
public MedianTestProvider(int expected, int cap, Integer ... values) {
|
||||
super(MedianTestProvider.class);
|
||||
this.expected = expected;
|
||||
this.cap = cap;
|
||||
this.values.addAll(Arrays.asList(values));
|
||||
this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MedianTestProvider")
|
||||
public Object[][] makeMedianTestProvider() {
|
||||
new MedianTestProvider(1, 1000, 0, 1, 2);
|
||||
new MedianTestProvider(1, 1000, 1, 0, 1, 2);
|
||||
new MedianTestProvider(1, 1000, 0, 1, 2, 3);
|
||||
new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4);
|
||||
new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0);
|
||||
new MedianTestProvider(1, 1000, 1);
|
||||
new MedianTestProvider(2, 1000, 2);
|
||||
new MedianTestProvider(1, 1000, 1, 2);
|
||||
|
||||
new MedianTestProvider(1, 3, 1);
|
||||
new MedianTestProvider(1, 3, 1, 2);
|
||||
new MedianTestProvider(2, 3, 1, 2, 3);
|
||||
new MedianTestProvider(2, 3, 1, 2, 3, 4);
|
||||
new MedianTestProvider(2, 3, 1, 2, 3, 4, 5);
|
||||
|
||||
new MedianTestProvider(1, 3, 1);
|
||||
new MedianTestProvider(1, 3, 1, 2);
|
||||
new MedianTestProvider(2, 3, 3, 2, 1);
|
||||
new MedianTestProvider(3, 3, 4, 3, 2, 1);
|
||||
new MedianTestProvider(4, 3, 5, 4, 3, 2, 1);
|
||||
|
||||
return MedianTestProvider.getTests(MedianTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MedianTestProvider")
|
||||
public void testBasicLikelihoods(MedianTestProvider cfg) {
|
||||
final Median<Integer> median = new Median<Integer>(cfg.cap);
|
||||
|
||||
int nAdded = 0;
|
||||
for ( final int value : cfg.values )
|
||||
if ( median.add(value) )
|
||||
nAdded++;
|
||||
|
||||
Assert.assertEquals(nAdded, median.size());
|
||||
|
||||
Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty());
|
||||
Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull());
|
||||
Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString());
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalStateException.class)
|
||||
public void testEmptyMedian() {
|
||||
final Median<Integer> median = new Median<Integer>();
|
||||
Assert.assertTrue(median.isEmpty());
|
||||
final Integer d = 100;
|
||||
Assert.assertEquals(median.getMedian(d), d);
|
||||
median.getMedian();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Tests ReadGroupProperties
|
||||
*/
|
||||
public class ReadGroupPropertiesIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void basicTest() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s",
|
||||
1,
|
||||
Arrays.asList("1795e3157ab23e7e597acec334e29525"));
|
||||
executeTest("ReadGroupProperties:", spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue