From a8fb26bf0167147bae2c3896e41be5049dd0bb48 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 6 Mar 2013 21:39:18 -0500 Subject: [PATCH] A generic downsampler that reduces coverage for a bunch of reads -- Exposed the underlying minElementsPerStack parameter for LevelingDownsampler --- .../gatk/downsampling/DownsamplingUtils.java | 107 ++++++++++++++++++ .../downsampling/LevelingDownsampler.java | 26 ++++- .../walkers/readutils/DownsampleReadsQC.java | 105 +++++++++++++++++ 3 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java new file mode 100644 index 000000000..877083829 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingUtils.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Utilities for using the downsamplers for common tasks + * + * User: depristo + * Date: 3/6/13 + * Time: 4:26 PM + */ +public class DownsamplingUtils { + private DownsamplingUtils() { } + + /** + * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing + * coverage at any read start to less than minReadsPerAlignmentStart + * + * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and + * want to reduce the coverage of the big peak down without removing the many reads at the edge of this + * interval that are in fact good + * + * This algorithm separately operates on the reads for each sample independently. + * + * @param reads a sorted list of reads + * @param downsampleTo the targeted number of reads we want from reads per sample + * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start + * to below this. That is, if this value is 2, we'll never reduce the number + * of reads starting at a specific start site to less than 2 + * @return a sorted list of reads + */ + public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { + if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); + + final List downsampled = new ArrayList(reads.size()); + + final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); + for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { + final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); + downsampler.submit(readsByPosMap.values()); + downsampler.signalEndOfInput(); + for ( final List downsampledReads : downsampler.consumeFinalizedItems()) + downsampled.addAll(downsampledReads); + } + + return ReadUtils.sortReadsByCoordinate(downsampled); + } + + /** + * Build the data structure mapping for each sample -> (position -> reads at position) + * + * Note that the map position -> reads isn't ordered in any meaningful way + * + * @param reads a list of sorted reads + * @return a map containing the list of reads at each start location, for each sample independently + */ + private static Map>> partitionReadsBySampleAndStart(final List reads) { + final Map>> readsBySampleByStart = new LinkedHashMap>>(); + + for ( final GATKSAMRecord read : reads ) { + Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); + + if ( readsByStart == null ) { + readsByStart = new LinkedHashMap>(); + readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); + } + + List readsAtStart = readsByStart.get(read.getAlignmentStart()); + if ( readsAtStart == null ) { + readsAtStart = new LinkedList(); + readsByStart.put(read.getAlignmentStart(), readsAtStart); + } + + readsAtStart.add(read); + } + + return readsBySampleByStart; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java index 9b4b2adcb..a8a808333 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -47,8 +47,8 @@ import java.util.*; * @author David Roazen */ public class LevelingDownsampler, E> implements Downsampler { - - private int targetSize; + private final int minElementsPerStack; + private final int targetSize; private List groups; @@ -59,12 +59,32 @@ public class LevelingDownsampler, E> implements Downsampler /** * Construct a LevelingDownsampler * + * Uses the default minElementsPerStack of 1 + * * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed * this value -- if it does, items are removed from Lists evenly until the total size * is <= this value */ public LevelingDownsampler( int targetSize ) { + this(targetSize, 1); + } + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, + * if a stack has only 3 elements and minElementsPerStack is 3, no matter what + * we'll not reduce this stack below 3. + */ + public LevelingDownsampler(final int targetSize, final int minElementsPerStack) { + if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); + if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); + this.targetSize = targetSize; + this.minElementsPerStack = minElementsPerStack; clear(); reset(); } @@ -148,7 +168,7 @@ public class LevelingDownsampler, E> implements Downsampler // remove any more items without violating the constraint that all groups must // be left with at least one item while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > 1 ) { + if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { groupSizes[currentGroupIndex]--; numItemsToRemove--; numConsecutiveUmodifiableGroups = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java new file mode 100644 index 000000000..1141a9164 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/DownsampleReadsQC.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; + +/** + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class DownsampleReadsQC extends ReadWalker> implements NanoSchedulable { + @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) + StingSAMFileWriter out; + + @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false) + private int minReadsPerAlignmentStart = 5; + + @Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false) + private int downsampleTo = 1000; + + /** + * The initialize function. + */ + public void initialize() { +// final boolean preSorted = true; +// if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { +// Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); +// } + } + + /** + * The reads map function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param readIn the read itself, as a GATKSAMRecord + * @return the read itself + */ + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + return readIn; + } + + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise + */ + public Collection reduceInit() { + return new LinkedList(); + } + + /** + * given a read and a output location, reduce by emitting the read + * + * @param read the read itself + * @param output the output source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public Collection reduce( GATKSAMRecord read, Collection output ) { + output.add(read); + return output; + } + + @Override + public void onTraversalDone(Collection result) { + for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList(result), downsampleTo, minReadsPerAlignmentStart) ) + out.addAlignment(read); + } +}