A generic downsampler that reduces coverage for a collection of reads

-- Exposed the underlying minElementsPerStack parameter for LevelingDownsampler
This commit is contained in:
Mark DePristo 2013-03-06 21:39:18 -05:00
parent 752440707d
commit a8fb26bf01
3 changed files with 235 additions and 3 deletions

View File

@ -0,0 +1,107 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
 * Utilities for using the downsamplers for common tasks
 *
 * User: depristo
 * Date: 3/6/13
 * Time: 4:26 PM
 */
public class DownsamplingUtils {
    // static utility holder -- never instantiated
    private DownsamplingUtils() { }

    /**
     * Level the coverage of the reads in each sample to no more than downsampleTo reads, without
     * reducing coverage at any read start to less than minReadsPerAlignmentStart
     *
     * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and
     * want to reduce the coverage of the big peak down without removing the many reads at the edge of this
     * interval that are in fact good
     *
     * This algorithm operates on the reads of each sample independently.
     *
     * @param reads a sorted list of reads
     * @param downsampleTo the targeted number of reads we want from reads per sample
     * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start
     *                                  to below this.  That is, if this value is 2, we'll never reduce the number
     *                                  of reads starting at a specific start site to less than 2
     * @return a sorted list of reads
     */
    public static List<GATKSAMRecord> levelCoverageByPosition(final List<GATKSAMRecord> reads, final int downsampleTo, final int minReadsPerAlignmentStart) {
        if ( reads == null ) throw new IllegalArgumentException("reads must not be null");

        final List<GATKSAMRecord> keptReads = new ArrayList<GATKSAMRecord>(reads.size());
        final Map<String, Map<Integer, List<GATKSAMRecord>>> bySampleByStart = partitionReadsBySampleAndStart(reads);

        // level each sample independently: each map value is (start position -> reads at that start)
        for ( final Map<Integer, List<GATKSAMRecord>> startToReads : bySampleByStart.values() ) {
            final LevelingDownsampler<List<GATKSAMRecord>, GATKSAMRecord> leveler =
                    new LevelingDownsampler<List<GATKSAMRecord>, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart);
            leveler.submit(startToReads.values());
            leveler.signalEndOfInput();
            for ( final List<GATKSAMRecord> survivingStack : leveler.consumeFinalizedItems() ) {
                keptReads.addAll(survivingStack);
            }
        }

        // the per-sample partitioning destroys global ordering, so re-sort before returning
        return ReadUtils.sortReadsByCoordinate(keptReads);
    }

    /**
     * Build the data structure mapping sample -> (alignment start -> reads at that start)
     *
     * Note that the inner position -> reads map isn't ordered in any meaningful way
     *
     * @param reads a list of sorted reads
     * @return a map containing the list of reads at each start location, for each sample independently
     */
    private static Map<String, Map<Integer, List<GATKSAMRecord>>> partitionReadsBySampleAndStart(final List<GATKSAMRecord> reads) {
        final Map<String, Map<Integer, List<GATKSAMRecord>>> bySampleByStart =
                new LinkedHashMap<String, Map<Integer, List<GATKSAMRecord>>>();

        for ( final GATKSAMRecord read : reads ) {
            final String sample = read.getReadGroup().getSample();
            final int start = read.getAlignmentStart();

            Map<Integer, List<GATKSAMRecord>> startToReads = bySampleByStart.get(sample);
            if ( startToReads == null ) {
                startToReads = new LinkedHashMap<Integer, List<GATKSAMRecord>>();
                bySampleByStart.put(sample, startToReads);
            }

            List<GATKSAMRecord> stack = startToReads.get(start);
            if ( stack == null ) {
                stack = new LinkedList<GATKSAMRecord>();
                startToReads.put(start, stack);
            }

            stack.add(read);
        }

        return bySampleByStart;
    }
}

View File

@ -47,8 +47,8 @@ import java.util.*;
* @author David Roazen
*/
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
private int targetSize;
private final int minElementsPerStack;
private final int targetSize;
private List<T> groups;
@ -59,12 +59,32 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
/**
 * Construct a LevelingDownsampler
 *
 * Uses the default minElementsPerStack of 1
 *
 * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
 *                   this value -- if it does, items are removed from Lists evenly until the total size
 *                   is <= this value
 */
public LevelingDownsampler( int targetSize ) {
// delegate to the two-argument constructor with a floor of 1 element per stack
this(targetSize, 1);
}
/**
 * Construct a LevelingDownsampler
 *
 * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
 *                   this value -- if it does, items are removed from Lists evenly until the total size
 *                   is <= this value
 * @param minElementsPerStack no stack will be reduced below this size during downsampling.  That is,
 *                            if a stack has only 3 elements and minElementsPerStack is 3, no matter what
 *                            we'll not reduce this stack below 3.
 * @throws IllegalArgumentException if targetSize or minElementsPerStack is negative
 */
public LevelingDownsampler(final int targetSize, final int minElementsPerStack) {
if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize);
if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack);
this.targetSize = targetSize;
this.minElementsPerStack = minElementsPerStack;
// NOTE(review): both clear() and reset() appear here -- this may be a diff-view
// artifact of one line replacing the other; confirm against the full class source
clear();
reset();
}
@ -148,7 +168,7 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
// remove any more items without violating the constraint that all groups must
// be left with at least minElementsPerStack items
while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) {
if ( groupSizes[currentGroupIndex] > 1 ) {
if ( groupSizes[currentGroupIndex] > minElementsPerStack ) {
groupSizes[currentGroupIndex]--;
numItemsToRemove--;
numConsecutiveUmodifiableGroups = 0;

View File

@ -0,0 +1,105 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.readutils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
/**
 * Read walker that re-emits its input reads after leveling coverage by alignment start.
 *
 * map/reduce simply accumulate every read seen during traversal; in onTraversalDone the
 * accumulated reads are downsampled via DownsamplingUtils.levelCoverageByPosition and
 * the surviving reads are written to the output BAM.
 */
@Requires({DataSource.READS, DataSource.REFERENCE})
public class DownsampleReadsQC extends ReadWalker<GATKSAMRecord, Collection<GATKSAMRecord>> implements NanoSchedulable {

    @Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
    StingSAMFileWriter out;

    // floor on the number of reads retained at any single alignment start site
    @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false)
    private int minReadsPerAlignmentStart = 5;

    // target number of reads to keep per sample after leveling
    @Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false)
    private int downsampleTo = 1000;

    /**
     * The initialize function.
     *
     * Intentionally a no-op; the commented-out scaffolding below is retained from a
     * related writer-setup path and is never executed.
     */
    public void initialize() {
        // final boolean preSorted = true;
        // if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
        // Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
        // }
    }

    /**
     * The reads map function: a pass-through that forwards each read unchanged.
     *
     * @param ref the reference bases that correspond to our read, if a reference was provided
     * @param readIn the read itself, as a GATKSAMRecord
     * @param metaDataTracker access to reference-ordered metadata (unused here)
     * @return the read itself, unmodified
     */
    public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
        return readIn;
    }

    /**
     * reduceInit is called once before any calls to the map function.
     *
     * @return an empty mutable collection that will accumulate every read seen during
     *         traversal (note: not a SAMFileWriter -- output happens in onTraversalDone)
     */
    public Collection<GATKSAMRecord> reduceInit() {
        return new LinkedList<GATKSAMRecord>();
    }

    /**
     * Accumulate each mapped read into the running collection.
     *
     * @param read the read itself
     * @param output the collection accumulating all reads seen so far
     * @return the same collection, with read appended
     */
    public Collection<GATKSAMRecord> reduce( GATKSAMRecord read, Collection<GATKSAMRecord> output ) {
        output.add(read);
        return output;
    }

    /**
     * Downsample the accumulated reads so coverage is leveled to at most downsampleTo
     * reads per sample (never reducing any start site below minReadsPerAlignmentStart)
     * and write the surviving, coordinate-sorted reads to the output BAM.
     */
    @Override
    public void onTraversalDone(Collection<GATKSAMRecord> result) {
        for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList<GATKSAMRecord>(result), downsampleTo, minReadsPerAlignmentStart) )
            out.addAlignment(read);
    }
}