A generic downsampler that reduces coverage for a collection of reads
-- Exposed the underlying minElementsPerStack parameter for LevelingDownsampler
This commit is contained in:
parent
752440707d
commit
a8fb26bf01
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utilities for using the downsamplers for common tasks
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/6/13
|
||||
* Time: 4:26 PM
|
||||
*/
|
||||
public class DownsamplingUtils {
|
||||
private DownsamplingUtils() { }
|
||||
|
||||
/**
|
||||
* Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing
|
||||
* coverage at any read start to less than minReadsPerAlignmentStart
|
||||
*
|
||||
* This algorithm can be used to handle the situation where you have lots of coverage in some interval, and
|
||||
* want to reduce the coverage of the big peak down without removing the many reads at the edge of this
|
||||
* interval that are in fact good
|
||||
*
|
||||
* This algorithm separately operates on the reads for each sample independently.
|
||||
*
|
||||
* @param reads a sorted list of reads
|
||||
* @param downsampleTo the targeted number of reads we want from reads per sample
|
||||
* @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start
|
||||
* to below this. That is, if this value is 2, we'll never reduce the number
|
||||
* of reads starting at a specific start site to less than 2
|
||||
* @return a sorted list of reads
|
||||
*/
|
||||
public static List<GATKSAMRecord> levelCoverageByPosition(final List<GATKSAMRecord> reads, final int downsampleTo, final int minReadsPerAlignmentStart) {
|
||||
if ( reads == null ) throw new IllegalArgumentException("reads must not be null");
|
||||
|
||||
final List<GATKSAMRecord> downsampled = new ArrayList<GATKSAMRecord>(reads.size());
|
||||
|
||||
final Map<String, Map<Integer, List<GATKSAMRecord>>> readsBySampleByStart = partitionReadsBySampleAndStart(reads);
|
||||
for ( final Map<Integer, List<GATKSAMRecord>> readsByPosMap : readsBySampleByStart.values() ) {
|
||||
final LevelingDownsampler<List<GATKSAMRecord>, GATKSAMRecord> downsampler = new LevelingDownsampler<List<GATKSAMRecord>, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart);
|
||||
downsampler.submit(readsByPosMap.values());
|
||||
downsampler.signalEndOfInput();
|
||||
for ( final List<GATKSAMRecord> downsampledReads : downsampler.consumeFinalizedItems())
|
||||
downsampled.addAll(downsampledReads);
|
||||
}
|
||||
|
||||
return ReadUtils.sortReadsByCoordinate(downsampled);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the data structure mapping for each sample -> (position -> reads at position)
|
||||
*
|
||||
* Note that the map position -> reads isn't ordered in any meaningful way
|
||||
*
|
||||
* @param reads a list of sorted reads
|
||||
* @return a map containing the list of reads at each start location, for each sample independently
|
||||
*/
|
||||
private static Map<String, Map<Integer, List<GATKSAMRecord>>> partitionReadsBySampleAndStart(final List<GATKSAMRecord> reads) {
|
||||
final Map<String, Map<Integer, List<GATKSAMRecord>>> readsBySampleByStart = new LinkedHashMap<String, Map<Integer, List<GATKSAMRecord>>>();
|
||||
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
Map<Integer, List<GATKSAMRecord>> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample());
|
||||
|
||||
if ( readsByStart == null ) {
|
||||
readsByStart = new LinkedHashMap<Integer, List<GATKSAMRecord>>();
|
||||
readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart);
|
||||
}
|
||||
|
||||
List<GATKSAMRecord> readsAtStart = readsByStart.get(read.getAlignmentStart());
|
||||
if ( readsAtStart == null ) {
|
||||
readsAtStart = new LinkedList<GATKSAMRecord>();
|
||||
readsByStart.put(read.getAlignmentStart(), readsAtStart);
|
||||
}
|
||||
|
||||
readsAtStart.add(read);
|
||||
}
|
||||
|
||||
return readsBySampleByStart;
|
||||
}
|
||||
}
|
||||
|
|
@ -47,8 +47,8 @@ import java.util.*;
|
|||
* @author David Roazen
|
||||
*/
|
||||
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
|
||||
|
||||
private int targetSize;
|
||||
private final int minElementsPerStack;
|
||||
private final int targetSize;
|
||||
|
||||
private List<T> groups;
|
||||
|
||||
|
|
@ -59,12 +59,32 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
/**
|
||||
* Construct a LevelingDownsampler
|
||||
*
|
||||
* Uses the default minElementsPerStack of 1
|
||||
*
|
||||
* @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
|
||||
* this value -- if it does, items are removed from Lists evenly until the total size
|
||||
* is <= this value
|
||||
*/
|
||||
// Convenience constructor: uses the default minElementsPerStack of 1, meaning
// downsampling may reduce any individual stack down to (but never below) a single element.
public LevelingDownsampler( int targetSize ) {
    this(targetSize, 1);
}
|
||||
|
||||
/**
|
||||
* Construct a LevelingDownsampler
|
||||
*
|
||||
* @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
|
||||
* this value -- if it does, items are removed from Lists evenly until the total size
|
||||
* is <= this value
|
||||
* @param minElementsPerStack no stack will be reduced below this size during downsampling. That is,
|
||||
* if a stack has only 3 elements and minElementsPerStack is 3, no matter what
|
||||
* we'll not reduce this stack below 3.
|
||||
*/
|
||||
// Primary constructor: validates both leveling parameters, then initializes internal state.
public LevelingDownsampler(final int targetSize, final int minElementsPerStack) {
    // reject negative values up front so misconfiguration fails fast with a clear message
    if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize);
    if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack);

    this.targetSize = targetSize;
    this.minElementsPerStack = minElementsPerStack;
    // NOTE(review): both clear() and reset() appear here in this diff view -- it may be
    // that one replaced the other; confirm against the committed file that both calls are intended.
    clear();
    reset();
}
|
||||
|
|
@ -148,7 +168,7 @@ public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T>
|
|||
// remove any more items without violating the constraint that all groups must
|
||||
// be left with at least one item
|
||||
while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) {
|
||||
if ( groupSizes[currentGroupIndex] > 1 ) {
|
||||
if ( groupSizes[currentGroupIndex] > minElementsPerStack ) {
|
||||
groupSizes[currentGroupIndex]--;
|
||||
numItemsToRemove--;
|
||||
numConsecutiveUmodifiableGroups = 0;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.readutils;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class DownsampleReadsQC extends ReadWalker<GATKSAMRecord, Collection<GATKSAMRecord>> implements NanoSchedulable {
|
||||
@Output(doc="Write output to this BAM filename instead of STDOUT", required = true)
|
||||
StingSAMFileWriter out;
|
||||
|
||||
@Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignmentStart", doc ="", required = false)
|
||||
private int minReadsPerAlignmentStart = 5;
|
||||
|
||||
@Argument(fullName = "downsampleTo", shortName = "downsampleTo", doc ="", required = false)
|
||||
private int downsampleTo = 1000;
|
||||
|
||||
/**
|
||||
* The initialize function.
|
||||
*/
|
||||
public void initialize() {
|
||||
// final boolean preSorted = true;
|
||||
// if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
|
||||
// Utils.setupWriter(out, getToolkit(), getToolkit().getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
|
||||
// }
|
||||
}
|
||||
|
||||
/**
|
||||
* The reads map function.
|
||||
*
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param readIn the read itself, as a GATKSAMRecord
|
||||
* @return the read itself
|
||||
*/
|
||||
public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
|
||||
return readIn;
|
||||
}
|
||||
|
||||
/**
|
||||
* reduceInit is called once before any calls to the map function. We use it here to setup the output
|
||||
* bam file, if it was specified on the command line
|
||||
*
|
||||
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
||||
*/
|
||||
public Collection<GATKSAMRecord> reduceInit() {
|
||||
return new LinkedList<GATKSAMRecord>();
|
||||
}
|
||||
|
||||
/**
|
||||
* given a read and a output location, reduce by emitting the read
|
||||
*
|
||||
* @param read the read itself
|
||||
* @param output the output source
|
||||
* @return the SAMFileWriter, so that the next reduce can emit to the same source
|
||||
*/
|
||||
public Collection<GATKSAMRecord> reduce( GATKSAMRecord read, Collection<GATKSAMRecord> output ) {
|
||||
output.add(read);
|
||||
return output;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Collection<GATKSAMRecord> result) {
|
||||
for ( final GATKSAMRecord read : DownsamplingUtils.levelCoverageByPosition(new ArrayList<GATKSAMRecord>(result), downsampleTo, minReadsPerAlignmentStart) )
|
||||
out.addAlignment(read);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue