Merge branch 'master' of ssh://gsa4.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-06-13 20:59:30 -04:00
commit cd2074b1dc
9 changed files with 1168 additions and 0 deletions

View File

@ -0,0 +1,76 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import java.util.Collection;
import java.util.List;
/**
* The basic downsampler API, with no reads-specific operations
*
* @author David Roazen
*/
public interface Downsampler<T> {
/*
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
* immediately whether the item survives the downsampling process, while others will need to see
* more items before making that determination.
*/
public void submit( T item );
/*
* Submit a collection of items to the downsampler for consideration.
*/
public void submit( Collection<T> items );
/*
* Are there items that have survived the downsampling process waiting to be retrieved?
*/
public boolean hasDownsampledItems();
/*
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
*/
public List<T> consumeDownsampledItems();
/*
* Are there items stored in this downsampler that it doesn't yet know whether they will
* ultimately survive the downsampling process?
*/
public boolean hasPendingItems();
/*
* Used to tell the downsampler that no more items will be submitted to it, and that it should
* finalize any pending items.
*/
public void signalEndOfInput();
/*
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
* information.
*/
public void clear();
}

View File

@ -0,0 +1,98 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* StingSAMIterator wrapper around our generic reads downsampler interface
*
* @author David Roazen
*/
public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator;
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter;
this.downsampler = downsampler;
fillDownsampledReadsCache();
}
public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) {
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
}
public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
throw new NoSuchElementException("next() called when there are no more items");
}
return downsampledReadsCacheIterator.next();
}
private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
downsampler.submit(nestedSAMIterator.next());
}
if ( ! nestedSAMIterator.hasNext() ) {
downsampler.signalEndOfInput();
}
downsampledReadsCache = downsampler.consumeDownsampledItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext();
}
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
public void close() {
nestedSAMIterator.close();
}
public Iterator<SAMRecord> iterator() {
return this;
}
}

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
*
* @author David Roazen
*/
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private ArrayList<T> selectedReads;
private int cutoffForInclusion;
private static final int RANDOM_POOL_SIZE = 10000;
public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
}
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear();
}
public void submit( T newRead ) {
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
selectedReads.add(newRead);
}
}
public void submit( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return selectedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> downsampledItems = selectedReads;
clear();
return downsampledItems;
}
public boolean hasPendingItems() {
return false;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
selectedReads = new ArrayList<T>();
}
public boolean requiresCoordinateSortOrder() {
return false;
}
}

View File

@ -0,0 +1,259 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
*
* @author David Roazen
*/
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private int targetCoverage;
private ReservoirDownsampler<T> reservoir;
private int currentContigIndex;
private int currentAlignmentStart;
private LinkedList<PositionalReadGrouping> pendingReads;
private ArrayList<T> finalizedReads;
public PositionalDownsampler ( int targetCoverage ) {
this.targetCoverage = targetCoverage;
clear();
}
public void submit ( T newRead ) {
if ( readIsPastCurrentPosition(newRead) ) {
updateAndDownsamplePendingReads();
}
reservoir.submit(newRead);
updateCurrentPosition(newRead);
}
public void submit ( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return finalizedReads.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> toReturn = finalizedReads;
finalizedReads = new ArrayList<T>();
return toReturn;
}
public boolean hasPendingItems() {
return pendingReads.size() > 0;
}
public void signalEndOfInput() {
updateAndDownsamplePendingReads();
for ( PositionalReadGrouping group : pendingReads ) {
group.finalizeAllActiveReads();
finalizedReads.addAll(group.getFinalizedReads());
}
pendingReads.clear();
}
public void clear() {
reservoir = new ReservoirDownsampler<T>(targetCoverage);
pendingReads = new LinkedList<PositionalReadGrouping>();
finalizedReads = new ArrayList<T>();
}
public boolean requiresCoordinateSortOrder() {
return true;
}
private void updateCurrentPosition ( T read ) {
currentContigIndex = read.getReferenceIndex();
currentAlignmentStart = read.getAlignmentStart();
}
private boolean readIsPastCurrentPosition ( T read ) {
return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
}
private void updateAndDownsamplePendingReads() {
finalizeOutOfScopeReads();
List<T> oldLocusReads = reservoir.consumeDownsampledItems();
pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
downsampleOverlappingGroups();
}
private void finalizeOutOfScopeReads() {
Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
boolean noPrecedingUnfinalizedGroups = true;
while ( iter.hasNext() ) {
PositionalReadGrouping currentGroup = iter.next();
currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
iter.remove();
finalizedReads.addAll(currentGroup.getFinalizedReads());
}
else {
noPrecedingUnfinalizedGroups = false;
}
}
}
private void downsampleOverlappingGroups() {
int[] groupReadCounts = new int[pendingReads.size()];
int totalCoverage = 0;
int numActiveGroups = 0;
int currentGroup = 0;
for ( PositionalReadGrouping group : pendingReads ) {
groupReadCounts[currentGroup] = group.numActiveReads();
totalCoverage += groupReadCounts[currentGroup];
if ( groupReadCounts[currentGroup] > 0 ) {
numActiveGroups++;
}
currentGroup++;
}
if ( totalCoverage <= targetCoverage ) {
return;
}
int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
currentGroup = 0;
while ( numReadsToRemove > 0 ) {
if ( groupReadCounts[currentGroup] > 1 ) {
groupReadCounts[currentGroup]--;
numReadsToRemove--;
}
currentGroup = (currentGroup + 1) % groupReadCounts.length;
}
currentGroup = 0;
for ( PositionalReadGrouping group : pendingReads ) {
if ( ! group.isFinalized() ) {
group.downsampleActiveReads(groupReadCounts[currentGroup]);
}
currentGroup++;
}
}
private class PositionalReadGrouping {
private List<T> activeReads;
private List<T> finalizedReads;
private int contig;
private int alignmentStart;
public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
activeReads = new LinkedList<T>(reads);
finalizedReads = new ArrayList<T>();
this.contig = contig;
this.alignmentStart = alignmentStart;
}
public int numActiveReads() {
return activeReads.size();
}
public boolean isFinalized() {
return activeReads.size() == 0;
}
public List<T> getFinalizedReads() {
return finalizedReads;
}
public void finalizeActiveReadsBeforePosition( int contig, int position ) {
if ( this.contig != contig ) {
finalizeAllActiveReads();
return;
}
Iterator<T> iter = activeReads.iterator();
while ( iter.hasNext() ) {
T read = iter.next();
if ( read.getAlignmentEnd() < position ) {
iter.remove();
finalizedReads.add(read);
}
}
}
public void finalizeAllActiveReads() {
finalizedReads.addAll(activeReads);
activeReads.clear();
}
public void downsampleActiveReads( int numReadsToKeep ) {
if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
numReadsToKeep, activeReads.size()));
}
BitSet itemsToKeep = new BitSet(activeReads.size());
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
itemsToKeep.set(selectedIndex);
}
int currentIndex = 0;
Iterator<T> iter = activeReads.iterator();
while ( iter.hasNext() ) {
T read = iter.next();
if ( ! itemsToKeep.get(currentIndex) ) {
iter.remove();
}
currentIndex++;
}
}
}
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
/**
* An extension of the basic downsampler API with reads-specific operations
*
* @author David Roazen
*/
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
/*
* Does this downsampler require that reads be fed to it in coordinate order?
*/
public boolean requiresCoordinateSortOrder();
}

View File

@ -0,0 +1,106 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
* every read in the stream having an equal chance of being selected for inclusion.
*
* An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
*
* @author David Roazen
*/
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
private ArrayList<T> reservoir;
private int targetSampleSize;
private int totalReadsSeen;
public ReservoirDownsampler ( int targetSampleSize ) {
if ( targetSampleSize <= 0 ) {
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
}
this.targetSampleSize = targetSampleSize;
clear();
}
public void submit ( T newRead ) {
totalReadsSeen++;
if ( totalReadsSeen <= targetSampleSize ) {
reservoir.add(newRead);
}
else {
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
if ( randomSlot < targetSampleSize ) {
reservoir.set(randomSlot, newRead);
}
}
}
public void submit ( Collection<T> newReads ) {
for ( T read : newReads ) {
submit(read);
}
}
public boolean hasDownsampledItems() {
return reservoir.size() > 0;
}
public List<T> consumeDownsampledItems() {
List<T> downsampledItems = reservoir;
clear();
return downsampledItems;
}
public boolean hasPendingItems() {
return false;
}
public void signalEndOfInput() {
// NO-OP
}
public void clear() {
reservoir = new ArrayList<T>(targetSampleSize);
totalReadsSeen = 0;
}
public boolean requiresCoordinateSortOrder() {
return false;
}
}

View File

@ -0,0 +1,73 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Collection;
public class DownsamplingReadsIteratorUnitTest {
@Test
public void testDownsamplingIteratorWithPositionalDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100));
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
}
Assert.assertEquals(count, 1000);
}
@Test
public void testDownsamplingIteratorNoEffectiveDownsampling() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
Collection<SAMRecord> reads = new ArrayList<SAMRecord>();
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler<SAMRecord>(1000));
Assert.assertTrue(iter.hasNext());
SAMRecord previous = iter.next();
int count = 1;
while ( iter.hasNext() ) {
SAMRecord current = iter.next();
Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex()));
count++;
previous = current;
}
Assert.assertEquals(count, 600);
}
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
for ( int i = 1; i <= stackSize; i++ ) {
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
}
return stack;
}
}

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.ArrayList;
import java.util.List;
public class FractionalDownsamplerUnitTest {
@Test
public void test100PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(1.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.size() == 1000);
}
@Test
public void test0PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.0);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.isEmpty());
}
@Test
public void test50PercentInclusion() {
FractionalDownsampler<SAMRecord> downsampler = new FractionalDownsampler<SAMRecord>(0.5);
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500));
downsampler.signalEndOfInput();
List<SAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000);
}
private List<SAMRecord> createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) {
List<SAMRecord> reads = new ArrayList<SAMRecord>(numReads);
for ( int i = 1; i <= numReads; i++ ) {
reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1,
GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1));
}
return reads;
}
}

View File

@ -0,0 +1,357 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.annotations.Test;
import org.testng.Assert;
import java.util.*;
// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested
public class PositionalDownsamplerUnitTest extends BaseTest {
/**
* -------
* -------
* -------
* -------
* -------
* -------
*/
@Test
public void testThreeOverlappingIdenticalStacks() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
}
/**
* -------
* -------
* -------
* -------
* -------
* -------
*/
@Test
public void testThreeNonOverlappingIdenticalStacks() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100));
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) == 1000);
Assert.assertTrue(downsampledStackSizes.get(1) == 1000);
Assert.assertTrue(downsampledStackSizes.get(2) == 1000);
}
/**
* ---
* ---
* -------
* -------
* -------
* -------
*/
@Test
public void testThreeStacksWithShortStackAtBeginning() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
}
/**
* -------
* -------
* ---
* ---
* -------
* -------
*/
@Test
public void testThreeStacksWithShortStackInMiddle() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000);
}
/**
* ------
* ------
* -------
* -------
* ---
* ---
*/
@Test
public void testThreeStacksWithShortStackAtEnd() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000);
}
/**
* -------
* ----
* -------
* ----
* -------
* -------
*/
@Test
public void testThreePartiallyOverlappingStacks() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(1) <= 1000);
Assert.assertTrue(downsampledStackSizes.get(2) <= 1000);
// TODO: need to examine per-base coverage here
}
@Test
public void testNoDownsamplingRequired() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<SAMRecord> downsampler = new PositionalDownsampler<SAMRecord>(1000);
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100));
Assert.assertFalse(downsampler.hasDownsampledItems());
Assert.assertTrue(downsampler.hasPendingItems());
downsampler.signalEndOfInput();
Assert.assertTrue(downsampler.hasDownsampledItems());
Assert.assertFalse(downsampler.hasPendingItems());
List<Integer> downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems());
System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes);
Assert.assertEquals(downsampledStackSizes.size(), 3);
Assert.assertTrue(downsampledStackSizes.get(0) == 300);
Assert.assertTrue(downsampledStackSizes.get(1) == 300);
Assert.assertTrue(downsampledStackSizes.get(2) == 300);
}
@Test
public void testGATKSAMRecordSupport() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
PositionalDownsampler<GATKSAMRecord> downsampler = new PositionalDownsampler<GATKSAMRecord>(1000);
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
for ( int i = 0; i < 10; i++ ) {
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10));
}
downsampler.submit(reads);
downsampler.signalEndOfInput();
List<GATKSAMRecord> downsampledReads = downsampler.consumeDownsampledItems();
Assert.assertTrue(downsampledReads.size() == 10);
}
private ArrayList<SAMRecord> createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
ArrayList<SAMRecord> stack = new ArrayList<SAMRecord>(stackSize);
for ( int i = 1; i <= stackSize; i++ ) {
stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length));
}
return stack;
}
private ArrayList<SAMRecord> createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) {
ArrayList<SAMRecord> stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength);
stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength));
return stack;
}
private List<Integer> getDownsampledStackSizesAndVerifySortedness( List<SAMRecord> downsampledReads ) {
List<Integer> stackSizes = new ArrayList<Integer>();
Iterator<SAMRecord> iter = downsampledReads.iterator();
Assert.assertTrue(iter.hasNext());
SAMRecord previousRead = iter.next();
int currentStackSize = 1;
while ( iter.hasNext() ) {
SAMRecord currentRead = iter.next();
if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) {
stackSizes.add(currentStackSize);
currentStackSize = 1;
}
else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) {
Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead));
}
else {
currentStackSize++;
}
previousRead = currentRead;
}
stackSizes.add(currentStackSize);
return stackSizes;
}
}