ReadBackedPileup cleanup

-- Only ReadBackedPileupImpl (concrete class) and ReadBackedPileup (interface) live, moved all functionality of AbstractReadBackedPileup into the impl
-- ReadBackedPileupImpl was literally a shell class after we removed extended events.  A few bits of code cleanup and we reduced a bunch of class complexity in the gatk
-- ReadBackedPileups no longer accept pre-cached values (size, nMapQ reads, etc) but now lazy load these values as needed
-- Created optimized calculation routines to iterator over all of the reads in the pileup in whatever order is most efficient as well.
-- New LIBS no longer calculates size, n mapq, and n deletion reads while making pileups.
-- Added commons-collections for IteratorChain
This commit is contained in:
Mark DePristo 2013-01-10 16:26:51 -05:00
parent e3e3ae29b2
commit 9e23c592e6
6 changed files with 1143 additions and 1106 deletions

View File

@ -61,6 +61,7 @@
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
<dependency org="commons-io" name="commons-io" rev="2.1"/>
<dependency org="commons-collections" name="commons-collections" rev="3.2.1"/>
<dependency org="org.apache.commons" name="commons-math" rev="2.2"/>
<!-- Lucene core utilities -->

View File

@ -242,39 +242,30 @@ public class LocusIteratorByState extends LocusIterator {
final Iterator<AlignmentStateMachine> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
int size = 0; // number of elements in this sample's pileup
int nDeletions = 0; // number of deletions in this sample's pileup
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final AlignmentStateMachine state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCigarOperator(); // current cigar operator
// state object with the read/offset information
final AlignmentStateMachine state = iterator.next();
final GATKSAMRecord read = (GATKSAMRecord) state.getRead();
final CigarOperator op = state.getCigarOperator();
if (op == CigarOperator.N) // N's are never added to any pileup
if (op == CigarOperator.N) // N's are never added to any pileup
continue;
if (!dontIncludeReadInPileup(read, location.getStart())) {
if ( op == CigarOperator.D ) {
if ( ! includeReadsWithDeletionAtLoci )
continue;
nDeletions++;
if ( ! includeReadsWithDeletionAtLoci && op == CigarOperator.D ) {
continue;
}
pile.add(state.makePileupElement());
size++;
if ( read.getMappingQuality() == 0 )
nMQ0Reads++;
}
}
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
if (! pile.isEmpty() ) // if this pileup added at least one base, add it to the full pileup
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
}
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
updateReadStates(); // critical - must be called after we get the current state offsets and location
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
}
}

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.utils.pileup;
import org.apache.commons.collections.iterators.IteratorChain;
import java.util.*;
/**
@ -35,6 +37,20 @@ import java.util.*;
*/
abstract class PileupElementTracker<PE extends PileupElement> implements Iterable<PE> {
public abstract int size();
/**
* Iterate through the PEs here, but in any order, which may improve performance
* if you don't care about the underlying order the reads are coming to you in.
* @return an iteratable over all pileup elements in this tracker
*/
public abstract Iterable<PE> unorderedIterable();
/**
* Same as @see #unorderedIterable but the actual iterator itself
* @return
*/
public Iterator<PE> unorderedIterator() { return unorderedIterable().iterator(); }
public abstract PileupElementTracker<PE> copy();
}
@ -65,6 +81,7 @@ class UnifiedPileupElementTracker<PE extends PileupElement> extends PileupElemen
}
public Iterator<PE> iterator() { return pileup.iterator(); }
public Iterable<PE> unorderedIterable() { return this; }
}
class PerSamplePileupElementTracker<PE extends PileupElement> extends PileupElementTracker<PE> {
@ -113,4 +130,25 @@ class PerSamplePileupElementTracker<PE extends PileupElement> extends PileupElem
public int size() {
return size;
}
public Iterable<PE> unorderedIterable() {
return new Iterable<PE>() {
@Override
public Iterator<PE> iterator() {
return new Iterator<PE>() {
final private IteratorChain chain = new IteratorChain();
{ // initialize the chain with the unordered iterators of the per sample pileups
for ( PileupElementTracker<PE> pet : pileup.values() ) {
chain.addIterator(pet.unorderedIterator());
}
}
@Override public boolean hasNext() { return chain.hasNext(); }
@Override public PE next() { return (PE)chain.next(); }
@Override public void remove() { throw new UnsupportedOperationException("Cannot remove"); }
};
}
};
}
}

View File

@ -25,12 +25,18 @@
package org.broadinstitute.sting.utils.pileup;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
@ -39,6 +45,17 @@ import java.util.*;
* Test routines for read-backed pileup.
*/
public class ReadBackedPileupUnitTest {
protected static SAMFileHeader header;
protected GenomeLocParser genomeLocParser;
private GenomeLoc loc;
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
loc = genomeLocParser.createGenomeLoc("chr1", 1);
}
/**
* Ensure that basic read group splitting works.
*/
@ -195,4 +212,98 @@ public class ReadBackedPileupUnitTest {
missingSamplePileup = pileup.getPileupForSample("not here");
Assert.assertNull(missingSamplePileup,"Pileup for sample 'not here' should be null but isn't");
}
}
private static int sampleI = 0;
private class RBPCountTest {
final String sample;
final int nReads, nMapq0, nDeletions;
private RBPCountTest(int nReads, int nMapq0, int nDeletions) {
this.sample = "sample" + sampleI++;
this.nReads = nReads;
this.nMapq0 = nMapq0;
this.nDeletions = nDeletions;
}
private List<PileupElement> makeReads( final int n, final int mapq, final String op ) {
final int readLength = 3;
final List<PileupElement> elts = new LinkedList<PileupElement>();
for ( int i = 0; i < n; i++ ) {
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
read.setCigarString("1M1" + op + "1M");
read.setMappingQuality(mapq);
final int baseOffset = op.equals("M") ? 1 : 0;
final CigarElement cigarElement = read.getCigar().getCigarElement(1);
elts.add(new PileupElement(read, baseOffset, cigarElement, 1, 0));
}
return elts;
}
private ReadBackedPileupImpl makePileup() {
final List<PileupElement> elts = new LinkedList<PileupElement>();
elts.addAll(makeReads(nMapq0, 0, "M"));
elts.addAll(makeReads(nDeletions, 30, "D"));
elts.addAll(makeReads(nReads - nMapq0 - nDeletions, 30, "M"));
return new ReadBackedPileupImpl(loc, elts);
}
@Override
public String toString() {
return "RBPCountTest{" +
"sample='" + sample + '\'' +
", nReads=" + nReads +
", nMapq0=" + nMapq0 +
", nDeletions=" + nDeletions +
'}';
}
}
@DataProvider(name = "RBPCountingTest")
public Object[][] makeRBPCountingTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
for ( final int nMapq : Arrays.asList(0, 10, 20) ) {
for ( final int nDeletions : Arrays.asList(0, 10, 20) ) {
for ( final int nReg : Arrays.asList(0, 10, 20) ) {
final int total = nMapq + nDeletions + nReg;
if ( total > 0 )
tests.add(new Object[]{new RBPCountTest(total, nMapq, nDeletions)});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "RBPCountingTest")
public void testRBPCountingTestSinglePileup(RBPCountTest params) {
testRBPCounts(params.makePileup(), params);
}
@Test(dataProvider = "RBPCountingTest")
public void testRBPCountingTestMultiSample(RBPCountTest params) {
final RBPCountTest newSample = new RBPCountTest(2, 1, 1);
final Map<String, ReadBackedPileupImpl> pileupsBySample = new HashMap<String, ReadBackedPileupImpl>();
pileupsBySample.put(newSample.sample, newSample.makePileup());
pileupsBySample.put(params.sample, params.makePileup());
final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, pileupsBySample);
testRBPCounts(pileup, new RBPCountTest(params.nReads + 2, params.nMapq0 + 1, params.nDeletions + 1));
}
private void testRBPCounts(final ReadBackedPileup rbp, RBPCountTest expected) {
for ( int cycles = 0; cycles < 3; cycles++ ) {
// multiple cycles to make sure caching is working
Assert.assertEquals(rbp.getNumberOfElements(), expected.nReads);
Assert.assertEquals(rbp.depthOfCoverage(), expected.nReads);
Assert.assertEquals(rbp.getNumberOfDeletions(), expected.nDeletions);
Assert.assertEquals(rbp.getNumberOfMappingQualityZeroReads(), expected.nMapq0);
}
}
}