Initial version of "fast" FragmentPileup code

-- Uses mayOverlapRoutine in ReadUtils
-- Attempts to be smart when doing overlap calculation, to avoid unnecessary allocations
-- PileupElement now comparable (sorts on offset than on start)
-- Caliper microbenchmark to assess performance
This commit is contained in:
Mark DePristo 2011-10-22 21:36:37 -04:00
parent b863390cb1
commit 42bf9adede
6 changed files with 323 additions and 45 deletions

View File

@ -1,9 +1,10 @@
package org.broadinstitute.sting.utils.pileup;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*;
/**
* An easy to access fragment-based pileup, which contains two separate pileups. The first
@ -16,28 +17,46 @@ import java.util.Map;
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
* TODO -- algorithm that could collect all duplicates into a single super pileup element
*
* Oct 21: note that the order of the oneReadPileup and twoReadPileups are not
* defined. The algorithms that produce these lists are in fact producing
* lists of Pileup elements *NOT* sorted by alignment start position of the underlying
* reads.
*
* User: depristo
* Date: 3/26/11
* Time: 10:09 PM
*/
public class FragmentPileup {
final Collection<PileupElement> oneReadPile;
final Collection<TwoReadPileupElement> twoReadPile = new ArrayList<TwoReadPileupElement>();
Collection<PileupElement> oneReadPile = null;
Collection<TwoReadPileupElement> twoReadPile = null;
/**
* Create a new Fragment-based pileup from the standard read-based pileup
* @param pileup
*/
public FragmentPileup(ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = new HashMap<String, PileupElement>();
//oldSlowCalculation(pileup);
fastNewCalculation(pileup);
}
protected FragmentPileup(ReadBackedPileup pileup, boolean useOldAlgorithm) {
if ( useOldAlgorithm )
oldSlowCalculation(pileup);
else
fastNewCalculation(pileup);
}
private final void oldSlowCalculation(final ReadBackedPileup pileup) {
final Map<String, PileupElement> nameMap = new HashMap<String, PileupElement>();
// build an initial map, grabbing all of the multi-read fragments
for ( PileupElement p : pileup ) {
String readName = p.getRead().getReadName();
for ( final PileupElement p : pileup ) {
final String readName = p.getRead().getReadName();
PileupElement pe1 = nameMap.get(readName);
final PileupElement pe1 = nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
@ -45,17 +64,96 @@ public class FragmentPileup {
}
}
// now set the one Read pile to the values in the nameMap with only a single read
oneReadPile = nameMap.values();
}
/**
* @param pileup
*/
private final void fastNewCalculation(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null; // lazy initialization
for ( final PileupElement p : pileup ) {
final SAMRecord read = p.getRead();
switch (ReadUtils.readMightOverlapMate(read) ) {
// we know for certain this read doesn't have an overlapping mate
case NO: {
addToOnePile(p);
break;
}
// we know that we overlap our mate, so put the read in the nameMap in
// case our mate shows up
case LEFT_YES: {
nameMap = addToNameMap(nameMap, p);
break;
}
// read starts at the same position, so we are looking at either the first or
// the second read. In the first, add it to the map, in the second grab it
// from the map and create a fragment
case SAME_START: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
nameMap = addToNameMap(nameMap, p);
}
break;
}
// in this case we need to see if our mate is already present, and if so
// grab the read from the list
case RIGHT_MAYBE: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
addToOnePile(p);
}
break;
}
}
}
if ( nameMap != null && ! nameMap.isEmpty() ) // could be slightly more optimally
for ( final PileupElement p : nameMap.values() )
addToOnePile(p);
}
private final Map<String, PileupElement> addToNameMap(Map<String, PileupElement> map, final PileupElement p) {
if ( map == null ) map = new HashMap<String, PileupElement>();
map.put(p.getRead().getReadName(), p);
return map;
}
private final PileupElement getFromNameMap(Map<String, PileupElement> map, final PileupElement p) {
return map == null ? null : map.get(p.getRead().getReadName());
}
private final void addToOnePile(final PileupElement p) {
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>();
oneReadPile.add(p);
}
private final void addToTwoPile(final PileupElement p1, final PileupElement p2) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(p1, p2));
}
/**
* Gets the pileup elements containing two reads, in no particular order
*
* @return
*/
public Collection<TwoReadPileupElement> getTwoReadPileup() {
return twoReadPile;
return twoReadPile == null ? Collections.<TwoReadPileupElement>emptyList() : twoReadPile;
}
/**
@ -64,7 +162,7 @@ public class FragmentPileup {
* @return
*/
public Collection<PileupElement> getOneReadPileup() {
return oneReadPile;
return oneReadPile == null ? Collections.<PileupElement>emptyList() : oneReadPile;
}
/**

View File

@ -13,7 +13,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
* Date: Apr 14, 2009
* Time: 8:54:05 AM
*/
public class PileupElement {
public class PileupElement implements Comparable<PileupElement> {
public static final byte DELETION_BASE = BaseUtils.D;
public static final byte DELETION_QUAL = (byte) 16;
public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87;
@ -76,6 +76,20 @@ public class PileupElement {
return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset];
}
@Override
public int compareTo(final PileupElement pileupElement) {
if ( offset < pileupElement.offset )
return -1;
else if ( offset > pileupElement.offset )
return 1;
else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() )
return -1;
else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() )
return 1;
else
return 0;
}
// --------------------------------------------------------------------------
//
// Reduced read accessors

View File

@ -214,6 +214,54 @@ public class ReadUtils {
return state;
}
/**
* s1 e1
* |-----------------------> [record in hand]
* s2
* <-----------------------|
*
* s1, e1, and s2 are all in the record. Assuming that s1 < s2 (we are the left most read),
* we can compute whether we overlap with our mate by seeing if s2 <= e1 or no. If e1 <
* s2 then we known that we cannot over.
*
* If we are looking at the right read
*
* s1
* |----------------------->
* s2 e2
* <-----------------------| [record in hand]
*
* we know the position of s1 and s2, but we don't know e1, so we cannot tell if we
* overlap with our mate or not, so in this case we return MAYBE.
*
* Note that if rec has an unmapped mate or is unpaired we certainly know the answer
*
* @param rec
* @return
*/
public static ReadOverlapsMateType readMightOverlapMate(final SAMRecord rec) {
if ( ! rec.getReadPairedFlag() || rec.getMateUnmappedFlag() ) {
return ReadOverlapsMateType.NO;
} else { // read is actually paired
final int recStart = rec.getAlignmentStart();
final int recEnd = rec.getAlignmentEnd();
final int mateStart = rec.getMateAlignmentStart();
if ( recStart < mateStart ) {
// we are the left most read
return mateStart <= recEnd ? ReadOverlapsMateType.LEFT_YES: ReadOverlapsMateType.NO;
} else if ( recStart == mateStart ) {
// we are the left most read
return ReadOverlapsMateType.SAME_START;
} else {
// we are the right most read, so we cannot tell
return ReadOverlapsMateType.RIGHT_MAYBE;
}
}
}
public enum ReadOverlapsMateType { LEFT_YES, NO, SAME_START, RIGHT_MAYBE }
private static Pair<Integer, Integer> getAdaptorBoundaries(SAMRecord rec, int adaptorLength) {
int isize = rec.getInferredInsertSize();
if ( isize == 0 )

View File

@ -224,7 +224,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41"));
Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -252,7 +252,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae"));
Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -262,7 +262,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("408d3aba4d094c067fc00a43992c2292"));
Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1);
}
@ -272,7 +272,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("5e4e09354410b76fc0d822050d84132a"));
Arrays.asList("a20799237accd52c1b8c2ac096309c8f"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2);
}
@ -282,7 +282,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1,
Arrays.asList("c599eedbeb422713b8a28529e805e4ae"));
Arrays.asList("18ef8181157b4ac3eb8492f538467f92"));
executeTest("test MultiSample Pilot2 indels with complicated records", spec3);
}
@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation +
"phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1,
Arrays.asList("37d908a682ac269f8f19dec939ff5b01"));
Arrays.asList("ad884e511a751b05e64db5314314365a"));
executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);
}

View File

@ -0,0 +1,108 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pileup;
import com.google.caliper.Param;
import com.google.caliper.SimpleBenchmark;
import com.google.caliper.runner.CaliperMain;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import java.util.*;
/**
* Caliper microbenchmark of fragment pileup
*/
public class FragmentPileupBenchmark extends SimpleBenchmark {
final int N_PILEUPS_TO_GENERATE = 100;
List<ReadBackedPileup> pileups = new ArrayList<ReadBackedPileup>(N_PILEUPS_TO_GENERATE);
@Param({"10", "100", "1000"}) // , "10000"})
int pileupSize; // set automatically by framework
@Param({"150", "400"})
int insertSize; // set automatically by framework
@Override protected void setUp() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser genomeLocParser;
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
final int pos = 50;
GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", pos);
final Random ran = new Random();
final int readLen = 100;
final boolean leftIsFirst = true;
final boolean leftIsNegative = false;
final int insertSizeVariation = insertSize / 10;
for ( int pileupN = 0; pileupN < N_PILEUPS_TO_GENERATE; pileupN++ ) {
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
for ( int i = 0; i < pileupSize / 2; i++ ) {
final String readName = "read" + i;
final int leftStart = new Random().nextInt(49) + 1;
final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize);
final int rightStart = leftStart + fragmentSize - readLen;
if ( rightStart <= 0 ) continue;
List<SAMRecord> pair = FragmentPileupUnitTest.createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart));
if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) {
pileupElements.add(new PileupElement(right, pos - rightStart));
}
}
Collections.sort(pileupElements);
pileups.add(new ReadBackedPileupImpl(loc, pileupElements));
}
}
public void timeNaiveNameMatch(int rep) {
int nFrags = 0;
for ( int i = 0; i < rep; i++ ) {
for ( ReadBackedPileup rbp : pileups )
nFrags += new FragmentPileup(rbp, true).getTwoReadPileup().size();
}
}
public void timeFastNameMatch(int rep) {
int nFrags = 0;
for ( int i = 0; i < rep; i++ )
for ( ReadBackedPileup rbp : pileups )
nFrags += new FragmentPileup(rbp, false).getTwoReadPileup().size();
}
public static void main(String[] args) {
CaliperMain.main(FragmentPileupBenchmark.class, args);
}
}

View File

@ -43,37 +43,48 @@ import java.util.*;
public class FragmentPileupUnitTest extends BaseTest {
private static SAMFileHeader header;
public final static List<SAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
left.setReadPairedFlag(true);
right.setReadPairedFlag(true);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
return Arrays.asList(left, right);
}
private class FragmentPileupTest extends TestDataProvider {
List<TestState> states = new ArrayList<TestState>();
private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative));
List<SAMRecord> pair = createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1);
for ( int pos = leftStart; pos < rightStart + readLen; pos++) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, "readpair", 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, "readpair", 0, rightStart, readLen);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
boolean posCoveredByLeft = pos >= left.getAlignmentStart() && pos <= left.getAlignmentEnd();
boolean posCoveredByRight = pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd();
@ -139,7 +150,6 @@ public class FragmentPileupUnitTest extends BaseTest {
}
}
@BeforeTest
public void setup() {
header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);