From 42bf9adede112282e8e8e45bf987ab01305b0a7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Oct 2011 21:36:37 -0400 Subject: [PATCH] Initial version of "fast" FragmentPileup code -- Uses mayOverlapRoutine in ReadUtils -- Attempts to be smart when doing overlap calculation, to avoid unnecessary allocations -- PileupElement now comparable (sorts on offset than on start) -- Caliper microbenchmark to assess performance --- .../sting/utils/pileup/FragmentPileup.java | 124 ++++++++++++++++-- .../sting/utils/pileup/PileupElement.java | 16 ++- .../sting/utils/sam/ReadUtils.java | 48 +++++++ .../UnifiedGenotyperIntegrationTest.java | 12 +- .../utils/pileup/FragmentPileupBenchmark.java | 108 +++++++++++++++ .../utils/pileup/FragmentPileupUnitTest.java | 60 +++++---- 6 files changed, 323 insertions(+), 45 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java index f7d237401..f9a94989b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java @@ -1,9 +1,10 @@ package org.broadinstitute.sting.utils.pileup; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; /** * An easy to access fragment-based pileup, which contains two separate pileups. The first @@ -16,28 +17,46 @@ import java.util.Map; * TODO -- technically we could generalize this code to support a pseudo-duplicate marking * TODO -- algorithm that could collect all duplicates into a single super pileup element * + * Oct 21: note that the order of the oneReadPileup and twoReadPileups are not + * defined. The algorithms that produce these lists are in fact producing + * lists of Pileup elements *NOT* sorted by alignment start position of the underlying + * reads. + * * User: depristo * Date: 3/26/11 * Time: 10:09 PM */ public class FragmentPileup { - final Collection oneReadPile; - final Collection twoReadPile = new ArrayList(); + Collection oneReadPile = null; + Collection twoReadPile = null; /** * Create a new Fragment-based pileup from the standard read-based pileup * @param pileup */ public FragmentPileup(ReadBackedPileup pileup) { - Map nameMap = new HashMap(); + //oldSlowCalculation(pileup); + fastNewCalculation(pileup); + } + + protected FragmentPileup(ReadBackedPileup pileup, boolean useOldAlgorithm) { + if ( useOldAlgorithm ) + oldSlowCalculation(pileup); + else + fastNewCalculation(pileup); + } + + private final void oldSlowCalculation(final ReadBackedPileup pileup) { + final Map nameMap = new HashMap(); // build an initial map, grabbing all of the multi-read fragments - for ( PileupElement p : pileup ) { - String readName = p.getRead().getReadName(); + for ( final PileupElement p : pileup ) { + final String readName = p.getRead().getReadName(); - PileupElement pe1 = nameMap.get(readName); + final PileupElement pe1 = nameMap.get(readName); if ( pe1 != null ) { // assumes we have at most 2 reads per fragment + if ( twoReadPile == null ) twoReadPile = new ArrayList(); twoReadPile.add(new TwoReadPileupElement(pe1, p)); nameMap.remove(readName); } else { @@ -45,17 +64,96 @@ public class FragmentPileup { } } - // now set the one Read pile to the values in the nameMap with only a single read oneReadPile = nameMap.values(); } + /** + * @param pileup + */ + private final void fastNewCalculation(final ReadBackedPileup pileup) { + Map nameMap = null; // lazy initialization + + for ( final PileupElement p : pileup ) { + final SAMRecord read = p.getRead(); + + switch (ReadUtils.readMightOverlapMate(read) ) { + // we know for certain this read doesn't have an overlapping mate + case NO: { + addToOnePile(p); + break; + } + + // we know that we overlap our mate, so put the read in the nameMap in + // case our mate shows up + case LEFT_YES: { + nameMap = addToNameMap(nameMap, p); + break; + } + + // read starts at the same position, so we are looking at either the first or + // the second read. In the first, add it to the map, in the second grab it + // from the map and create a fragment + case SAME_START: { + final PileupElement pe1 = getFromNameMap(nameMap, p); + if ( pe1 != null ) { + addToTwoPile(pe1, p); + nameMap.remove(p.getRead().getReadName()); + } else { + nameMap = addToNameMap(nameMap, p); + } + break; + } + + + // in this case we need to see if our mate is already present, and if so + // grab the read from the list + case RIGHT_MAYBE: { + final PileupElement pe1 = getFromNameMap(nameMap, p); + if ( pe1 != null ) { + addToTwoPile(pe1, p); + nameMap.remove(p.getRead().getReadName()); + } else { + addToOnePile(p); + } + break; + } + } + } + + if ( nameMap != null && ! nameMap.isEmpty() ) // could be slightly more optimally + for ( final PileupElement p : nameMap.values() ) + addToOnePile(p); + } + + private final Map addToNameMap(Map map, final PileupElement p) { + if ( map == null ) map = new HashMap(); + map.put(p.getRead().getReadName(), p); + return map; + } + + private final PileupElement getFromNameMap(Map map, final PileupElement p) { + return map == null ? null : map.get(p.getRead().getReadName()); + } + + + private final void addToOnePile(final PileupElement p) { + if ( oneReadPile == null ) oneReadPile = new ArrayList(); + oneReadPile.add(p); + } + + private final void addToTwoPile(final PileupElement p1, final PileupElement p2) { + // assumes we have at most 2 reads per fragment + if ( twoReadPile == null ) twoReadPile = new ArrayList(); + twoReadPile.add(new TwoReadPileupElement(p1, p2)); + } + /** * Gets the pileup elements containing two reads, in no particular order * * @return */ public Collection getTwoReadPileup() { - return twoReadPile; + return twoReadPile == null ? Collections.emptyList() : twoReadPile; } /** @@ -64,7 +162,7 @@ public class FragmentPileup { * @return */ public Collection getOneReadPileup() { - return oneReadPile; + return oneReadPile == null ? Collections.emptyList() : oneReadPile; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index e47b6ada7..3d6b6f4b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -13,7 +13,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * Date: Apr 14, 2009 * Time: 8:54:05 AM */ -public class PileupElement { +public class PileupElement implements Comparable { public static final byte DELETION_BASE = BaseUtils.D; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; @@ -76,6 +76,20 @@ public class PileupElement { return isDeletion() ? DELETION_QUAL : read.getBaseQualities()[offset]; } + @Override + public int compareTo(final PileupElement pileupElement) { + if ( offset < pileupElement.offset ) + return -1; + else if ( offset > pileupElement.offset ) + return 1; + else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() ) + return -1; + else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) + return 1; + else + return 0; + } + // -------------------------------------------------------------------------- // // Reduced read accessors diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f8e4927ed..f093e6539 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -214,6 +214,54 @@ public class ReadUtils { return state; } + /** + * s1 e1 + * |-----------------------> [record in hand] + * s2 + * <-----------------------| + * + * s1, e1, and s2 are all in the record. Assuming that s1 < s2 (we are the left most read), + * we can compute whether we overlap with our mate by seeing if s2 <= e1 or no. If e1 < + * s2 then we known that we cannot over. + * + * If we are looking at the right read + * + * s1 + * |-----------------------> + * s2 e2 + * <-----------------------| [record in hand] + * + * we know the position of s1 and s2, but we don't know e1, so we cannot tell if we + * overlap with our mate or not, so in this case we return MAYBE. + * + * Note that if rec has an unmapped mate or is unpaired we certainly know the answer + * + * @param rec + * @return + */ + public static ReadOverlapsMateType readMightOverlapMate(final SAMRecord rec) { + if ( ! rec.getReadPairedFlag() || rec.getMateUnmappedFlag() ) { + return ReadOverlapsMateType.NO; + } else { // read is actually paired + final int recStart = rec.getAlignmentStart(); + final int recEnd = rec.getAlignmentEnd(); + final int mateStart = rec.getMateAlignmentStart(); + + if ( recStart < mateStart ) { + // we are the left most read + return mateStart <= recEnd ? ReadOverlapsMateType.LEFT_YES: ReadOverlapsMateType.NO; + } else if ( recStart == mateStart ) { + // we are the left most read + return ReadOverlapsMateType.SAME_START; + } else { + // we are the right most read, so we cannot tell + return ReadOverlapsMateType.RIGHT_MAYBE; + } + } + } + + public enum ReadOverlapsMateType { LEFT_YES, NO, SAME_START, RIGHT_MAYBE } + private static Pair getAdaptorBoundaries(SAMRecord rec, int adaptorLength) { int isize = rec.getInferredInsertSize(); if ( isize == 0 ) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 6b6346447..7d6cfc7ad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -224,7 +224,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41")); + Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -252,7 +252,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae")); + Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -262,7 +262,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("408d3aba4d094c067fc00a43992c2292")); + Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -272,7 +272,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("5e4e09354410b76fc0d822050d84132a")); + Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -282,7 +282,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("c599eedbeb422713b8a28529e805e4ae")); + Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -291,7 +291,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("37d908a682ac269f8f19dec939ff5b01")); + Arrays.asList("ad884e511a751b05e64db5314314365a")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java new file mode 100644 index 000000000..fe1ca305f --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import com.google.caliper.runner.CaliperMain; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + +import java.util.*; + +/** + * Caliper microbenchmark of fragment pileup + */ +public class FragmentPileupBenchmark extends SimpleBenchmark { + final int N_PILEUPS_TO_GENERATE = 100; + List pileups = new ArrayList(N_PILEUPS_TO_GENERATE); + + @Param({"10", "100", "1000"}) // , "10000"}) + int pileupSize; // set automatically by framework + + @Param({"150", "400"}) + int insertSize; // set automatically by framework + + @Override protected void setUp() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser genomeLocParser; + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + final int pos = 50; + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", pos); + + final Random ran = new Random(); + final int readLen = 100; + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + + for ( int pileupN = 0; pileupN < N_PILEUPS_TO_GENERATE; pileupN++ ) { + List pileupElements = new ArrayList(); + for ( int i = 0; i < pileupSize / 2; i++ ) { + final String readName = "read" + i; + final int leftStart = new Random().nextInt(49) + 1; + final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if ( rightStart <= 0 ) continue; + + List pair = FragmentPileupUnitTest.createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + SAMRecord left = pair.get(0); + SAMRecord right = pair.get(1); + + pileupElements.add(new PileupElement(left, pos - leftStart)); + + if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { + pileupElements.add(new PileupElement(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + pileups.add(new ReadBackedPileupImpl(loc, pileupElements)); + } + } + + public void timeNaiveNameMatch(int rep) { + int nFrags = 0; + for ( int i = 0; i < rep; i++ ) { + for ( ReadBackedPileup rbp : pileups ) + nFrags += new FragmentPileup(rbp, true).getTwoReadPileup().size(); + } + } + + public void timeFastNameMatch(int rep) { + int nFrags = 0; + for ( int i = 0; i < rep; i++ ) + for ( ReadBackedPileup rbp : pileups ) + nFrags += new FragmentPileup(rbp, false).getTwoReadPileup().size(); + } + + public static void main(String[] args) { + CaliperMain.main(FragmentPileupBenchmark.class, args); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java index 778259528..be4ecc0c6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java @@ -43,37 +43,48 @@ import java.util.*; public class FragmentPileupUnitTest extends BaseTest { private static SAMFileHeader header; + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); + SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(! leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + private class FragmentPileupTest extends TestDataProvider { List states = new ArrayList(); private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative)); + List pair = createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + SAMRecord left = pair.get(0); + SAMRecord right = pair.get(1); + for ( int pos = leftStart; pos < rightStart + readLen; pos++) { - SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, "readpair", 0, leftStart, readLen); - SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, "readpair", 0, rightStart, readLen); - - left.setProperPairFlag(true); - right.setProperPairFlag(true); - - left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(! leftIsFirst); - - left.setReadNegativeStrandFlag(leftIsNegative); - left.setMateNegativeStrandFlag(!leftIsNegative); - right.setReadNegativeStrandFlag(!leftIsNegative); - right.setMateNegativeStrandFlag(leftIsNegative); - - left.setMateAlignmentStart(right.getAlignmentStart()); - right.setMateAlignmentStart(left.getAlignmentStart()); - - left.setMateReferenceIndex(0); - right.setMateReferenceIndex(0); - - int isize = rightStart + readLen - leftStart; - left.setInferredInsertSize(isize); - right.setInferredInsertSize(-isize); - boolean posCoveredByLeft = pos >= left.getAlignmentStart() && pos <= left.getAlignmentEnd(); boolean posCoveredByRight = pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd(); @@ -139,7 +150,6 @@ public class FragmentPileupUnitTest extends BaseTest { } } - @BeforeTest public void setup() { header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);