From d9cdc5d006e470fb5429c55efa1e74b0be68a4bc Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 21 May 2013 15:32:24 -0400 Subject: [PATCH 1/3] Optimization: track alleles in the PerReadAlleleLikelihoodMap with a HashSet -- The previous version of PerReadAlleleLikelihoodMap only stored the alleles in an ArrayList, and used ArrayList.contains() to determine if an allele was already present in the map. This is very slow with many alleles. Now keeps both the ArrayList (for get() performance) and a Set of alleles for contains(). --- .../genotyper/PerReadAlleleLikelihoodMap.java | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 150e24c51..c8bb7ff79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -42,13 +42,13 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { - protected final List alleles; - protected final Map> likelihoodReadMap; + /** A set of all of the allele, so we can efficiently determine if an allele is already present */ + private final Set allelesSet = new HashSet<>(); + /** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */ + protected final List alleles = new ArrayList<>(); + protected final Map> likelihoodReadMap = new LinkedHashMap<>(); - public PerReadAlleleLikelihoodMap() { - likelihoodReadMap = new LinkedHashMap>(); - alleles = new ArrayList(); - } + public PerReadAlleleLikelihoodMap() { } /** * Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps. @@ -61,18 +61,20 @@ public class PerReadAlleleLikelihoodMap { if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map"); if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null"); if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))"); + Map likelihoodMap = likelihoodReadMap.get(read); if (likelihoodMap == null){ // LinkedHashMap will ensure iterating through alleles will be in consistent order - likelihoodMap = new LinkedHashMap(); + likelihoodMap = new LinkedHashMap<>(); } likelihoodReadMap.put(read,likelihoodMap); likelihoodMap.put(a,likelihood); - if (!alleles.contains(a)) + if (!allelesSet.contains(a)) { + allelesSet.add(a); alleles.add(a); - + } } public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { @@ -165,6 +167,7 @@ public class PerReadAlleleLikelihoodMap { } public void clear() { + allelesSet.clear(); alleles.clear(); likelihoodReadMap.clear(); } From a1093ad230f1c24e4b9c2bf0e622d43ff2138e05 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 21 May 2013 15:35:43 -0400 Subject: [PATCH 2/3] Optimization for ActiveRegion.removeAll -- Previous version took a Collection to remove, and called ArrayList.removeAll() on this collection to remove reads from the ActiveRegion. This can be very slow when there are lots of reads, as ArrayList.removeAll ultimately calls indexOf() that searches through the list calling equals() on each element. New version takes a set, and uses an iterator on the list to remove() from the iterator any read that is in the set. Given that we were already iterating over the list of reads to update the read span, this algorithm is actually simpler and faster than the previous one. -- Update HaplotypeCaller filterReadsInRegion to use a Set not a List. -- Expanded the unit tests a bit for ActiveRegion.removeAll --- .../haplotypecaller/HaplotypeCaller.java | 11 ++++------- .../sting/utils/activeregion/ActiveRegion.java | 14 +++++++++----- .../activeregion/ActiveRegionUnitTest.java | 18 +++++++++++++++--- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index fd8a1968b..24499def8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -678,7 +678,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if (dontGenotype) return NO_CALLS; // user requested we not proceed // filter out reads from genotyping which fail mapping quality based criteria - final List filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); + final Collection filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do! @@ -918,17 +918,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); } - private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final List readsToRemove = new ArrayList<>(); -// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size()); + private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); -// logger.info("\tremoving read " + rec + " len " + rec.getReadLength()); } } activeRegion.removeAll( readsToRemove ); -// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size()); return readsToRemove; } @@ -938,7 +935,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); } - private Map> splitReadsBySample( final List reads ) { + private Map> splitReadsBySample( final Collection reads ) { final Map> returnMap = new HashMap>(); for( final String sample : samplesList) { List readList = returnMap.get( sample ); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 2f4c1b55d..7f2fe6833 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -336,13 +336,17 @@ public class ActiveRegion implements HasGenomeLocation { /** * Remove all of the reads in readsToRemove from this active region - * @param readsToRemove the collection of reads we want to remove + * @param readsToRemove the set of reads we want to remove */ - public void removeAll( final Collection readsToRemove ) { - reads.removeAll(readsToRemove); + public void removeAll( final Set readsToRemove ) { + final Iterator it = reads.iterator(); spanIncludingReads = extendedLoc; - for ( final GATKSAMRecord read : reads ) { - spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); + while ( it.hasNext() ) { + final GATKSAMRecord read = it.next(); + if ( readsToRemove.contains(read) ) + it.remove(); + else + spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index ad5fd3642..0f9b8531a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -144,7 +144,7 @@ public class ActiveRegionUnitTest extends BaseTest { } @Test(enabled = !DEBUG, dataProvider = "ActiveRegionReads") - public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) { + public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) throws Exception { final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read)); final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); @@ -176,19 +176,31 @@ public class ActiveRegionUnitTest extends BaseTest { Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); Assert.assertTrue(region.equalExceptReads(region2)); - region.removeAll(Collections.emptyList()); + region.removeAll(Collections.emptySet()); Assert.assertEquals(region.getReads(), Collections.singletonList(read)); Assert.assertEquals(region.size(), 1); Assert.assertEquals(region.getExtendedLoc(), loc); Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); Assert.assertTrue(region.equalExceptReads(region2)); - region.removeAll(Collections.singletonList(read)); + region.removeAll(Collections.singleton(read)); Assert.assertEquals(region.getReads(), Collections.emptyList()); Assert.assertEquals(region.size(), 0); Assert.assertEquals(region.getExtendedLoc(), loc); Assert.assertEquals(region.getReadSpanLoc(), loc); Assert.assertTrue(region.equalExceptReads(region2)); + + final GATKSAMRecord read2 = (GATKSAMRecord)read.clone(); + read2.setReadName(read.getReadName() + ".clone"); + + for ( final GATKSAMRecord readToKeep : Arrays.asList(read, read2)) { + region.addAll(Arrays.asList(read, read2)); + final GATKSAMRecord readToDiscard = readToKeep == read ? read2 : read; + region.removeAll(Collections.singleton(readToDiscard)); + Assert.assertEquals(region.getReads(), Arrays.asList(readToKeep)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + } } // ----------------------------------------------------------------------------------------------- From 010034a65024a3c6c2b89f8a68c34efae881c59a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 21 May 2013 16:01:57 -0400 Subject: [PATCH 3/3] Optimization/bugfix for PerReadAlleleLikelihoodMap -- Add() call had a misplaced map.put call, so that we were always putting the result of get() back into the map, when what we really intended was to only put the value back in if the original get() resulted in a null and so initialized the result --- .../sting/utils/genotyper/PerReadAlleleLikelihoodMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index c8bb7ff79..f253fc9c9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -66,8 +66,8 @@ public class PerReadAlleleLikelihoodMap { if (likelihoodMap == null){ // LinkedHashMap will ensure iterating through alleles will be in consistent order likelihoodMap = new LinkedHashMap<>(); + likelihoodReadMap.put(read,likelihoodMap); } - likelihoodReadMap.put(read,likelihoodMap); likelihoodMap.put(a,likelihood);