Merge pull request #236 from broadinstitute/md_simple_hc_performance_improvements
3 simple performance improvements for HaplotypeCaller
This commit is contained in:
commit
925232b0fc
|
|
@ -678,7 +678,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if (dontGenotype) return NO_CALLS; // user requested we not proceed
|
||||
|
||||
// filter out reads from genotyping which fail mapping quality based criteria
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do!
|
||||
|
|
@ -918,17 +918,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<>();
|
||||
// logger.info("Filtering non-passing regions: n incoming " + activeRegion.getReads().size());
|
||||
private Set<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
// logger.info("\tremoving read " + rec + " len " + rec.getReadLength());
|
||||
}
|
||||
}
|
||||
activeRegion.removeAll( readsToRemove );
|
||||
// logger.info("Filtered non-passing regions: n remaining " + activeRegion.getReads().size());
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
|
|
@ -938,7 +935,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight);
|
||||
}
|
||||
|
||||
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
|
||||
private Map<String, List<GATKSAMRecord>> splitReadsBySample( final Collection<GATKSAMRecord> reads ) {
|
||||
final Map<String, List<GATKSAMRecord>> returnMap = new HashMap<String, List<GATKSAMRecord>>();
|
||||
for( final String sample : samplesList) {
|
||||
List<GATKSAMRecord> readList = returnMap.get( sample );
|
||||
|
|
|
|||
|
|
@ -336,13 +336,17 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
|
||||
/**
|
||||
* Remove all of the reads in readsToRemove from this active region
|
||||
* @param readsToRemove the collection of reads we want to remove
|
||||
* @param readsToRemove the set of reads we want to remove
|
||||
*/
|
||||
public void removeAll( final Collection<GATKSAMRecord> readsToRemove ) {
|
||||
reads.removeAll(readsToRemove);
|
||||
public void removeAll( final Set<GATKSAMRecord> readsToRemove ) {
|
||||
final Iterator<GATKSAMRecord> it = reads.iterator();
|
||||
spanIncludingReads = extendedLoc;
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) );
|
||||
while ( it.hasNext() ) {
|
||||
final GATKSAMRecord read = it.next();
|
||||
if ( readsToRemove.contains(read) )
|
||||
it.remove();
|
||||
else
|
||||
spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,13 +42,13 @@ import java.util.*;
|
|||
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
*/
|
||||
public class PerReadAlleleLikelihoodMap {
|
||||
protected final List<Allele> alleles;
|
||||
protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
/** A set of all of the allele, so we can efficiently determine if an allele is already present */
|
||||
private final Set<Allele> allelesSet = new HashSet<>();
|
||||
/** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */
|
||||
protected final List<Allele> alleles = new ArrayList<>();
|
||||
protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap = new LinkedHashMap<>();
|
||||
|
||||
public PerReadAlleleLikelihoodMap() {
|
||||
likelihoodReadMap = new LinkedHashMap<GATKSAMRecord,Map<Allele,Double>>();
|
||||
alleles = new ArrayList<Allele>();
|
||||
}
|
||||
public PerReadAlleleLikelihoodMap() { }
|
||||
|
||||
/**
|
||||
* Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps.
|
||||
|
|
@ -61,18 +61,20 @@ public class PerReadAlleleLikelihoodMap {
|
|||
if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map");
|
||||
if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null");
|
||||
if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))");
|
||||
|
||||
Map<Allele,Double> likelihoodMap = likelihoodReadMap.get(read);
|
||||
if (likelihoodMap == null){
|
||||
// LinkedHashMap will ensure iterating through alleles will be in consistent order
|
||||
likelihoodMap = new LinkedHashMap<Allele, Double>();
|
||||
likelihoodMap = new LinkedHashMap<>();
|
||||
likelihoodReadMap.put(read,likelihoodMap);
|
||||
}
|
||||
likelihoodReadMap.put(read,likelihoodMap);
|
||||
|
||||
likelihoodMap.put(a,likelihood);
|
||||
|
||||
if (!alleles.contains(a))
|
||||
if (!allelesSet.contains(a)) {
|
||||
allelesSet.add(a);
|
||||
alleles.add(a);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
|
||||
|
|
@ -165,6 +167,7 @@ public class PerReadAlleleLikelihoodMap {
|
|||
}
|
||||
|
||||
public void clear() {
|
||||
allelesSet.clear();
|
||||
alleles.clear();
|
||||
likelihoodReadMap.clear();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ public class ActiveRegionUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
@Test(enabled = !DEBUG, dataProvider = "ActiveRegionReads")
|
||||
public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) {
|
||||
public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) throws Exception {
|
||||
final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read));
|
||||
|
||||
final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0);
|
||||
|
|
@ -176,19 +176,31 @@ public class ActiveRegionUnitTest extends BaseTest {
|
|||
Assert.assertEquals(region.getReadSpanLoc(), expectedSpan);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.removeAll(Collections.<GATKSAMRecord>emptyList());
|
||||
region.removeAll(Collections.<GATKSAMRecord>emptySet());
|
||||
Assert.assertEquals(region.getReads(), Collections.singletonList(read));
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), expectedSpan);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
region.removeAll(Collections.singletonList(read));
|
||||
region.removeAll(Collections.singleton(read));
|
||||
Assert.assertEquals(region.getReads(), Collections.emptyList());
|
||||
Assert.assertEquals(region.size(), 0);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
Assert.assertEquals(region.getReadSpanLoc(), loc);
|
||||
Assert.assertTrue(region.equalExceptReads(region2));
|
||||
|
||||
final GATKSAMRecord read2 = (GATKSAMRecord)read.clone();
|
||||
read2.setReadName(read.getReadName() + ".clone");
|
||||
|
||||
for ( final GATKSAMRecord readToKeep : Arrays.asList(read, read2)) {
|
||||
region.addAll(Arrays.asList(read, read2));
|
||||
final GATKSAMRecord readToDiscard = readToKeep == read ? read2 : read;
|
||||
region.removeAll(Collections.singleton(readToDiscard));
|
||||
Assert.assertEquals(region.getReads(), Arrays.asList(readToKeep));
|
||||
Assert.assertEquals(region.size(), 1);
|
||||
Assert.assertEquals(region.getExtendedLoc(), loc);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Reference in New Issue