From 43fdd31e20d7cdc9319b77ab52c22d8de32b6c07 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 16 Jun 2011 12:54:59 +0000 Subject: [PATCH] Significant performance optimization for reduced reads due to better algorithm for including reads in the variable regions. Fixed a critical bug that actually produced multiple copies of the same read in the variable regions with this optimization as well. Scala exploration script updated as well. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6005 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/reducereads/ConsensusSite.java | 15 ++++++----- .../SingleSampleConsensusReadCompressor.java | 26 +++++++++++++------ .../depristo/ReducedBAMEvaluation.scala | 2 +- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java index 6891d3b4f..9a952e8ea 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/ConsensusSite.java @@ -8,10 +8,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; +import java.util.*; /* * Copyright (c) 2009 The Broad Institute @@ -48,7 +45,7 @@ import java.util.Set; * on the reference genome, or is a dummy site that is only used to calculate insertion statistics */ final class ConsensusSite { - final Set overlappingReads = new HashSet(); + final Collection overlappingReads = new LinkedList(); final int offset, position; final BaseCounts counts = new BaseCounts(); @@ -57,17 +54,21 @@ final class ConsensusSite { public ConsensusSite(int position, int offset) { this.position = position; this.offset = offset; - } public int getPosition() { return position; } - public Set getOverlappingReads() { + public Collection getOverlappingReads() { return overlappingReads; } + /** + * Adds a pileup element (read / offset pair) to this consensus site. Assumes + * that the same element isn't added to site more than once. + * @param elt + */ public void addOverlappingRead(PileupElement elt) { overlappingReads.add(elt); counts.incr(elt.getBase()); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java index c40430594..9c5f28f6d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/reducereads/SingleSampleConsensusReadCompressor.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.reducereads; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.*; import org.apache.commons.math.stat.descriptive.summary.Sum; import org.apache.log4j.Logger; @@ -323,7 +325,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres List reads = new ArrayList(); for ( ConsensusSpan span : spans ) { - logger.info("Span is " + span); + //logger.info("Span is " + span); if ( span.isConserved() ) reads.addAll(conservedSpanReads(sites, span)); else @@ -416,17 +418,25 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres return Collections.singletonList(consensus); } + @Requires({"sites != null", "span.isVariable()"}) + @Ensures("result != null") private Collection variableSpanReads(List sites, ConsensusSpan span) { - Set reads = new HashSet(); + Collection reads = new LinkedList(); + Set readNames = new HashSet(); - // todo -- this code is grossly inefficient, as it checks each variable read at each site in the span for ( int i = 0; i < span.size(); i++ ) { int refI = i + span.getOffsetFromStartOfSites(); - ConsensusSite site = sites.get(refI); - for ( PileupElement p : site.getOverlappingReads() ) { - SAMRecord read = clipReadToSpan(p.getRead(), span); - if ( keepClippedReadInVariableSpan(p.getRead(), read) ) - reads.add(read); + + for ( PileupElement p : sites.get(refI).getOverlappingReads() ) { + if ( readNames.contains(p.getRead().getReadName()) ) { + ; + //logger.info("Rejecting already seen read: " + p.getRead().getReadName()); + } else { + readNames.add(p.getRead().getReadName()); + SAMRecord read = clipReadToSpan(p.getRead(), span); + if ( keepClippedReadInVariableSpan(p.getRead(), read) ) + reads.add(read); + } } } diff --git a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala index 41e5f8334..bf0231f4d 100755 --- a/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala +++ b/scala/qscript/oneoffs/depristo/ReducedBAMEvaluation.scala @@ -37,7 +37,7 @@ class ReducedBAMEvaluation extends QScript { } trait CoFoJa extends JavaCommandLineFunction { - override def javaOpts = super.javaOpts + " -javaagent:lib/cofoja.jar" + override def javaOpts = super.javaOpts // + " -javaagent:lib/cofoja.jar" } def script = {