Significant performance optimization for reduced reads due to better algorithm for including reads in the variable regions. Fixed a critical bug that actually produced multiple copies of the same read in the variable regions with this optimization as well. Scala exploration script updated as well.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6005 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-06-16 12:54:59 +00:00
parent 38d7733989
commit 43fdd31e20
3 changed files with 27 additions and 16 deletions

View File

@ -8,10 +8,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import java.util.ArrayList; import java.util.*;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
@ -48,7 +45,7 @@ import java.util.Set;
* on the reference genome, or is a dummy site that is only used to calculate insertion statistics * on the reference genome, or is a dummy site that is only used to calculate insertion statistics
*/ */
final class ConsensusSite { final class ConsensusSite {
final Set<PileupElement> overlappingReads = new HashSet<PileupElement>(); final Collection<PileupElement> overlappingReads = new LinkedList<PileupElement>();
final int offset, position; final int offset, position;
final BaseCounts counts = new BaseCounts(); final BaseCounts counts = new BaseCounts();
@ -57,17 +54,21 @@ final class ConsensusSite {
public ConsensusSite(int position, int offset) { public ConsensusSite(int position, int offset) {
this.position = position; this.position = position;
this.offset = offset; this.offset = offset;
} }
public int getPosition() { public int getPosition() {
return position; return position;
} }
public Set<PileupElement> getOverlappingReads() { public Collection<PileupElement> getOverlappingReads() {
return overlappingReads; return overlappingReads;
} }
/**
* Adds a pileup element (read / offset pair) to this consensus site. Assumes
* that the same element isn't added to site more than once.
* @param elt
*/
public void addOverlappingRead(PileupElement elt) { public void addOverlappingRead(PileupElement elt) {
overlappingReads.add(elt); overlappingReads.add(elt);
counts.incr(elt.getBase()); counts.incr(elt.getBase());

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.reducereads; package org.broadinstitute.sting.playground.gatk.walkers.reducereads;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.apache.commons.math.stat.descriptive.summary.Sum; import org.apache.commons.math.stat.descriptive.summary.Sum;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -323,7 +325,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
List<SAMRecord> reads = new ArrayList<SAMRecord>(); List<SAMRecord> reads = new ArrayList<SAMRecord>();
for ( ConsensusSpan span : spans ) { for ( ConsensusSpan span : spans ) {
logger.info("Span is " + span); //logger.info("Span is " + span);
if ( span.isConserved() ) if ( span.isConserved() )
reads.addAll(conservedSpanReads(sites, span)); reads.addAll(conservedSpanReads(sites, span));
else else
@ -416,19 +418,27 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
return Collections.singletonList(consensus); return Collections.singletonList(consensus);
} }
@Requires({"sites != null", "span.isVariable()"})
@Ensures("result != null")
private Collection<SAMRecord> variableSpanReads(List<ConsensusSite> sites, ConsensusSpan span) { private Collection<SAMRecord> variableSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
Set<SAMRecord> reads = new HashSet<SAMRecord>(); Collection<SAMRecord> reads = new LinkedList<SAMRecord>();
Set<String> readNames = new HashSet<String>();
// todo -- this code is grossly inefficient, as it checks each variable read at each site in the span
for ( int i = 0; i < span.size(); i++ ) { for ( int i = 0; i < span.size(); i++ ) {
int refI = i + span.getOffsetFromStartOfSites(); int refI = i + span.getOffsetFromStartOfSites();
ConsensusSite site = sites.get(refI);
for ( PileupElement p : site.getOverlappingReads() ) { for ( PileupElement p : sites.get(refI).getOverlappingReads() ) {
if ( readNames.contains(p.getRead().getReadName()) ) {
;
//logger.info("Rejecting already seen read: " + p.getRead().getReadName());
} else {
readNames.add(p.getRead().getReadName());
SAMRecord read = clipReadToSpan(p.getRead(), span); SAMRecord read = clipReadToSpan(p.getRead(), span);
if ( keepClippedReadInVariableSpan(p.getRead(), read) ) if ( keepClippedReadInVariableSpan(p.getRead(), read) )
reads.add(read); reads.add(read);
} }
} }
}
return reads; return reads;
} }

View File

@ -37,7 +37,7 @@ class ReducedBAMEvaluation extends QScript {
} }
trait CoFoJa extends JavaCommandLineFunction { trait CoFoJa extends JavaCommandLineFunction {
override def javaOpts = super.javaOpts + " -javaagent:lib/cofoja.jar" override def javaOpts = super.javaOpts // + " -javaagent:lib/cofoja.jar"
} }
def script = { def script = {