Intermediate commit (DVCS, where are you?) of a fully operational ReducedRead walker. Now results in minor differences in the raw calls (filtering is a different matter) in an exome but 20x less disk space than the full exome data. Changes to the UG necessary to process reduced reads are not yet committed, as they are being tested. This code is being moved to playground now.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5924 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-06-02 21:13:31 +00:00
parent dd6d61c031
commit 429833c05a
9 changed files with 140 additions and 68 deletions

View File

@ -1,16 +1,16 @@
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
import org.broadinstitute.sting.utils.BaseUtils;
import com.google.java.contract.*;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/8/11
* Time: 2:55 PM
* To change this template use File | Settings | File Templates.
*/
final class BaseCounts {
int counts[] = new int[4]; // fixme -- include - and I events
private final int counts[] = new int[4]; // fixme -- include - and I events
public void incr(byte base) {
int baseI = BaseUtils.simpleBaseToBaseIndex(base);
@ -18,14 +18,17 @@ final class BaseCounts {
counts[baseI]++;
}
@Ensures("BaseUtils.isRegularBase(result)")
public byte baseWithMostCounts() {
return BaseUtils.baseIndexToSimpleBase(maxBaseIndex());
}
@Ensures("result >= 0")
public int countOfMostCommonBase() {
return counts[maxBaseIndex()];
}
@Ensures("result >= 0")
public int totalCounts() {
int sum = 0;
@ -36,6 +39,7 @@ final class BaseCounts {
return sum;
}
@Ensures("result >= 0 && result < counts.length")
private int maxBaseIndex() {
int maxI = 0;
for ( int i = 0; i < counts.length; i++) {
@ -46,6 +50,7 @@ final class BaseCounts {
return maxI;
}
@Ensures("result != null")
public String toString() {
StringBuilder b = new StringBuilder();
for ( int i = 0; i < counts.length; i++ ) {

View File

@ -1,22 +1,36 @@
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
import com.google.java.contract.*;
import net.sf.samtools.SAMRecord;
import java.util.Collection;
import java.util.Iterator;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/10/11
* Time: 8:49 AM
* To change this template use File | Settings | File Templates.
*
* A general interface for ReadCompressors. Read compressors have the following semantics:
*
* They accept a stream of reads, in order, and after each added read return a compressed stream
* of reads for emission. This stream of reads is a "reduced" representation of the total stream
* of reads. The actual compression approach is left up to the implementing class.
*/
public interface ConsensusReadCompressor { // extends Iterable<SAMRecord> {
public interface ConsensusReadCompressor {
/**
* Adds the read to the compressor. The returned iterable collection of
* reads represents the incremental compressed output.
* @param read the next uncompressed read in the input stream to the compressor
* @return an iterator over the incrementally available compressed reads
*/
@Requires("read != null")
@Ensures("result != null")
Iterable<SAMRecord> addAlignment(SAMRecord read);
Iterable<SAMRecord> close();
//Collection<SAMRecord> consensusReads();
// @Override
// Iterator<SAMRecord> iterator();
/**
* Must be called after the last read has been added to finalize the compressor state
* and return the last compressed reads from the compressor.
* @return an iterator over the final compressed reads of this compressor
*/
@Ensures("result != null")
Iterable<SAMRecord> close();
}

View File

@ -18,7 +18,7 @@ import java.util.Set;
* User: depristo
* Date: 4/8/11
* Time: 3:01 PM
* To change this template use File | Settings | File Templates.
*
*/
final class ConsensusSite {
final GenomeLoc loc;
@ -26,7 +26,7 @@ final class ConsensusSite {
final int offset;
final BaseCounts counts = new BaseCounts();
ConsensusType markedType = null;
ConsensusSpan.Type markedType = null;
public ConsensusSite(GenomeLoc loc, int offset) {
this.loc = loc;
@ -88,14 +88,12 @@ final class ConsensusSite {
return counts.toString();
}
public void setMarkedType(ConsensusType markedType) {
public void setMarkedType(ConsensusSpan.Type markedType) {
this.markedType = markedType;
}
public ConsensusType getMarkedType() {
public ConsensusSpan.Type getMarkedType() {
if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!");
return markedType;
}
}

View File

@ -10,11 +10,30 @@ import org.broadinstitute.sting.utils.GenomeLoc;
* To change this template use File | Settings | File Templates.
*/
final class ConsensusSpan {
/**
* The type of a span is either conserved (little variability within the span) or
* variable (too many differences among the reads in the span to determine the exact
* haplotype sequence).
*/
public enum Type {
CONSERVED, VARIABLE;
public static Type otherType(Type t) {
switch ( t ) {
case CONSERVED: return VARIABLE;
case VARIABLE: return CONSERVED;
}
return CONSERVED;
}
}
final int refStart; // the start position on the reference for relative calculations
final GenomeLoc loc;
final ConsensusType consensusType;
final Type consensusType;
public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusType consensusType) {
public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusSpan.Type consensusType) {
this.refStart = refStart;
this.loc = loc;
this.consensusType = consensusType;
@ -32,7 +51,7 @@ final class ConsensusSpan {
return loc.getStop();
}
public ConsensusType getConsensusType() {
public ConsensusSpan.Type getConsensusType() {
return consensusType;
}
@ -40,8 +59,8 @@ final class ConsensusSpan {
return getGenomeStop() - getGenomeStart() + 1;
}
public boolean isConserved() { return getConsensusType() == ConsensusType.CONSERVED; }
public boolean isVariable() { return getConsensusType() == ConsensusType.VARIABLE; }
public boolean isConserved() { return getConsensusType() == Type.CONSERVED; }
public boolean isVariable() { return getConsensusType() == Type.VARIABLE; }
public String toString() {
return String.format("%s %s", consensusType, loc);

View File

@ -1,20 +0,0 @@
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: 4/9/11
* Time: 7:52 PM
* To change this template use File | Settings | File Templates.
*/
public enum ConsensusType {
    CONSERVED, VARIABLE;

    /**
     * Returns the complementary consensus classification.
     *
     * <p>CONSERVED maps to VARIABLE and VARIABLE maps to CONSERVED. A null
     * argument throws a NullPointerException (from the switch dispatch),
     * matching the original dispatch semantics.
     *
     * @param type the classification to invert
     * @return the other classification
     */
    public static ConsensusType otherType(ConsensusType type) {
        switch ( type ) {
            case VARIABLE:
                return CONSERVED;
            case CONSERVED:
            default:
                // default is unreachable with only two constants; it exists to
                // satisfy the compiler in place of the original dead fallthrough.
                return VARIABLE;
        }
    }
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -49,17 +50,26 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress
public MultiSampleConsensusReadCompressor(SAMFileHeader header,
final int readContextSize,
final GenomeLocParser glParser,
final String contig,
final int minBpForRunningConsensus,
final int maxReadsAtVariableSites) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name,
new SingleSampleConsensusReadCompressor(readContextSize,
glParser, contig, minBpForRunningConsensus, maxReadsAtVariableSites));
new SingleSampleConsensusReadCompressor(name, readContextSize,
glParser, minBpForRunningConsensus, maxReadsAtVariableSites));
// todo -- argument for minConsensusSize
}
}
public Collection<SAMReadGroupRecord> getReducedReadGroups() {
List<SAMReadGroupRecord> rgs = new ArrayList<SAMReadGroupRecord>();
for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) {
rgs.add(comp.getReducedReadGroup());
}
return rgs;
}
@Override
public Iterable<SAMRecord> addAlignment(SAMRecord read) {
String sample = read.getReadGroup().getSample();

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
@ -69,10 +70,20 @@ public class ReduceReadsWalker extends ReadWalker<SAMRecord, ConsensusReadCompre
protected int totalReads = 0;
int nCompressedReads = 0;
MultiSampleConsensusReadCompressor compressor;
@Override
public void initialize() {
super.initialize();
compressor = new MultiSampleConsensusReadCompressor(getToolkit().getSAMFileHeader(),
contextSize, getToolkit().getGenomeLocParser(),
minBpForRunningConsensus, maxReadsAtVariableSites);
out.setPresorted(false);
for ( SAMReadGroupRecord rg : compressor.getReducedReadGroups())
out.getFileHeader().addReadGroup(rg);
}
@Override
@ -89,9 +100,7 @@ public class ReduceReadsWalker extends ReadWalker<SAMRecord, ConsensusReadCompre
*/
@Override
public ConsensusReadCompressor reduceInit() {
return new MultiSampleConsensusReadCompressor(getToolkit().getSAMFileHeader(),
contextSize, getToolkit().getGenomeLocParser(), "20", // todo fixme
minBpForRunningConsensus, maxReadsAtVariableSites);
return compressor;
}
/**

View File

@ -58,6 +58,9 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
private static final double MAX_FRACTION_DISAGREEING_BASES = 0.1;
private static final ClippingRepresentation VARIABLE_READ_REPRESENTATION = ClippingRepresentation.SOFTCLIP_BASES;
private static final double MIN_FRACT_BASES_FOR_VARIABLE_READ = 0.33; // todo -- should be variable
private static final int MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ = 10;
protected final static String RG_POSTFIX = ".ReducedReads";
public final static int REDUCED_READ_BASE_QUALITY = 30;
// todo -- should merge close together spans
@ -68,22 +71,39 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
final int maxReadsAtVariableSites;
final int minBpForRunningConsensus;
int retryTimer = 0;
int consensusCounter = 0;
final String contig;
final SAMReadGroupRecord reducedReadGroup;
String contig = null;
final GenomeLocParser glParser;
SAMFileHeader header;
GenomeLoc lastProcessedRegion = null;
public SingleSampleConsensusReadCompressor(final int readContextSize,
public SingleSampleConsensusReadCompressor(final String sampleName,
final int readContextSize,
final GenomeLocParser glParser,
final String contig,
final int minBpForRunningConsensus,
final int maxReadsAtVariableSites) {
this.readContextSize = readContextSize;
this.glParser = glParser;
this.contig = contig;
this.minBpForRunningConsensus = minBpForRunningConsensus;
this.maxReadsAtVariableSites = maxReadsAtVariableSites;
this.reducedReadGroup = createReducedReadGroup(sampleName);
}
/**
* Helper function to create a read group for these reduced reads
* @param sampleName
* @return
*/
private static final SAMReadGroupRecord createReducedReadGroup(final String sampleName) {
SAMReadGroupRecord rg = new SAMReadGroupRecord(sampleName + RG_POSTFIX);
rg.setSample(sampleName);
return rg;
}
public SAMReadGroupRecord getReducedReadGroup() {
return reducedReadGroup;
}
// ------------------------------------------------------------------------------------------
@ -97,6 +117,11 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
*/
@Override
public Iterable<SAMRecord> addAlignment( SAMRecord read ) {
if ( contig == null )
contig = read.getReferenceName();
if ( ! read.getReferenceName().equals(contig) )
throw new ReviewedStingException("ConsensusRead system doesn't support multiple contig processing right now");
if ( header == null )
header = read.getHeader();
@ -200,7 +225,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
private List<ConsensusSite> expandVariableSites(List<ConsensusSite> sites) {
for ( ConsensusSite site : sites )
site.setMarkedType(ConsensusType.CONSERVED);
site.setMarkedType(ConsensusSpan.Type.CONSERVED);
for ( int i = 0; i < sites.size(); i++ ) {
ConsensusSite site = sites.get(i);
@ -209,7 +234,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
int stop = Math.min(sites.size(), i + readContextSize + 1);
for ( int j = start; j < stop; j++ ) {
// aggressive tagging -- you are only conserved if you are never variable
sites.get(j).setMarkedType(ConsensusType.VARIABLE);
sites.get(j).setMarkedType(ConsensusSpan.Type.VARIABLE);
}
}
}
@ -223,7 +248,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
int start = 0;
// our first span type is the type of the first site
ConsensusType consensusType = sites.get(0).getMarkedType();
ConsensusSpan.Type consensusType = sites.get(0).getMarkedType();
while ( start < sites.size() ) {
ConsensusSpan span = findSpan(sites, start, consensusType);
@ -234,20 +259,20 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
start += span.size();
}
consensusType = ConsensusType.otherType(consensusType);
consensusType = ConsensusSpan.Type.otherType(consensusType);
}
return spans;
}
private ConsensusSpan findSpan(List<ConsensusSite> sites, int start, ConsensusType consensusType) {
private ConsensusSpan findSpan(List<ConsensusSite> sites, int start, ConsensusSpan.Type consensusType) {
int refStart = sites.get(0).getLoc().getStart();
for ( int end = start + 1; end < sites.size(); end++ ) {
ConsensusSite site = sites.get(end);
boolean conserved = site.getMarkedType() == ConsensusType.CONSERVED;
if ( (consensusType == ConsensusType.CONSERVED && ! conserved) ||
(consensusType == ConsensusType.VARIABLE && conserved) ||
boolean conserved = site.getMarkedType() == ConsensusSpan.Type.CONSERVED;
if ( (consensusType == ConsensusSpan.Type.CONSERVED && ! conserved) ||
(consensusType == ConsensusSpan.Type.VARIABLE && conserved) ||
end + 1 == sites.size() ) { // we are done with the complete interval
GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1);
return new ConsensusSpan(refStart, loc, consensusType);
@ -319,7 +344,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
for ( int i = 0; i < span.size(); i++ ) {
int refI = i + span.getOffsetFromStartOfSites();
ConsensusSite site = sites.get(refI);
if ( site.getMarkedType() == ConsensusType.VARIABLE )
if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE )
throw new ReviewedStingException("Variable site included in consensus: " + site);
final int count = site.counts.countOfMostCommonBase();
final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts();
@ -328,10 +353,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
}
SAMRecord consensus = new SAMRecord(header);
consensus.setAttribute("RG", reducedReadGroup.getId());
consensus.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, Integer.valueOf(REDUCED_READ_BASE_QUALITY));
consensus.setReferenceName(contig);
consensus.setReadName("Mark");
consensus.setCigarString(String.format("%dM", span.size()));
consensus.setReadName(String.format("%s.read.%d", reducedReadGroup.getId(), consensusCounter++));
consensus.setReadPairedFlag(false);
consensus.setReadUnmappedFlag(false);
consensus.setCigarString(String.format("%dM", span.size()));
consensus.setAlignmentStart(span.getGenomeStart());
consensus.setReadBases(bases);
consensus.setBaseQualities(quals);
@ -347,15 +375,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
private Collection<SAMRecord> variableSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
Set<SAMRecord> reads = new HashSet<SAMRecord>();
int minNumBasesInRead = (int)Math.floor(MIN_FRACT_BASES_FOR_VARIABLE_READ * span.size());
// todo -- this code is grossly inefficient, as it checks each variable read at each site in the span
for ( int i = 0; i < span.size(); i++ ) {
int refI = i + span.getOffsetFromStartOfSites();
ConsensusSite site = sites.get(refI);
for ( PileupElement p : site.getOverlappingReads() ) {
// if ( reads.contains(p.getRead()))
// logger.info("Skipping already added read: " + p.getRead().getReadName());
SAMRecord read = clipReadToSpan(p.getRead(), span);
if ( read.getReadLength() >= minNumBasesInRead )
if ( keepClippedReadInVariableSpan(p.getRead(), read) )
reads.add(read);
}
}
@ -363,6 +389,15 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
return reads;
}
private final static boolean keepClippedReadInVariableSpan(SAMRecord originalRead, SAMRecord variableRead) {
int originalReadLength = originalRead.getReadLength();
int variableReadLength = variableRead.getReadLength();
return variableReadLength >= MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ;
// &&
// ((1.0 * variableReadLength) / originalReadLength) >= MIN_FRACT_BASES_FOR_VARIABLE_READ;
}
private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) {
ReadClipper clipper = new ReadClipper(read);
int spanStart = span.getGenomeStart();
@ -371,7 +406,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres
for ( RefPileupElement p : RefPileupElement.walkRead(read) ) {
if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) {
clipper.addOp(new ClippingOp(0, p.getOffset()));
clipper.addOp(new ClippingOp(0, p.getOffset() - 1));
}
if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) {

View File

@ -41,6 +41,8 @@ import java.io.File;
* @version 0.1
*/
public class ReadUtils {
public static final String REDUCED_READ_QUALITY_TAG = "RQ";
private ReadUtils() { }
public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) {