diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java index 3919d76dc..687dc2c0a 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java @@ -1,16 +1,16 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; import org.broadinstitute.sting.utils.BaseUtils; +import com.google.java.contract.*; /** * Created by IntelliJ IDEA. * User: depristo * Date: 4/8/11 * Time: 2:55 PM -* To change this template use File | Settings | File Templates. */ final class BaseCounts { - int counts[] = new int[4]; // fixme -- include - and I events + private final int counts[] = new int[4]; // fixme -- include - and I events public void incr(byte base) { int baseI = BaseUtils.simpleBaseToBaseIndex(base); @@ -18,14 +18,17 @@ final class BaseCounts { counts[baseI]++; } + @Ensures("BaseUtils.isRegularBase(result)") public byte baseWithMostCounts() { return BaseUtils.baseIndexToSimpleBase(maxBaseIndex()); } + @Ensures("result >= 0") public int countOfMostCommonBase() { return counts[maxBaseIndex()]; } + @Ensures("result >= 0") public int totalCounts() { int sum = 0; @@ -36,6 +39,7 @@ final class BaseCounts { return sum; } + @Ensures("result >= 0 && result < counts.length") private int maxBaseIndex() { int maxI = 0; for ( int i = 0; i < counts.length; i++) { @@ -46,6 +50,7 @@ final class BaseCounts { return maxI; } + @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); for ( int i = 0; i < counts.length; i++ ) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java index 2ef27226a..74081f01d 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java @@ -1,22 +1,36 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; +import com.google.java.contract.*; import net.sf.samtools.SAMRecord; -import java.util.Collection; -import java.util.Iterator; - /** * Created by IntelliJ IDEA. * User: depristo * Date: 4/10/11 * Time: 8:49 AM - * To change this template use File | Settings | File Templates. + * + * A general interface for ReadCompressors. Read compressors have the following semantics: + * + * The accept a stream of reads, in order, and after each added read returns a compressed stream + * of reads for emission. This stream of reads is a "reduced" representation of the total stream + * of reads. The actual compression approach is left up to the implementing class. */ -public interface ConsensusReadCompressor { // extends Iterable { +public interface ConsensusReadCompressor { + /** + * Adds the read to the compressor. The returned iteratable collection of + * reads represents the incremental compressed output. + * @param read the next uncompressed read in the input stream to the compressor + * @return an iterator over the incrementally available compressed reads + */ + @Requires("read != null") + @Ensures("result != null") Iterable addAlignment(SAMRecord read); - Iterable close(); -//Collection consensusReads(); -// @Override -// Iterator iterator(); + /** + * Must be called after the last read has been added to finalize the compressor state + * and return the last compressed reads from the compressor. + * @return an iterator over the final compressed reads of this compressor + */ + @Ensures("result != null") + Iterable close(); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java index 97e1fa70a..041826f56 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java @@ -18,7 +18,7 @@ import java.util.Set; * User: depristo * Date: 4/8/11 * Time: 3:01 PM -* To change this template use File | Settings | File Templates. +* */ final class ConsensusSite { final GenomeLoc loc; @@ -26,7 +26,7 @@ final class ConsensusSite { final int offset; final BaseCounts counts = new BaseCounts(); - ConsensusType markedType = null; + ConsensusSpan.Type markedType = null; public ConsensusSite(GenomeLoc loc, int offset) { this.loc = loc; @@ -88,14 +88,12 @@ final class ConsensusSite { return counts.toString(); } - public void setMarkedType(ConsensusType markedType) { + public void setMarkedType(ConsensusSpan.Type markedType) { this.markedType = markedType; } - public ConsensusType getMarkedType() { + public ConsensusSpan.Type getMarkedType() { if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!"); return markedType; } - - } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java index 33b196767..6f3c0edc1 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java @@ -10,11 +10,30 @@ import org.broadinstitute.sting.utils.GenomeLoc; * To change this template use File | Settings | File Templates. */ final class ConsensusSpan { + + /** + * The type of an span is either conserved (little variability within the span) or + * variable (too many differences among the reads in the span to determine the exact + * haplotype sequence). + */ + public enum Type { + CONSERVED, VARIABLE; + + public static Type otherType(Type t) { + switch ( t ) { + case CONSERVED: return VARIABLE; + case VARIABLE: return CONSERVED; + } + return CONSERVED; + } + } + + final int refStart; // the start position on the reference for relative calculations final GenomeLoc loc; - final ConsensusType consensusType; + final Type consensusType; - public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusType consensusType) { + public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusSpan.Type consensusType) { this.refStart = refStart; this.loc = loc; this.consensusType = consensusType; @@ -32,7 +51,7 @@ final class ConsensusSpan { return loc.getStop(); } - public ConsensusType getConsensusType() { + public ConsensusSpan.Type getConsensusType() { return consensusType; } @@ -40,8 +59,8 @@ final class ConsensusSpan { return getGenomeStop() - getGenomeStart() + 1; } - public boolean isConserved() { return getConsensusType() == ConsensusType.CONSERVED; } - public boolean isVariable() { return getConsensusType() == ConsensusType.VARIABLE; } + public boolean isConserved() { return getConsensusType() == Type.CONSERVED; } + public boolean isVariable() { return getConsensusType() == Type.VARIABLE; } public String toString() { return String.format("%s %s", consensusType, loc); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java deleted file mode 100644 index 0e69e7c5d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java +++ /dev/null @@ -1,20 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; - -/** -* Created by IntelliJ IDEA. -* User: depristo -* Date: 4/9/11 -* Time: 7:52 PM -* To change this template use File | Settings | File Templates. -*/ -public enum ConsensusType { - CONSERVED, VARIABLE; - - public static ConsensusType otherType(ConsensusType t) { - switch ( t ) { - case CONSERVED: return VARIABLE; - case VARIABLE: return CONSERVED; - } - return CONSERVED; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java index bb3540906..ab87290e2 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -49,17 +50,26 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress public MultiSampleConsensusReadCompressor(SAMFileHeader header, final int readContextSize, final GenomeLocParser glParser, - final String contig, final int minBpForRunningConsensus, final int maxReadsAtVariableSites) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, - new SingleSampleConsensusReadCompressor(readContextSize, - glParser, contig, minBpForRunningConsensus, maxReadsAtVariableSites)); + new SingleSampleConsensusReadCompressor(name, readContextSize, + glParser, minBpForRunningConsensus, maxReadsAtVariableSites)); // todo -- argument for minConsensusSize } } + public Collection getReducedReadGroups() { + List rgs = new ArrayList(); + + for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) { + rgs.add(comp.getReducedReadGroup()); + } + + return rgs; + } + @Override public Iterable addAlignment(SAMRecord read) { String sample = read.getReadGroup().getSample(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java index e0984988a..a1610cc52 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; @@ -69,10 +70,20 @@ public class ReduceReadsWalker extends ReadWalker addAlignment( SAMRecord read ) { + if ( contig == null ) + contig = read.getReferenceName(); + if ( ! read.getReferenceName().equals(contig) ) + throw new ReviewedStingException("ConsensusRead system doesn't support multiple contig processing right now"); + if ( header == null ) header = read.getHeader(); @@ -200,7 +225,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres private List expandVariableSites(List sites) { for ( ConsensusSite site : sites ) - site.setMarkedType(ConsensusType.CONSERVED); + site.setMarkedType(ConsensusSpan.Type.CONSERVED); for ( int i = 0; i < sites.size(); i++ ) { ConsensusSite site = sites.get(i); @@ -209,7 +234,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres int stop = Math.min(sites.size(), i + readContextSize + 1); for ( int j = start; j < stop; j++ ) { // aggressive tagging -- you are only conserved if you are never variable - sites.get(j).setMarkedType(ConsensusType.VARIABLE); + sites.get(j).setMarkedType(ConsensusSpan.Type.VARIABLE); } } } @@ -223,7 +248,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres int start = 0; // our first span type is the type of the first site - ConsensusType consensusType = sites.get(0).getMarkedType(); + ConsensusSpan.Type consensusType = sites.get(0).getMarkedType(); while ( start < sites.size() ) { ConsensusSpan span = findSpan(sites, start, consensusType); @@ -234,20 +259,20 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres start += span.size(); } - consensusType = ConsensusType.otherType(consensusType); + consensusType = ConsensusSpan.Type.otherType(consensusType); } return spans; } - private ConsensusSpan findSpan(List sites, int start, ConsensusType consensusType) { + private ConsensusSpan findSpan(List sites, int start, ConsensusSpan.Type consensusType) { int refStart = sites.get(0).getLoc().getStart(); for ( int end = start + 1; end < sites.size(); end++ ) { ConsensusSite site = sites.get(end); - boolean conserved = site.getMarkedType() == ConsensusType.CONSERVED; - if ( (consensusType == ConsensusType.CONSERVED && ! conserved) || - (consensusType == ConsensusType.VARIABLE && conserved) || + boolean conserved = site.getMarkedType() == ConsensusSpan.Type.CONSERVED; + if ( (consensusType == ConsensusSpan.Type.CONSERVED && ! conserved) || + (consensusType == ConsensusSpan.Type.VARIABLE && conserved) || end + 1 == sites.size() ) { // we are done with the complete interval GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1); return new ConsensusSpan(refStart, loc, consensusType); @@ -319,7 +344,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres for ( int i = 0; i < span.size(); i++ ) { int refI = i + span.getOffsetFromStartOfSites(); ConsensusSite site = sites.get(refI); - if ( site.getMarkedType() == ConsensusType.VARIABLE ) + if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE ) throw new ReviewedStingException("Variable site included in consensus: " + site); final int count = site.counts.countOfMostCommonBase(); final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts(); @@ -328,10 +353,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres } SAMRecord consensus = new SAMRecord(header); + consensus.setAttribute("RG", reducedReadGroup.getId()); + consensus.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, Integer.valueOf(REDUCED_READ_BASE_QUALITY)); consensus.setReferenceName(contig); - consensus.setReadName("Mark"); - consensus.setCigarString(String.format("%dM", span.size())); + consensus.setReadName(String.format("%s.read.%d", reducedReadGroup.getId(), consensusCounter++)); consensus.setReadPairedFlag(false); + consensus.setReadUnmappedFlag(false); + consensus.setCigarString(String.format("%dM", span.size())); consensus.setAlignmentStart(span.getGenomeStart()); consensus.setReadBases(bases); consensus.setBaseQualities(quals); @@ -347,15 +375,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres private Collection variableSpanReads(List sites, ConsensusSpan span) { Set reads = new HashSet(); - int minNumBasesInRead = (int)Math.floor(MIN_FRACT_BASES_FOR_VARIABLE_READ * span.size()); + // todo -- this code is grossly inefficient, as it checks each variable read at each site in the span for ( int i = 0; i < span.size(); i++ ) { int refI = i + span.getOffsetFromStartOfSites(); ConsensusSite site = sites.get(refI); for ( PileupElement p : site.getOverlappingReads() ) { -// if ( reads.contains(p.getRead())) -// logger.info("Skipping already added read: " + p.getRead().getReadName()); SAMRecord read = clipReadToSpan(p.getRead(), span); - if ( read.getReadLength() >= minNumBasesInRead ) + if ( keepClippedReadInVariableSpan(p.getRead(), read) ) reads.add(read); } } @@ -363,6 +389,15 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres return reads; } + private final static boolean keepClippedReadInVariableSpan(SAMRecord originalRead, SAMRecord variableRead) { + int originalReadLength = originalRead.getReadLength(); + int variableReadLength = variableRead.getReadLength(); + + return variableReadLength >= MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ; +// && +// ((1.0 * variableReadLength) / originalReadLength) >= MIN_FRACT_BASES_FOR_VARIABLE_READ; + } + private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) { ReadClipper clipper = new ReadClipper(read); int spanStart = span.getGenomeStart(); @@ -371,7 +406,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres for ( RefPileupElement p : RefPileupElement.walkRead(read) ) { if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) { - clipper.addOp(new ClippingOp(0, p.getOffset())); + clipper.addOp(new ClippingOp(0, p.getOffset() - 1)); } if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) { diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 2394e34dc..080762039 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -41,6 +41,8 @@ import java.io.File; * @version 0.1 */ public class ReadUtils { + public static final String REDUCED_READ_QUALITY_TAG = "RQ"; + private ReadUtils() { } public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) {