From 429833c05ae94dc09c8423cc7181ed6363cd1c5b Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 2 Jun 2011 21:13:31 +0000 Subject: [PATCH] Intermediate commit (DVCS, where are you?) of a fully operational ReducedRead walker. Now results in minor differences in the raw calls (filtering is a different matter) in an exome but 20x less disk space than the full exome data. Changes to the UG necessary to process reduced reads are not yet committed, as they are being tested. This code is being moved to playground now. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5924 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/reducereads/BaseCounts.java | 9 ++- .../reducereads/ConsensusReadCompressor.java | 32 +++++--- .../walkers/reducereads/ConsensusSite.java | 10 +-- .../walkers/reducereads/ConsensusSpan.java | 29 +++++-- .../walkers/reducereads/ConsensusType.java | 20 ----- .../MultiSampleConsensusReadCompressor.java | 16 +++- .../reducereads/ReduceReadsWalker.java | 15 +++- .../SingleSampleConsensusReadCompressor.java | 75 ++++++++++++++----- .../sting/utils/sam/ReadUtils.java | 2 + 9 files changed, 140 insertions(+), 68 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java index 3919d76dc..687dc2c0a 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java @@ -1,16 +1,16 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; import org.broadinstitute.sting.utils.BaseUtils; +import com.google.java.contract.*; /** * Created by IntelliJ IDEA. * User: depristo * Date: 4/8/11 * Time: 2:55 PM -* To change this template use File | Settings | File Templates. */ final class BaseCounts { - int counts[] = new int[4]; // fixme -- include - and I events + private final int counts[] = new int[4]; // fixme -- include - and I events public void incr(byte base) { int baseI = BaseUtils.simpleBaseToBaseIndex(base); @@ -18,14 +18,17 @@ final class BaseCounts { counts[baseI]++; } + @Ensures("BaseUtils.isRegularBase(result)") public byte baseWithMostCounts() { return BaseUtils.baseIndexToSimpleBase(maxBaseIndex()); } + @Ensures("result >= 0") public int countOfMostCommonBase() { return counts[maxBaseIndex()]; } + @Ensures("result >= 0") public int totalCounts() { int sum = 0; @@ -36,6 +39,7 @@ final class BaseCounts { return sum; } + @Ensures("result >= 0 && result < counts.length") private int maxBaseIndex() { int maxI = 0; for ( int i = 0; i < counts.length; i++) { @@ -46,6 +50,7 @@ final class BaseCounts { return maxI; } + @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); for ( int i = 0; i < counts.length; i++ ) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java index 2ef27226a..74081f01d 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java @@ -1,22 +1,36 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; +import com.google.java.contract.*; import net.sf.samtools.SAMRecord; -import java.util.Collection; -import java.util.Iterator; - /** * Created by IntelliJ IDEA. * User: depristo * Date: 4/10/11 * Time: 8:49 AM - * To change this template use File | Settings | File Templates. + * + * A general interface for ReadCompressors. Read compressors have the following semantics: + * + * The accept a stream of reads, in order, and after each added read returns a compressed stream + * of reads for emission. This stream of reads is a "reduced" representation of the total stream + * of reads. The actual compression approach is left up to the implementing class. */ -public interface ConsensusReadCompressor { // extends Iterable { +public interface ConsensusReadCompressor { + /** + * Adds the read to the compressor. The returned iteratable collection of + * reads represents the incremental compressed output. + * @param read the next uncompressed read in the input stream to the compressor + * @return an iterator over the incrementally available compressed reads + */ + @Requires("read != null") + @Ensures("result != null") Iterable addAlignment(SAMRecord read); - Iterable close(); -//Collection consensusReads(); -// @Override -// Iterator iterator(); + /** + * Must be called after the last read has been added to finalize the compressor state + * and return the last compressed reads from the compressor. + * @return an iterator over the final compressed reads of this compressor + */ + @Ensures("result != null") + Iterable close(); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java index 97e1fa70a..041826f56 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java @@ -18,7 +18,7 @@ import java.util.Set; * User: depristo * Date: 4/8/11 * Time: 3:01 PM -* To change this template use File | Settings | File Templates. +* */ final class ConsensusSite { final GenomeLoc loc; @@ -26,7 +26,7 @@ final class ConsensusSite { final int offset; final BaseCounts counts = new BaseCounts(); - ConsensusType markedType = null; + ConsensusSpan.Type markedType = null; public ConsensusSite(GenomeLoc loc, int offset) { this.loc = loc; @@ -88,14 +88,12 @@ final class ConsensusSite { return counts.toString(); } - public void setMarkedType(ConsensusType markedType) { + public void setMarkedType(ConsensusSpan.Type markedType) { this.markedType = markedType; } - public ConsensusType getMarkedType() { + public ConsensusSpan.Type getMarkedType() { if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!"); return markedType; } - - } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java index 33b196767..6f3c0edc1 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java @@ -10,11 +10,30 @@ import org.broadinstitute.sting.utils.GenomeLoc; * To change this template use File | Settings | File Templates. */ final class ConsensusSpan { + + /** + * The type of an span is either conserved (little variability within the span) or + * variable (too many differences among the reads in the span to determine the exact + * haplotype sequence). + */ + public enum Type { + CONSERVED, VARIABLE; + + public static Type otherType(Type t) { + switch ( t ) { + case CONSERVED: return VARIABLE; + case VARIABLE: return CONSERVED; + } + return CONSERVED; + } + } + + final int refStart; // the start position on the reference for relative calculations final GenomeLoc loc; - final ConsensusType consensusType; + final Type consensusType; - public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusType consensusType) { + public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusSpan.Type consensusType) { this.refStart = refStart; this.loc = loc; this.consensusType = consensusType; @@ -32,7 +51,7 @@ final class ConsensusSpan { return loc.getStop(); } - public ConsensusType getConsensusType() { + public ConsensusSpan.Type getConsensusType() { return consensusType; } @@ -40,8 +59,8 @@ final class ConsensusSpan { return getGenomeStop() - getGenomeStart() + 1; } - public boolean isConserved() { return getConsensusType() == ConsensusType.CONSERVED; } - public boolean isVariable() { return getConsensusType() == ConsensusType.VARIABLE; } + public boolean isConserved() { return getConsensusType() == Type.CONSERVED; } + public boolean isVariable() { return getConsensusType() == Type.VARIABLE; } public String toString() { return String.format("%s %s", consensusType, loc); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java deleted file mode 100644 index 0e69e7c5d..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java +++ /dev/null @@ -1,20 +0,0 @@ -package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; - -/** -* Created by IntelliJ IDEA. -* User: depristo -* Date: 4/9/11 -* Time: 7:52 PM -* To change this template use File | Settings | File Templates. -*/ -public enum ConsensusType { - CONSERVED, VARIABLE; - - public static ConsensusType otherType(ConsensusType t) { - switch ( t ) { - case CONSERVED: return VARIABLE; - case VARIABLE: return CONSERVED; - } - return CONSERVED; - } -} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java index bb3540906..ab87290e2 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -49,17 +50,26 @@ public class MultiSampleConsensusReadCompressor implements ConsensusReadCompress public MultiSampleConsensusReadCompressor(SAMFileHeader header, final int readContextSize, final GenomeLocParser glParser, - final String contig, final int minBpForRunningConsensus, final int maxReadsAtVariableSites) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, - new SingleSampleConsensusReadCompressor(readContextSize, - glParser, contig, minBpForRunningConsensus, maxReadsAtVariableSites)); + new SingleSampleConsensusReadCompressor(name, readContextSize, + glParser, minBpForRunningConsensus, maxReadsAtVariableSites)); // todo -- argument for minConsensusSize } } + public Collection getReducedReadGroups() { + List rgs = new ArrayList(); + + for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) { + rgs.add(comp.getReducedReadGroup()); + } + + return rgs; + } + @Override public Iterable addAlignment(SAMRecord read) { String sample = read.getReadGroup().getSample(); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java index e0984988a..a1610cc52 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; @@ -69,10 +70,20 @@ public class ReduceReadsWalker extends ReadWalker addAlignment( SAMRecord read ) { + if ( contig == null ) + contig = read.getReferenceName(); + if ( ! read.getReferenceName().equals(contig) ) + throw new ReviewedStingException("ConsensusRead system doesn't support multiple contig processing right now"); + if ( header == null ) header = read.getHeader(); @@ -200,7 +225,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres private List expandVariableSites(List sites) { for ( ConsensusSite site : sites ) - site.setMarkedType(ConsensusType.CONSERVED); + site.setMarkedType(ConsensusSpan.Type.CONSERVED); for ( int i = 0; i < sites.size(); i++ ) { ConsensusSite site = sites.get(i); @@ -209,7 +234,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres int stop = Math.min(sites.size(), i + readContextSize + 1); for ( int j = start; j < stop; j++ ) { // aggressive tagging -- you are only conserved if you are never variable - sites.get(j).setMarkedType(ConsensusType.VARIABLE); + sites.get(j).setMarkedType(ConsensusSpan.Type.VARIABLE); } } } @@ -223,7 +248,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres int start = 0; // our first span type is the type of the first site - ConsensusType consensusType = sites.get(0).getMarkedType(); + ConsensusSpan.Type consensusType = sites.get(0).getMarkedType(); while ( start < sites.size() ) { ConsensusSpan span = findSpan(sites, start, consensusType); @@ -234,20 +259,20 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres start += span.size(); } - consensusType = ConsensusType.otherType(consensusType); + consensusType = ConsensusSpan.Type.otherType(consensusType); } return spans; } - private ConsensusSpan findSpan(List sites, int start, ConsensusType consensusType) { + private ConsensusSpan findSpan(List sites, int start, ConsensusSpan.Type consensusType) { int refStart = sites.get(0).getLoc().getStart(); for ( int end = start + 1; end < sites.size(); end++ ) { ConsensusSite site = sites.get(end); - boolean conserved = site.getMarkedType() == ConsensusType.CONSERVED; - if ( (consensusType == ConsensusType.CONSERVED && ! conserved) || - (consensusType == ConsensusType.VARIABLE && conserved) || + boolean conserved = site.getMarkedType() == ConsensusSpan.Type.CONSERVED; + if ( (consensusType == ConsensusSpan.Type.CONSERVED && ! conserved) || + (consensusType == ConsensusSpan.Type.VARIABLE && conserved) || end + 1 == sites.size() ) { // we are done with the complete interval GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1); return new ConsensusSpan(refStart, loc, consensusType); @@ -319,7 +344,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres for ( int i = 0; i < span.size(); i++ ) { int refI = i + span.getOffsetFromStartOfSites(); ConsensusSite site = sites.get(refI); - if ( site.getMarkedType() == ConsensusType.VARIABLE ) + if ( site.getMarkedType() == ConsensusSpan.Type.VARIABLE ) throw new ReviewedStingException("Variable site included in consensus: " + site); final int count = site.counts.countOfMostCommonBase(); final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts(); @@ -328,10 +353,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres } SAMRecord consensus = new SAMRecord(header); + consensus.setAttribute("RG", reducedReadGroup.getId()); + consensus.setAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG, Integer.valueOf(REDUCED_READ_BASE_QUALITY)); consensus.setReferenceName(contig); - consensus.setReadName("Mark"); - consensus.setCigarString(String.format("%dM", span.size())); + consensus.setReadName(String.format("%s.read.%d", reducedReadGroup.getId(), consensusCounter++)); consensus.setReadPairedFlag(false); + consensus.setReadUnmappedFlag(false); + consensus.setCigarString(String.format("%dM", span.size())); consensus.setAlignmentStart(span.getGenomeStart()); consensus.setReadBases(bases); consensus.setBaseQualities(quals); @@ -347,15 +375,13 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres private Collection variableSpanReads(List sites, ConsensusSpan span) { Set reads = new HashSet(); - int minNumBasesInRead = (int)Math.floor(MIN_FRACT_BASES_FOR_VARIABLE_READ * span.size()); + // todo -- this code is grossly inefficient, as it checks each variable read at each site in the span for ( int i = 0; i < span.size(); i++ ) { int refI = i + span.getOffsetFromStartOfSites(); ConsensusSite site = sites.get(refI); for ( PileupElement p : site.getOverlappingReads() ) { -// if ( reads.contains(p.getRead())) -// logger.info("Skipping already added read: " + p.getRead().getReadName()); SAMRecord read = clipReadToSpan(p.getRead(), span); - if ( read.getReadLength() >= minNumBasesInRead ) + if ( keepClippedReadInVariableSpan(p.getRead(), read) ) reads.add(read); } } @@ -363,6 +389,15 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres return reads; } + private final static boolean keepClippedReadInVariableSpan(SAMRecord originalRead, SAMRecord variableRead) { + int originalReadLength = originalRead.getReadLength(); + int variableReadLength = variableRead.getReadLength(); + + return variableReadLength >= MIN_BASES_IN_VARIABLE_SPAN_TO_INCLUDE_READ; +// && +// ((1.0 * variableReadLength) / originalReadLength) >= MIN_FRACT_BASES_FOR_VARIABLE_READ; + } + private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) { ReadClipper clipper = new ReadClipper(read); int spanStart = span.getGenomeStart(); @@ -371,7 +406,7 @@ public class SingleSampleConsensusReadCompressor implements ConsensusReadCompres for ( RefPileupElement p : RefPileupElement.walkRead(read) ) { if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) { - clipper.addOp(new ClippingOp(0, p.getOffset())); + clipper.addOp(new ClippingOp(0, p.getOffset() - 1)); } if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) { diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 2394e34dc..080762039 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -41,6 +41,8 @@ import java.io.File; * @version 0.1 */ public class ReadUtils { + public static final String REDUCED_READ_QUALITY_TAG = "RQ"; + private ReadUtils() { } public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) {