Refactored read clipping framework into a generic utilities class, independent of ClipReadsWalker, which now uses this framework. Some more cleanup is really needed, as some of the arguments to the classes are really only useful for ClipReads
ReduceReadsWalker -- does consensus-based read compression, v2. Does all of the consensus calculations within the ConsensusReadCompressor per sample, and multi-sample case is handled by MultiSampleConsensusReadCompressor. For deeply covered data sets, this projects a significant reduction in the number of mapped reads. Impact on analysis call quality tbd. Expected to be relatively minor, as the system automatically detects regions without a strong consensus, and expands a window around these so that +/- 10bp of all reads are shown around the unclear sites. Not usable yet -- as it does not yet support streaming output, and actually holds all reads in memory at once. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5610 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
13c5f3322d
commit
9c36b0a39b
|
|
@ -33,11 +33,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingOp;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
||||
import java.nio.channels.IllegalSelectorException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
|
@ -51,7 +53,7 @@ import net.sf.samtools.util.StringUtil;
|
|||
* with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
|
||||
*/
|
||||
@Requires({DataSource.READS})
|
||||
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, ClipReadsWalker.ClippingData> {
|
||||
public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.ClippingData> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
|
|
@ -215,7 +217,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
if (found) {
|
||||
int start = match.start();
|
||||
int stop = match.end() - 1;
|
||||
ClippingOp op = new ClippingOp(ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq);
|
||||
ClippingOp op = new ClippingOp(ClippingOp.ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq);
|
||||
clipper.addOp(op);
|
||||
}
|
||||
}
|
||||
|
|
@ -262,7 +264,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
int start = startStop.first;
|
||||
int stop = startStop.second;
|
||||
|
||||
ClippingOp op = new ClippingOp(ClippingType.WITHIN_CLIP_RANGE, start, stop, null);
|
||||
ClippingOp op = new ClippingOp(ClippingOp.ClippingType.WITHIN_CLIP_RANGE, start, stop, null);
|
||||
clipper.addOp(op);
|
||||
}
|
||||
}
|
||||
|
|
@ -303,7 +305,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
if (clipPoint != -1) {
|
||||
int start = read.getReadNegativeStrandFlag() ? 0 : clipPoint;
|
||||
int stop = read.getReadNegativeStrandFlag() ? clipPoint : readLen - 1;
|
||||
clipper.addOp(new ClippingOp(ClippingType.LOW_Q_SCORES, start, stop, null));
|
||||
clipper.addOp(new ClippingOp(ClippingOp.ClippingType.LOW_Q_SCORES, start, stop, null));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -375,192 +377,6 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* What is the type of a ClippingOp?
|
||||
*/
|
||||
private enum ClippingType {
|
||||
LOW_Q_SCORES,
|
||||
WITHIN_CLIP_RANGE,
|
||||
MATCHES_CLIP_SEQ
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases
|
||||
* of the read, plus an option extraInfo (useful for carrying info where needed).
|
||||
* <p/>
|
||||
* Also holds the critical apply function that actually execute the clipping operation on a provided read,
|
||||
* according to the wishes of the supplid ClippingAlgorithm enum.
|
||||
*/
|
||||
private class ClippingOp {
|
||||
public ClippingType type;
|
||||
public int start, stop; // inclusive
|
||||
public Object extraInfo = null;
|
||||
|
||||
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
|
||||
this.type = type;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.extraInfo = extraInfo;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return stop - start + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping
|
||||
* representation used is the one provided by algorithm argument.
|
||||
*
|
||||
* @param algorithm
|
||||
* @param clippedRead
|
||||
*/
|
||||
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
||||
//clippedRead.setReferenceIndex(1);
|
||||
byte[] quals = clippedRead.getBaseQualities();
|
||||
byte[] bases = clippedRead.getReadBases();
|
||||
|
||||
switch (algorithm) {
|
||||
// important note:
|
||||
// it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0
|
||||
// because you're not guaranteed to get a pointer to the actual array of bytes in the SAMRecord
|
||||
case WRITE_NS:
|
||||
for (int i = start; i <= stop; i++)
|
||||
bases[i] = 'N';
|
||||
clippedRead.setReadBases(bases);
|
||||
break;
|
||||
case WRITE_Q0S:
|
||||
for (int i = start; i <= stop; i++)
|
||||
quals[i] = 0;
|
||||
clippedRead.setBaseQualities(quals);
|
||||
break;
|
||||
case WRITE_NS_Q0S:
|
||||
for (int i = start; i <= stop; i++) {
|
||||
bases[i] = 'N';
|
||||
quals[i] = 0;
|
||||
}
|
||||
clippedRead.setReadBases(bases);
|
||||
clippedRead.setBaseQualities(quals);
|
||||
break;
|
||||
case SOFTCLIP_BASES:
|
||||
if ( ! clippedRead.getReadUnmappedFlag() ) {
|
||||
// we can't process unmapped reads
|
||||
|
||||
//System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength());
|
||||
if ( (stop + 1 - start) == clippedRead.getReadLength() ) {
|
||||
// BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone
|
||||
logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName()));
|
||||
break;
|
||||
}
|
||||
|
||||
if ( start > 0 && stop != clippedRead.getReadLength() - 1 )
|
||||
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
|
||||
clippedRead.getReadName(), start, stop));
|
||||
|
||||
Cigar oldCigar = clippedRead.getCigar();
|
||||
|
||||
int scLeft = 0, scRight = clippedRead.getReadLength();
|
||||
if ( clippedRead.getReadNegativeStrandFlag() ) {
|
||||
if ( start == 0 )
|
||||
scLeft = stop + 1;
|
||||
else
|
||||
scRight = start + 1;
|
||||
} else {
|
||||
if ( start == 0 )
|
||||
scLeft = stop;
|
||||
else
|
||||
scRight = start;
|
||||
}
|
||||
|
||||
Cigar newCigar = _softClip(oldCigar, scLeft, scRight);
|
||||
clippedRead.setCigar(newCigar);
|
||||
|
||||
int newClippedStart = _getNewAlignmentStartOffset(newCigar, oldCigar);
|
||||
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
|
||||
clippedRead.setAlignmentStart(newStart);
|
||||
|
||||
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
|
||||
}
|
||||
|
||||
break;
|
||||
//throw new RuntimeException("Softclipping of bases not yet implemented.");
|
||||
default:
|
||||
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* How should we represent a clipped bases in a read?
|
||||
*/
|
||||
public enum ClippingRepresentation {
|
||||
WRITE_NS, // change the bases to Ns
|
||||
WRITE_Q0S, // change the quality scores to Q0
|
||||
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
||||
SOFTCLIP_BASES // change cigar string to S, but keep bases
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple collection of the clipping operations to apply to a read along with its read
|
||||
*/
|
||||
static public class ReadClipper {
|
||||
SAMRecord read;
|
||||
List<ClippingOp> ops = null;
|
||||
|
||||
/**
|
||||
* We didn't do any clipping work on this read, just leave everything as a default
|
||||
*
|
||||
* @param read
|
||||
*/
|
||||
public ReadClipper(final SAMRecord read) {
|
||||
this.read = read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add another clipping operation to apply to this read
|
||||
*
|
||||
* @param op
|
||||
*/
|
||||
public void addOp(ClippingOp op) {
|
||||
if (ops == null) ops = new ArrayList<ClippingOp>();
|
||||
ops.add(op);
|
||||
}
|
||||
|
||||
public List<ClippingOp> getOps() {
|
||||
return ops;
|
||||
}
|
||||
|
||||
public boolean wasClipped() {
|
||||
return ops != null;
|
||||
}
|
||||
|
||||
public SAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
||||
*
|
||||
* @param algorithm
|
||||
* @return
|
||||
*/
|
||||
public SAMRecord clipRead(ClippingRepresentation algorithm) {
|
||||
if (ops == null)
|
||||
return getRead();
|
||||
else {
|
||||
try {
|
||||
SAMRecord clippedRead = (SAMRecord) read.clone();
|
||||
for (ClippingOp op : getOps()) {
|
||||
op.apply(algorithm, clippedRead);
|
||||
}
|
||||
return clippedRead;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e); // this should never happen
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class ClippingData {
|
||||
public long nTotalReads = 0;
|
||||
public long nTotalBases = 0;
|
||||
|
|
@ -616,140 +432,4 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
return s.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar string, get the number of bases hard or soft clipped at the start
|
||||
*/
|
||||
private int _getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
|
||||
int num = 0;
|
||||
for (CigarElement e : __cigar.getCigarElements()) {
|
||||
if (!e.getOperator().consumesReferenceBases()) {
|
||||
if (e.getOperator().consumesReadBases()) {
|
||||
num += e.getLength();
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int oldNum = 0;
|
||||
int curReadCounter = 0;
|
||||
|
||||
for (CigarElement e : __oldCigar.getCigarElements()) {
|
||||
int curRefLength = e.getLength();
|
||||
int curReadLength = e.getLength();
|
||||
if (!e.getOperator().consumesReadBases()) {
|
||||
curReadLength = 0;
|
||||
}
|
||||
|
||||
boolean truncated = false;
|
||||
if (curReadCounter + curReadLength > num) {
|
||||
curReadLength = num - curReadCounter;
|
||||
curRefLength = num - curReadCounter;
|
||||
truncated = true;
|
||||
}
|
||||
|
||||
if (!e.getOperator().consumesReferenceBases()) {
|
||||
curRefLength = 0;
|
||||
}
|
||||
|
||||
curReadCounter += curReadLength;
|
||||
oldNum += curRefLength;
|
||||
|
||||
if (curReadCounter > num || truncated) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return oldNum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin
|
||||
*/
|
||||
private Cigar _softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) {
|
||||
if (__endClipBegin <= __startClipEnd) {
|
||||
//whole thing should be soft clipped
|
||||
int cigarLength = 0;
|
||||
for (CigarElement e : __cigar.getCigarElements()) {
|
||||
cigarLength += e.getLength();
|
||||
}
|
||||
|
||||
Cigar newCigar = new Cigar();
|
||||
newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP));
|
||||
assert newCigar.isValid(null, -1) == null;
|
||||
return newCigar;
|
||||
}
|
||||
|
||||
int curLength = 0;
|
||||
Vector<CigarElement> newElements = new Vector<CigarElement>();
|
||||
for (CigarElement curElem : __cigar.getCigarElements()) {
|
||||
if (!curElem.getOperator().consumesReadBases()) {
|
||||
if (curLength > __startClipEnd && curLength < __endClipBegin) {
|
||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int s = curLength;
|
||||
int e = curLength + curElem.getLength();
|
||||
if (e <= __startClipEnd || s >= __endClipBegin) {
|
||||
//must turn this entire thing into a clip
|
||||
newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP));
|
||||
} else if (s >= __startClipEnd && e <= __endClipBegin) {
|
||||
//same thing
|
||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||
} else {
|
||||
//we are clipping in the middle of this guy
|
||||
CigarElement newStart = null;
|
||||
CigarElement newMid = null;
|
||||
CigarElement newEnd = null;
|
||||
|
||||
int midLength = curElem.getLength();
|
||||
if (s < __startClipEnd) {
|
||||
newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP);
|
||||
midLength -= newStart.getLength();
|
||||
}
|
||||
|
||||
if (e > __endClipBegin) {
|
||||
newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP);
|
||||
midLength -= newEnd.getLength();
|
||||
}
|
||||
assert midLength >= 0;
|
||||
if (midLength > 0) {
|
||||
newMid = new CigarElement(midLength, curElem.getOperator());
|
||||
}
|
||||
if (newStart != null) {
|
||||
newElements.add(newStart);
|
||||
}
|
||||
if (newMid != null) {
|
||||
newElements.add(newMid);
|
||||
}
|
||||
if (newEnd != null) {
|
||||
newElements.add(newEnd);
|
||||
}
|
||||
}
|
||||
curLength += curElem.getLength();
|
||||
}
|
||||
|
||||
Vector<CigarElement> finalNewElements = new Vector<CigarElement>();
|
||||
CigarElement lastElement = null;
|
||||
for (CigarElement elem : newElements) {
|
||||
if (lastElement == null || lastElement.getOperator() != elem.getOperator()) {
|
||||
if (lastElement != null) {
|
||||
finalNewElements.add(lastElement);
|
||||
}
|
||||
lastElement = elem;
|
||||
} else {
|
||||
lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator());
|
||||
}
|
||||
}
|
||||
if (lastElement != null) {
|
||||
finalNewElements.add(lastElement);
|
||||
}
|
||||
|
||||
Cigar newCigar = new Cigar(finalNewElements);
|
||||
assert newCigar.isValid(null, -1) == null;
|
||||
return newCigar;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 2:55 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
final class BaseCounts {
|
||||
int counts[] = new int[4]; // fixme -- include - and I events
|
||||
|
||||
public void incr(byte base) {
|
||||
int baseI = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( baseI >= 0 ) // no Ns
|
||||
counts[baseI]++;
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return BaseUtils.baseIndexToSimpleBase(maxBaseIndex());
|
||||
}
|
||||
|
||||
public int countOfMostCommonBase() {
|
||||
return counts[maxBaseIndex()];
|
||||
}
|
||||
|
||||
public int totalCounts() {
|
||||
int sum = 0;
|
||||
|
||||
for ( int c : counts ) {
|
||||
sum += c;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
private int maxBaseIndex() {
|
||||
int maxI = 0;
|
||||
for ( int i = 0; i < counts.length; i++) {
|
||||
if ( counts[i] > counts[maxI] ) {
|
||||
maxI = i;
|
||||
}
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for ( int i = 0; i < counts.length; i++ ) {
|
||||
b.append((char)BaseUtils.baseIndexToSimpleBase(i)).append("=").append(counts[i]).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/10/11
|
||||
* Time: 8:49 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public interface ConsensusReadCompressor extends Iterable<SAMRecord> {
|
||||
void addAlignment(SAMRecord read);
|
||||
|
||||
Collection<SAMRecord> consensusReads();
|
||||
|
||||
@Override
|
||||
Iterator<SAMRecord> iterator();
|
||||
}
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 3:01 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
final class ConsensusSite {
|
||||
final GenomeLoc loc;
|
||||
final Set<PileupElement> overlappingReads = new HashSet<PileupElement>();
|
||||
final int offset;
|
||||
final BaseCounts counts = new BaseCounts();
|
||||
|
||||
ConsensusType markedType = null;
|
||||
|
||||
public ConsensusSite(GenomeLoc loc, int offset) {
|
||||
this.loc = loc;
|
||||
|
||||
this.offset = offset;
|
||||
|
||||
}
|
||||
|
||||
public GenomeLoc getLoc() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public Set<PileupElement> getOverlappingReads() {
|
||||
return overlappingReads;
|
||||
}
|
||||
|
||||
public void addOverlappingRead(PileupElement elt) {
|
||||
overlappingReads.add(elt);
|
||||
counts.incr(elt.getBase());
|
||||
}
|
||||
|
||||
public boolean isStrongConsensus(final double maxFractionDisagreeingBases) {
|
||||
int mostCommon = counts.countOfMostCommonBase();
|
||||
int total = counts.totalCounts();
|
||||
double fractionCommonBase = (1.0 * mostCommon) / total;
|
||||
return (1 - fractionCommonBase) < maxFractionDisagreeingBases;
|
||||
}
|
||||
|
||||
public final static class ConsensusBase {
|
||||
byte base, qual;
|
||||
|
||||
public byte getBase() {
|
||||
return base;
|
||||
}
|
||||
|
||||
public byte getQual() {
|
||||
return qual;
|
||||
}
|
||||
|
||||
public ConsensusBase(byte base, byte qual) {
|
||||
this.base = base;
|
||||
this.qual = qual;
|
||||
}
|
||||
}
|
||||
|
||||
public ConsensusBase getConsensus() {
|
||||
byte base = counts.baseWithMostCounts();
|
||||
int qual = 0;
|
||||
|
||||
for ( PileupElement p : overlappingReads ) {
|
||||
if ( p.getBase() == base )
|
||||
qual++;
|
||||
}
|
||||
|
||||
return new ConsensusBase(base, QualityUtils.boundQual(qual, (byte)64));
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return counts.toString();
|
||||
}
|
||||
|
||||
public void setMarkedType(ConsensusType markedType) {
|
||||
this.markedType = markedType;
|
||||
}
|
||||
|
||||
public ConsensusType getMarkedType() {
|
||||
if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!");
|
||||
return markedType;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 3:01 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
final class ConsensusSpan {
|
||||
final int refStart; // the start position on the reference for relative calculations
|
||||
final GenomeLoc loc;
|
||||
final ConsensusType consensusType;
|
||||
|
||||
public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusType consensusType) {
|
||||
this.refStart = refStart;
|
||||
this.loc = loc;
|
||||
this.consensusType = consensusType;
|
||||
}
|
||||
|
||||
public int getOffsetFromStartOfSites() {
|
||||
return loc.getStart() - refStart;
|
||||
}
|
||||
|
||||
public int getGenomeStart() {
|
||||
return loc.getStart();
|
||||
}
|
||||
|
||||
public int getGenomeStop() {
|
||||
return loc.getStop();
|
||||
}
|
||||
|
||||
public ConsensusType getConsensusType() {
|
||||
return consensusType;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return getGenomeStop() - getGenomeStart() + 1;
|
||||
}
|
||||
|
||||
public boolean isConserved() { return getConsensusType() == ConsensusType.CONSERVED; }
|
||||
public boolean isVariable() { return getConsensusType() == ConsensusType.VARIABLE; }
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s %s", consensusType, loc);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/9/11
|
||||
* Time: 7:52 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public enum ConsensusType {
|
||||
CONSERVED, VARIABLE;
|
||||
|
||||
public static ConsensusType otherType(ConsensusType t) {
|
||||
switch ( t ) {
|
||||
case CONSERVED: return VARIABLE;
|
||||
case VARIABLE: return CONSERVED;
|
||||
}
|
||||
return CONSERVED;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
//import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
* @version 0.1
|
||||
*/
|
||||
public class MultiSampleConsensusReadCompressor implements ConsensusReadCompressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleConsensusReadCompressor.class);
|
||||
|
||||
Map<String, SingleSampleConsensusReadCompressor> compressorsPerSample = new HashMap<String, SingleSampleConsensusReadCompressor>();
|
||||
|
||||
public MultiSampleConsensusReadCompressor(SAMFileHeader header,
|
||||
final int readContextSize,
|
||||
final GenomeLocParser glParser,
|
||||
final String contig) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleConsensusReadCompressor(readContextSize, glParser, contig));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addAlignment(SAMRecord read) {
|
||||
String sample = read.getReadGroup().getSample();
|
||||
SingleSampleConsensusReadCompressor compressor = compressorsPerSample.get(sample);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sample);
|
||||
compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<SAMRecord> consensusReads() {
|
||||
List<SAMRecord> reads = new LinkedList<SAMRecord>();
|
||||
for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() )
|
||||
reads.addAll(comp.consensusReads());
|
||||
return reads;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return consensusReads().iterator();
|
||||
}
|
||||
}
|
||||
|
|
@ -25,53 +25,50 @@
|
|||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: April 7, 2011
|
||||
*/
|
||||
public class ReduceReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
public class ReduceReadsWalker extends ReadWalker<SAMRecord, Collection<SAMRecord>> {
|
||||
@Output
|
||||
protected StingSAMFileWriter out;
|
||||
|
||||
@Argument(fullName = "SNPContextSize", shortName = "SCS", doc = "", required = true)
|
||||
protected int SNPContextSize;
|
||||
@Output(fullName="bedOut", shortName = "bedOut", doc="BED output", required = false)
|
||||
protected PrintStream bedOut = null;
|
||||
|
||||
@Argument(fullName = "IndelContextSize", shortName = "ICS", doc = "", required = true)
|
||||
protected int IndelContextSize;
|
||||
@Argument(fullName = "contextSize", shortName = "CS", doc = "", required = false)
|
||||
protected int contextSize = 10;
|
||||
|
||||
@Argument(fullName = "INCLUDE_RAW_READS", shortName = "IRR", doc = "", required = false)
|
||||
protected boolean INCLUDE_RAW_READS = false;
|
||||
|
||||
@Argument(fullName = "useRead", shortName = "UR", doc = "", required = false)
|
||||
protected Set<String> readNamesToUse;
|
||||
|
||||
protected ReducingSAMFileWriter reducingOut;
|
||||
protected int totalReads = 0;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
reducingOut = new ReducingSAMFileWriter(out, SNPContextSize, IndelContextSize);
|
||||
super.initialize();
|
||||
out.setPresorted(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
for ( GATKFeature feature : metaDataTracker.getAllCoveringRods() ) {
|
||||
if ( feature.getUnderlyingObject() instanceof VariantContext ) {
|
||||
VariantContext vc = (VariantContext)feature.getUnderlyingObject();
|
||||
reducingOut.addVariant(vc);
|
||||
}
|
||||
}
|
||||
|
||||
totalReads++;
|
||||
return read; // all the work is done in the reduce step for this walker
|
||||
}
|
||||
|
|
@ -83,26 +80,45 @@ public class ReduceReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
|||
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
||||
*/
|
||||
@Override
|
||||
public SAMFileWriter reduceInit() {
|
||||
return reducingOut;
|
||||
public Collection<SAMRecord> reduceInit() {
|
||||
return new ArrayList<SAMRecord>();
|
||||
}
|
||||
|
||||
/**
|
||||
* given a read and a output location, reduce by emitting the read
|
||||
* @param read the read itself
|
||||
* @param output the output source
|
||||
* @return the SAMFileWriter, so that the next reduce can emit to the same source
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
output.addAlignment(read);
|
||||
return output;
|
||||
public Collection<SAMRecord> reduce( SAMRecord read, Collection<SAMRecord> reads ) {
|
||||
if ( readNamesToUse == null || readNamesToUse.contains(read.getReadName()) )
|
||||
reads.add(read);
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void onTraversalDone( SAMFileWriter reduceResult ) {
|
||||
logger.info("Compressed reads: " + reducingOut.getNCompressedReads());
|
||||
logger.info("Total reads : " + totalReads);
|
||||
// todo -- fixme
|
||||
//reducingOut.close();
|
||||
@Override
|
||||
public void onTraversalDone( Collection<SAMRecord> reads ) {
|
||||
String contig = reads.iterator().next().getReferenceName();
|
||||
ConsensusReadCompressor compressor =
|
||||
new MultiSampleConsensusReadCompressor(getToolkit().getSAMFileHeader(),
|
||||
contextSize, getToolkit().getGenomeLocParser(), contig);
|
||||
|
||||
// add all of the reads to the compressor
|
||||
for ( SAMRecord read : reads ) {
|
||||
if ( INCLUDE_RAW_READS )
|
||||
out.addAlignment(read);
|
||||
compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
// compress the reads
|
||||
//compressor.writeConsensusBed(bedOut);
|
||||
int nCompressedReads = 0;
|
||||
for ( SAMRecord read : compressor ) {
|
||||
out.addAlignment(read);
|
||||
nCompressedReads++;
|
||||
}
|
||||
|
||||
logger.info("Compressed reads : " + nCompressedReads);
|
||||
logger.info("Total reads : " + totalReads);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 8:54:05 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class RefPileupElement extends PileupElement {
|
||||
final int refOffset;
|
||||
|
||||
public RefPileupElement(SAMRecord read, int offset, int refOffset) {
|
||||
super(read, offset);
|
||||
this.refOffset = refOffset;
|
||||
}
|
||||
|
||||
public int getRefOffset() {
|
||||
return refOffset;
|
||||
}
|
||||
|
||||
public static Iterable<RefPileupElement> walkRead(SAMRecord read) {
|
||||
return walkRead(read, 0);
|
||||
}
|
||||
|
||||
public static Iterable<RefPileupElement> walkRead(final SAMRecord read, final int refIStart) {
|
||||
return new Iterable<RefPileupElement>() {
|
||||
public Iterator<RefPileupElement> iterator() {
|
||||
List<RefPileupElement> elts = new ArrayList<RefPileupElement>();
|
||||
|
||||
int readI = 0, refI = read.getAlignmentStart() - refIStart;
|
||||
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||
int l = elt.getLength();
|
||||
switch (elt.getOperator()) {
|
||||
case N: // cannot handle these
|
||||
break;
|
||||
case H : case P : // ignore pads and hard clips
|
||||
break;
|
||||
case S :
|
||||
//refI += l; // move the reference too, in addition to I
|
||||
readI += l;
|
||||
break;
|
||||
case I :
|
||||
for ( int i = 0; i < l; i++)
|
||||
elts.add(new RefPileupElement(read, readI++, refI));
|
||||
break;
|
||||
case D :
|
||||
for ( int i = 0; i < l; i++)
|
||||
elts.add(new RefPileupElement(read, -1, refI++));
|
||||
break;
|
||||
case M :
|
||||
for ( int i = 0; i < l; i++)
|
||||
elts.add(new RefPileupElement(read, readI++, refI++));
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
return elts.iterator();
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,294 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.reducereads;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingOp;
|
||||
import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
//import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
 * Consensus-based read compressor for a single sample.
 *
 * Buffers all incoming reads, partitions the covered territory into CONSERVED spans
 * (where a strong base consensus exists) and VARIABLE spans (weak-consensus sites plus
 * readContextSize bases of context on each side), then emits one synthetic consensus
 * read per conserved span and the downsampled, span-clipped original reads for each
 * variable span.
 *
 * NOTE(review): all reads are held in memory at once; streaming output is not yet supported.
 *
 * @author depristo
 * @version 0.1
 */
public class SingleSampleConsensusReadCompressor implements ConsensusReadCompressor {
    protected static final Logger logger = Logger.getLogger(SingleSampleConsensusReadCompressor.class);
    private static final boolean DEBUG = false;                 // currently unused
    private static final boolean INVERT = false;                // currently unused
    private static final boolean PRINT_CONSENSUS_READS = false; // currently unused
    // a site is a strong consensus when at most this fraction of its bases disagree with the majority base
    private static final double MAX_FRACTION_DISAGREEING_BASES = 0.1;
    // reads overlapping variable spans are clipped to the span using soft clips
    private static final ClippingRepresentation VARIABLE_READ_REPRESENTATION = ClippingRepresentation.SOFTCLIP_BASES;

    /** Reads accepted via addAlignment() and not yet compressed */
    Queue<SAMRecord> waitingReads = new LinkedList<SAMRecord>();

    final int readContextSize;              // bases of context kept around each weak-consensus site
    final int maxReadsAtVariableSites = 50; // downsampling cap for raw reads in a variable span
    long nCompressedReads = 0;

    final String contig;                    // all reads are assumed to map to this single contig
    final GenomeLocParser glParser;
    SAMFileHeader header;                   // lazily captured from the first read seen

    // todo -- require minimum read size of 2 bp in variable region

    /**
     * @param readContextSize number of context bases kept around each variable site
     * @param glParser        used to build GenomeLocs for sites and spans
     * @param contig          the single contig this compressor operates over
     */
    public SingleSampleConsensusReadCompressor(final int readContextSize,
                                               final GenomeLocParser glParser,
                                               final String contig) {
        this.readContextSize = readContextSize;
        this.glParser = glParser;
        this.contig = contig;
    }

    /**
     * {@inheritDoc}
     *
     * Duplicates, secondary alignments, and unmapped reads are silently dropped.
     */
    public void addAlignment( SAMRecord read ) {
        if ( header == null )
            header = read.getHeader();

        if ( ! read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() )
            waitingReads.add(read);
    }

    /** Write one BED record (0-based start) per consensus site, with its base counts. */
    public void writeConsensusBed(PrintStream bedOut) {
        for ( ConsensusSite site : calculateConsensusSites(waitingReads) ) {
            GenomeLoc loc = site.getLoc();
            bedOut.printf("%s\t%d\t%d\t%s%n", loc.getContig(), loc.getStart()-1, loc.getStop(), site.counts);
        }
    }

    @Override
    public Iterator<SAMRecord> iterator() {
        return consensusReads().iterator();
    }

    /** Compute and return the compressed representation of all waiting reads. */
    public Collection<SAMRecord> consensusReads() {
        if ( ! waitingReads.isEmpty() ) {
            List<ConsensusSite> sites = calculateConsensusSites(waitingReads);
            List<ConsensusSpan> spans = calculateSpans(sites);
            return consensusReadsFromSitesAndSpans(sites, spans);
        } else {
            return Collections.EMPTY_LIST;
        }
    }

    /**
     * Mark every site CONSERVED, then re-mark as VARIABLE each weak-consensus site and
     * the readContextSize sites on either side of it.
     *
     * @return the same list, mutated in place
     */
    private List<ConsensusSite> expandVariableSites(List<ConsensusSite> sites) {
        for ( ConsensusSite site : sites )
            site.setMarkedType(ConsensusType.CONSERVED);

        for ( int i = 0; i < sites.size(); i++ ) {
            ConsensusSite site = sites.get(i);
            if ( ! site.isStrongConsensus(MAX_FRACTION_DISAGREEING_BASES) ) {
                int start = Math.max(i - readContextSize, 0);
                int stop = Math.min(sites.size(), i + readContextSize + 1);
                for ( int j = start; j < stop; j++ ) {
                    // aggressive tagging -- you are only conserved if you are never variable
                    sites.get(j).setMarkedType(ConsensusType.VARIABLE);
                }
            }
        }

        return sites;
    }

    /**
     * Partition the (marked) sites into a left-to-right list of alternating
     * CONSERVED/VARIABLE spans.
     */
    private List<ConsensusSpan> calculateSpans(List<ConsensusSite> rawSites) {
        List<ConsensusSite> sites = expandVariableSites(rawSites);
        List<ConsensusSpan> spans = new ArrayList<ConsensusSpan>();
        int start = 0;

        // our first span type is the type of the first site
        ConsensusType consensusType = sites.get(0).getMarkedType();
        while ( start < sites.size() ) {
            ConsensusSpan span = findSpan(sites, start, consensusType);

            if ( span == null ) // we are done
                return spans;
            else {
                spans.add(span);
                start += span.size();
            }

            // spans alternate in type by construction
            consensusType = ConsensusType.otherType(consensusType);
        }

        return spans;
    }

    /**
     * Find the maximal run of sites of consensusType starting at index start, returned
     * as a ConsensusSpan in genome coordinates, or null when no span can be formed.
     *
     * NOTE(review): the scan begins at start + 1, so a trailing span consisting of only
     * the final site returns null -- confirm this termination behavior is intended.
     */
    private ConsensusSpan findSpan(List<ConsensusSite> sites, int start, ConsensusType consensusType) {
        int refStart = sites.get(0).getLoc().getStart(); // genome coordinate of site index 0

        for ( int end = start + 1; end < sites.size(); end++ ) {
            ConsensusSite site = sites.get(end);
            boolean conserved = site.getMarkedType() == ConsensusType.CONSERVED;
            if ( (consensusType == ConsensusType.CONSERVED && ! conserved) ||
                 (consensusType == ConsensusType.VARIABLE && conserved) ||
                 end + 1 == sites.size() ) { // we are done with the complete interval
                GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1);
                return new ConsensusSpan(refStart, loc, consensusType);
            }
        }

        return null; // couldn't find anything
    }


    /**
     * Build one ConsensusSite per reference position spanned by the reads (from the
     * first read's alignment start to the furthest alignment end) and add every
     * overlapping read base to its site.
     *
     * NOTE(review): assumes reads is non-empty and coordinate-sorted so the first read
     * has the leftmost alignment start -- verify against callers.
     */
    private List<ConsensusSite> calculateConsensusSites(Collection<SAMRecord> reads) {
        SAMRecord firstRead = reads.iterator().next();
        int refStart = firstRead.getAlignmentStart();
        int refEnd = furtherestEnd(reads);

        // set up the consensus site array
        List<ConsensusSite> consensusSites = new ArrayList<ConsensusSite>();
        int len = refEnd - refStart + 1;
        for ( int i = 0; i < len; i++ ) {
            int l = refStart + i;
            GenomeLoc loc = glParser.createGenomeLoc(contig, l, l);
            consensusSites.add(new ConsensusSite(loc, i));
        }

        for ( SAMRecord read : reads ) {
            // walking with refStart as origin yields offsets that index directly into consensusSites
            for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) {
                // add to the consensus at this site
                consensusSites.get(p.getRefOffset()).addOverlappingRead(p);
            }
        }

        return consensusSites;
    }

    /** Emit a synthetic consensus read per conserved span and downsampled raw reads per variable span. */
    private List<SAMRecord> consensusReadsFromSitesAndSpans(List<ConsensusSite> sites, List<ConsensusSpan> spans) {
        List<SAMRecord> reads = new ArrayList<SAMRecord>();

        for ( ConsensusSpan span : spans ) {
            logger.info("Span is " + span);
            if ( span.isConserved() )
                reads.addAll(conservedSpanReads(sites, span));
            else
                reads.addAll(downsample(variableSpanReads(sites, span)));
        }

        return reads;
    }

    /** Randomly downsample reads to at most maxReadsAtVariableSites elements. */
    private Collection<SAMRecord> downsample(Collection<SAMRecord> reads) {
        if ( reads.size() > maxReadsAtVariableSites ) {
            List<SAMRecord> readArray = new ArrayList<SAMRecord>(reads);
            Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator());
            return readArray.subList(0, maxReadsAtVariableSites);
        } else {
            return reads;
        }
    }

    /**
     * Build the single synthetic consensus read covering a conserved span: at each site,
     * base = most common observed base (N when coverage is zero) and qual = coverage
     * count bounded at 64.
     */
    private List<SAMRecord> conservedSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
        byte[] bases = new byte[span.size()];
        byte[] quals = new byte[span.size()];

        for ( int i = 0; i < span.size(); i++ ) {
            int refI = i + span.getOffsetFromStartOfSites();
            ConsensusSite site = sites.get(refI);
            if ( site.getMarkedType() == ConsensusType.VARIABLE )
                throw new ReviewedStingException("Variable site included in consensus: " + site);
            final int count = site.counts.countOfMostCommonBase();
            final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts();
            bases[i] = base;
            quals[i] = QualityUtils.boundQual(count, (byte)64);
        }

        SAMRecord consensus = new SAMRecord(header);
        consensus.setReferenceName(contig);
        consensus.setReadName("Mark"); // NOTE(review): placeholder read name -- consider something informative
        consensus.setCigarString(String.format("%dM", span.size()));
        consensus.setReadPairedFlag(false);
        consensus.setAlignmentStart(span.getGenomeStart());
        consensus.setReadBases(bases);
        consensus.setBaseQualities(quals);
        consensus.setMappingQuality(60);

        // if ( INVERT && PRINT_CONSENSUS_READS )
        //     for ( SAMRecord read : consensusReads )
        //         finalDestination.addAlignment(read);

        return Collections.singletonList(consensus);
    }

    /**
     * Collect the reads overlapping a variable span, each clipped down to the span.
     *
     * NOTE(review): clipReadToSpan returns a fresh clone whenever clipping occurs, so
     * the HashSet may not deduplicate a read reached via several sites -- verify.
     */
    private Collection<SAMRecord> variableSpanReads(List<ConsensusSite> sites, ConsensusSpan span) {
        Set<SAMRecord> reads = new HashSet<SAMRecord>();

        for ( int i = 0; i < span.size(); i++ ) {
            int refI = i + span.getOffsetFromStartOfSites();
            ConsensusSite site = sites.get(refI);
            for ( PileupElement p : site.getOverlappingReads() ) {
                // if ( reads.contains(p.getRead()))
                //     logger.info("Skipping already added read: " + p.getRead().getReadName());
                reads.add(clipReadToSpan(p.getRead(), span));
            }
        }

        return reads;
    }

    /** Clip the parts of read that fall outside span, using the soft-clip representation. */
    private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) {
        ReadClipper clipper = new ReadClipper(read);
        int spanStart = span.getGenomeStart();
        int spanEnd = span.getGenomeStop();
        int readLen = read.getReadLength();

        // walkRead with no origin yields reference offsets in genome coordinates
        for ( RefPileupElement p : RefPileupElement.walkRead(read) ) {
            if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) {
                // read starts before the span: clip the prefix through the span-start base
                clipper.addOp(new ClippingOp(0, p.getOffset()));
            }

            if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) {
                // read ends after the span: clip everything past the span-end base
                clipper.addOp(new ClippingOp(p.getOffset() + 1, readLen - 1));
            }
        }

        return clipper.clipRead(VARIABLE_READ_REPRESENTATION);
    }

    /** @return the maximum alignment end across reads, or -1 when reads is empty */
    private static int furtherestEnd(Collection<SAMRecord> reads) {
        int end = -1;
        for ( SAMRecord read : reads ) {
            end = Math.max(end, read.getAlignmentEnd());
        }
        return end;
    }
}
|
||||
|
|
@ -0,0 +1,273 @@
|
|||
package org.broadinstitute.sting.utils.clipreads;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Vector;
|
||||
|
||||
/**
 * Represents a clip on a read. It has a type (see the enum) along with a start and stop
 * (inclusive, 0-based offsets into the read's bases), plus an optional extraInfo
 * (useful for carrying info where needed).
 * <p/>
 * Also holds the critical apply function that actually executes the clipping operation
 * on a provided read, according to the wishes of the supplied ClippingRepresentation enum.
 */
public class ClippingOp {
    public final ClippingType type;  // optional classification of why we're clipping; may be null
    public final int start, stop; // inclusive
    public final Object extraInfo;   // optional caller-supplied payload; may be null

    /** Untyped clip of read bases start..stop (inclusive). */
    public ClippingOp(int start, int stop) {
        this(null, start, stop, null);
    }

    public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
        // todo -- remove type and extra info
        this.type = type;
        this.start = start;
        this.stop = stop;
        this.extraInfo = extraInfo;
    }

    /** Number of read bases covered by this operation. */
    public int getLength() {
        return stop - start + 1;
    }

    /**
     * Clips the bases in clippedRead according to this operation's start and stop,
     * using the clipping representation given by the algorithm argument.  Mutates
     * clippedRead in place.
     *
     * @param algorithm   how the clipped bases should be represented (Ns, Q0s, or soft clips)
     * @param clippedRead the read to modify
     */
    public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
        //clippedRead.setReferenceIndex(1);
        byte[] quals = clippedRead.getBaseQualities();
        byte[] bases = clippedRead.getReadBases();

        switch (algorithm) {
            // important note:
            // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0
            // because you're not guaranteed to get a pointer to the actual array of bytes in the SAMRecord
            case WRITE_NS:
                for (int i = start; i <= stop; i++)
                    bases[i] = 'N';
                clippedRead.setReadBases(bases);
                break;
            case WRITE_Q0S:
                for (int i = start; i <= stop; i++)
                    quals[i] = 0;
                clippedRead.setBaseQualities(quals);
                break;
            case WRITE_NS_Q0S:
                for (int i = start; i <= stop; i++) {
                    bases[i] = 'N';
                    quals[i] = 0;
                }
                clippedRead.setReadBases(bases);
                clippedRead.setBaseQualities(quals);
                break;
            case SOFTCLIP_BASES:
                if ( ! clippedRead.getReadUnmappedFlag() ) {
                    // unmapped reads can't be soft-clipped; they are left untouched

                    //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength());
                    int myStop = stop;
                    if ( (stop + 1 - start) == clippedRead.getReadLength() ) {
                        // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone
                        //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName()));
                        //break;
                        myStop--; // just decrement stop
                    }

                    // soft clips can only be applied at either end of the read, never in the middle
                    if ( start > 0 && myStop != clippedRead.getReadLength() - 1 )
                        throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
                                clippedRead.getReadName(), start, myStop));

                    Cigar oldCigar = clippedRead.getCigar();

                    // translate the [start, myStop] clip into "keep bases in [scLeft, scRight)"
                    int scLeft = 0, scRight = clippedRead.getReadLength();
                    if ( start == 0 )
                        scLeft = myStop + 1;
                    else
                        scRight = start;

                    // if ( clippedRead.getReadNegativeStrandFlag() ) {
                    //     if ( start == 0 )
                    //         scLeft = myStop + 1;
                    //     else
                    //         scRight = start;
                    // } else {
                    //     if ( start == 0 )
                    //         scLeft = myStop;
                    //     else
                    //         scRight = start;
                    // }

                    Cigar newCigar = softClip(oldCigar, scLeft, scRight);
                    clippedRead.setCigar(newCigar);

                    // clipping the front of the read shifts its alignment start forward
                    int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar);
                    int newStart = clippedRead.getAlignmentStart() + newClippedStart;
                    clippedRead.setAlignmentStart(newStart);

                    //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
                }

                break;
                //throw new RuntimeException("Softclipping of bases not yet implemented.");
            default:
                throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
        }
    }

    /**
     * What is the type of a ClippingOp?
     */
    public enum ClippingType {
        LOW_Q_SCORES,
        WITHIN_CLIP_RANGE,
        MATCHES_CLIP_SEQ
    }

    /**
     * Compute how far the alignment start must shift after clipping: first counts the
     * read bases at the front of the new cigar that no longer consume reference, then
     * measures how many reference bases those read bases consumed under the old cigar.
     */
    private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
        int num = 0;
        // read bases clipped at the front of the new cigar (before the first reference-consuming element)
        for (CigarElement e : __cigar.getCigarElements()) {
            if (!e.getOperator().consumesReferenceBases()) {
                if (e.getOperator().consumesReadBases()) {
                    num += e.getLength();
                }
            } else {
                break;
            }
        }

        int oldNum = 0;
        int curReadCounter = 0;

        // walk the old cigar until num read bases are accounted for, summing reference consumption
        for (CigarElement e : __oldCigar.getCigarElements()) {
            int curRefLength = e.getLength();
            int curReadLength = e.getLength();
            if (!e.getOperator().consumesReadBases()) {
                curReadLength = 0;
            }

            boolean truncated = false;
            if (curReadCounter + curReadLength > num) {
                // this element extends past the clip point; only count the clipped portion
                curReadLength = num - curReadCounter;
                curRefLength = num - curReadCounter;
                truncated = true;
            }

            if (!e.getOperator().consumesReferenceBases()) {
                curRefLength = 0;
            }

            curReadCounter += curReadLength;
            oldNum += curRefLength;

            if (curReadCounter > num || truncated) {
                break;
            }
        }

        return oldNum;
    }

    /**
     * Given a cigar, soft clip the read bases before __startClipEnd and from
     * __endClipBegin onward, merging adjacent same-operator elements in the result.
     */
    private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) {
        if (__endClipBegin <= __startClipEnd) {
            //whole thing should be soft clipped
            int cigarLength = 0;
            for (CigarElement e : __cigar.getCigarElements()) {
                cigarLength += e.getLength();
            }

            Cigar newCigar = new Cigar();
            newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP));
            assert newCigar.isValid(null, -1) == null;
            return newCigar;
        }

        int curLength = 0;
        Vector<CigarElement> newElements = new Vector<CigarElement>();
        for (CigarElement curElem : __cigar.getCigarElements()) {
            if (!curElem.getOperator().consumesReadBases()) {
                // elements consuming no read bases (e.g. deletions) are kept only when
                // strictly inside the retained region
                if (curLength > __startClipEnd && curLength < __endClipBegin) {
                    newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
                }
                continue;
            }

            // [s, e) is this element's read-base interval
            int s = curLength;
            int e = curLength + curElem.getLength();
            if (e <= __startClipEnd || s >= __endClipBegin) {
                //must turn this entire thing into a clip
                newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP));
            } else if (s >= __startClipEnd && e <= __endClipBegin) {
                //same thing -- entirely retained, copy as-is
                newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
            } else {
                //we are clipping in the middle of this guy
                CigarElement newStart = null;
                CigarElement newMid = null;
                CigarElement newEnd = null;

                int midLength = curElem.getLength();
                if (s < __startClipEnd) {
                    newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP);
                    midLength -= newStart.getLength();
                }

                if (e > __endClipBegin) {
                    newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP);
                    midLength -= newEnd.getLength();
                }
                assert midLength >= 0;
                if (midLength > 0) {
                    newMid = new CigarElement(midLength, curElem.getOperator());
                }
                if (newStart != null) {
                    newElements.add(newStart);
                }
                if (newMid != null) {
                    newElements.add(newMid);
                }
                if (newEnd != null) {
                    newElements.add(newEnd);
                }
            }
            curLength += curElem.getLength();
        }

        // merge runs of adjacent elements sharing an operator (e.g. S followed by S)
        Vector<CigarElement> finalNewElements = new Vector<CigarElement>();
        CigarElement lastElement = null;
        for (CigarElement elem : newElements) {
            if (lastElement == null || lastElement.getOperator() != elem.getOperator()) {
                if (lastElement != null) {
                    finalNewElements.add(lastElement);
                }
                lastElement = elem;
            } else {
                lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator());
            }
        }
        if (lastElement != null) {
            finalNewElements.add(lastElement);
        }

        Cigar newCigar = new Cigar(finalNewElements);
        assert newCigar.isValid(null, -1) == null;
        return newCigar;
    }
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package org.broadinstitute.sting.utils.clipreads;
|
||||
|
||||
/**
 * How should clipped bases be represented in a read?
 */
public enum ClippingRepresentation {
    WRITE_NS,       // change the clipped bases to Ns
    WRITE_Q0S,      // change the clipped bases' quality scores to Q0
    WRITE_NS_Q0S,   // change the quality scores to Q0 and write Ns
    SOFTCLIP_BASES  // change the cigar string to soft-clip (S) the bases, but keep them
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
package org.broadinstitute.sting.utils.clipreads;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A simple collection of the clipping operations to apply to a read along with its read
|
||||
*/
|
||||
public class ReadClipper {
|
||||
SAMRecord read;
|
||||
List<ClippingOp> ops = null;
|
||||
|
||||
/**
|
||||
* We didn't do any clipping work on this read, just leave everything as a default
|
||||
*
|
||||
* @param read
|
||||
*/
|
||||
public ReadClipper(final SAMRecord read) {
|
||||
this.read = read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add another clipping operation to apply to this read
|
||||
*
|
||||
* @param op
|
||||
*/
|
||||
public void addOp(ClippingOp op) {
|
||||
if (ops == null) ops = new ArrayList<ClippingOp>();
|
||||
ops.add(op);
|
||||
}
|
||||
|
||||
public List<ClippingOp> getOps() {
|
||||
return ops;
|
||||
}
|
||||
|
||||
public boolean wasClipped() {
|
||||
return ops != null;
|
||||
}
|
||||
|
||||
public SAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
||||
*
|
||||
* @param algorithm
|
||||
* @return
|
||||
*/
|
||||
public SAMRecord clipRead(ClippingRepresentation algorithm) {
|
||||
if (ops == null)
|
||||
return getRead();
|
||||
else {
|
||||
try {
|
||||
SAMRecord clippedRead = (SAMRecord) read.clone();
|
||||
for (ClippingOp op : getOps()) {
|
||||
op.apply(algorithm, clippedRead);
|
||||
}
|
||||
return clippedRead;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e); // this should never happen
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
Features needed:
|
||||
|
||||
- intrinsic support for samples. Pileups are tree based data structures whose leaves are actual
|
||||
pile ups of a single "sample" and whose nodes are collections of multiple sub pileups. This will
|
||||
make join and split operations very cheap.
|
||||
|
||||
- should be light-weight to create, and hold only minimal cached data to avoid unnecessary overhead.
|
||||
Things like the number of deletions, insertions, etc shouldn't be required information. Size will
|
||||
continue to be a key cached value. Could create a simple caching data structure that calculates lots of metrics about the pileup and is somehow
|
||||
cached internally, via a "CachedRBP" structure. This will make it very cheap and easy to filter
|
||||
pileups on the fly, costing O(N) to create the filtered context.
|
||||
|
||||
- immutable
|
||||
|
||||
- support for holding neighboring reads to the left and right of the pileup itself
|
||||
|
||||
- unified picture for "regular" and "extended" events. ExtendedEvents are really a special
|
||||
call from the engine and have nothing to do with the data itself.
|
||||
|
||||
- Where should algorithms operating on the pileups go? Two options are in the interface itself,
|
||||
making it very heavy-weight but easy to access, vs. in an associated PileupOps static methods, a
|
||||
la Collections.
|
||||
|
||||
- The Pileup2 should support on-the-fly filtering, so that read filters can be added at the top level
|
||||
and applied at all levels of the tree. Basically a filtering pileup would just create a new
|
||||
mirrored tree with filtering applied to each node. Very low overhead.
|
||||
|
||||
- Sizes could be cached as needed, so that only one pass is ever needed over the size of any pileup
|
||||
|
||||
- Fundamentally pileups are just collections of read-backed data. The PileupElements contain
|
||||
all of the smarts -- regular, indel, fragment-based. We need to be able to create pileups containing
|
||||
multiple subtype elements, which by necessity will need to declare their own static type. How is it
|
||||
best to do this in Java? Have a single global ENUM that enumerates all of the possible types at
|
||||
compile time? Perhaps something more dynamic?
|
||||
Loading…
Reference in New Issue