diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index 17a45cd6a..91eef6f34 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -33,11 +33,13 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.clipreads.ClippingOp; +import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import java.nio.channels.IllegalSelectorException; import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; @@ -51,7 +53,7 @@ import net.sf.samtools.util.StringUtil; * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. */ @Requires({DataSource.READS}) -public class ClipReadsWalker extends ReadWalker { +public class ClipReadsWalker extends ReadWalker { @Output PrintStream out; @@ -215,7 +217,7 @@ public class ClipReadsWalker extends ReadWalker - * Also holds the critical apply function that actually execute the clipping operation on a provided read, - * according to the wishes of the supplid ClippingAlgorithm enum. - */ - private class ClippingOp { - public ClippingType type; - public int start, stop; // inclusive - public Object extraInfo = null; - - public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) { - this.type = type; - this.start = start; - this.stop = stop; - this.extraInfo = extraInfo; - } - - public int getLength() { - return stop - start + 1; - } - - /** - * Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping - * representation used is the one provided by algorithm argument. - * - * @param algorithm - * @param clippedRead - */ - public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) { - //clippedRead.setReferenceIndex(1); - byte[] quals = clippedRead.getBaseQualities(); - byte[] bases = clippedRead.getReadBases(); - - switch (algorithm) { - // important note: - // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 - // because you're not guaranteed to get a pointer to the actual array of bytes in the SAMRecord - case WRITE_NS: - for (int i = start; i <= stop; i++) - bases[i] = 'N'; - clippedRead.setReadBases(bases); - break; - case WRITE_Q0S: - for (int i = start; i <= stop; i++) - quals[i] = 0; - clippedRead.setBaseQualities(quals); - break; - case WRITE_NS_Q0S: - for (int i = start; i <= stop; i++) { - bases[i] = 'N'; - quals[i] = 0; - } - clippedRead.setReadBases(bases); - clippedRead.setBaseQualities(quals); - break; - case SOFTCLIP_BASES: - if ( ! clippedRead.getReadUnmappedFlag() ) { - // we can't process unmapped reads - - //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); - if ( (stop + 1 - start) == clippedRead.getReadLength() ) { - // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); - break; - } - - if ( start > 0 && stop != clippedRead.getReadLength() - 1 ) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", - clippedRead.getReadName(), start, stop)); - - Cigar oldCigar = clippedRead.getCigar(); - - int scLeft = 0, scRight = clippedRead.getReadLength(); - if ( clippedRead.getReadNegativeStrandFlag() ) { - if ( start == 0 ) - scLeft = stop + 1; - else - scRight = start + 1; - } else { - if ( start == 0 ) - scLeft = stop; - else - scRight = start; - } - - Cigar newCigar = _softClip(oldCigar, scLeft, scRight); - clippedRead.setCigar(newCigar); - - int newClippedStart = _getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = clippedRead.getAlignmentStart() + newClippedStart; - clippedRead.setAlignmentStart(newStart); - - //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); - } - - break; - //throw new RuntimeException("Softclipping of bases not yet implemented."); - default: - throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); - } - } - } - - /** - * How should we represent a clipped bases in a read? - */ - public enum ClippingRepresentation { - WRITE_NS, // change the bases to Ns - WRITE_Q0S, // change the quality scores to Q0 - WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns - SOFTCLIP_BASES // change cigar string to S, but keep bases - } - - /** - * A simple collection of the clipping operations to apply to a read along with its read - */ - static public class ReadClipper { - SAMRecord read; - List ops = null; - - /** - * We didn't do any clipping work on this read, just leave everything as a default - * - * @param read - */ - public ReadClipper(final SAMRecord read) { - this.read = read; - } - - /** - * Add another clipping operation to apply to this read - * - * @param op - */ - public void addOp(ClippingOp op) { - if (ops == null) ops = new ArrayList(); - ops.add(op); - } - - public List getOps() { - return ops; - } - - public boolean wasClipped() { - return ops != null; - } - - public SAMRecord getRead() { - return read; - } - - - /** - * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. - * - * @param algorithm - * @return - */ - public SAMRecord clipRead(ClippingRepresentation algorithm) { - if (ops == null) - return getRead(); - else { - try { - SAMRecord clippedRead = (SAMRecord) read.clone(); - for (ClippingOp op : getOps()) { - op.apply(algorithm, clippedRead); - } - return clippedRead; - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // this should never happen - } - } - } - } - public static class ClippingData { public long nTotalReads = 0; public long nTotalBases = 0; @@ -616,140 +432,4 @@ public class ClipReadsWalker extends ReadWalker num) { - curReadLength = num - curReadCounter; - curRefLength = num - curReadCounter; - truncated = true; - } - - if (!e.getOperator().consumesReferenceBases()) { - curRefLength = 0; - } - - curReadCounter += curReadLength; - oldNum += curRefLength; - - if (curReadCounter > num || truncated) { - break; - } - } - - return oldNum; - } - - /** - * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin - */ - private Cigar _softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { - if (__endClipBegin <= __startClipEnd) { - //whole thing should be soft clipped - int cigarLength = 0; - for (CigarElement e : __cigar.getCigarElements()) { - cigarLength += e.getLength(); - } - - Cigar newCigar = new Cigar(); - newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); - assert newCigar.isValid(null, -1) == null; - return newCigar; - } - - int curLength = 0; - Vector newElements = new Vector(); - for (CigarElement curElem : __cigar.getCigarElements()) { - if (!curElem.getOperator().consumesReadBases()) { - if (curLength > __startClipEnd && curLength < __endClipBegin) { - newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); - } - continue; - } - - int s = curLength; - int e = curLength + curElem.getLength(); - if (e <= __startClipEnd || s >= __endClipBegin) { - //must turn this entire thing into a clip - newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); - } else if (s >= __startClipEnd && e <= __endClipBegin) { - //same thing - newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); - } else { - //we are clipping in the middle of this guy - CigarElement newStart = null; - CigarElement newMid = null; - CigarElement newEnd = null; - - int midLength = curElem.getLength(); - if (s < __startClipEnd) { - newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); - midLength -= newStart.getLength(); - } - - if (e > __endClipBegin) { - newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP); - midLength -= newEnd.getLength(); - } - assert midLength >= 0; - if (midLength > 0) { - newMid = new CigarElement(midLength, curElem.getOperator()); - } - if (newStart != null) { - newElements.add(newStart); - } - if (newMid != null) { - newElements.add(newMid); - } - if (newEnd != null) { - newElements.add(newEnd); - } - } - curLength += curElem.getLength(); - } - - Vector finalNewElements = new Vector(); - CigarElement lastElement = null; - for (CigarElement elem : newElements) { - if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { - if (lastElement != null) { - finalNewElements.add(lastElement); - } - lastElement = elem; - } else { - lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); - } - } - if (lastElement != null) { - finalNewElements.add(lastElement); - } - - Cigar newCigar = new Cigar(finalNewElements); - assert newCigar.isValid(null, -1) == null; - return newCigar; - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java new file mode 100644 index 000000000..3919d76dc --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/BaseCounts.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import org.broadinstitute.sting.utils.BaseUtils; + +/** +* Created by IntelliJ IDEA. +* User: depristo +* Date: 4/8/11 +* Time: 2:55 PM +* To change this template use File | Settings | File Templates. +*/ +final class BaseCounts { + int counts[] = new int[4]; // fixme -- include - and I events + + public void incr(byte base) { + int baseI = BaseUtils.simpleBaseToBaseIndex(base); + if ( baseI >= 0 ) // no Ns + counts[baseI]++; + } + + public byte baseWithMostCounts() { + return BaseUtils.baseIndexToSimpleBase(maxBaseIndex()); + } + + public int countOfMostCommonBase() { + return counts[maxBaseIndex()]; + } + + public int totalCounts() { + int sum = 0; + + for ( int c : counts ) { + sum += c; + } + + return sum; + } + + private int maxBaseIndex() { + int maxI = 0; + for ( int i = 0; i < counts.length; i++) { + if ( counts[i] > counts[maxI] ) { + maxI = i; + } + } + return maxI; + } + + public String toString() { + StringBuilder b = new StringBuilder(); + for ( int i = 0; i < counts.length; i++ ) { + b.append((char)BaseUtils.baseIndexToSimpleBase(i)).append("=").append(counts[i]).append(","); + } + return b.toString(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java new file mode 100644 index 000000000..fa91137e6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusReadCompressor.java @@ -0,0 +1,22 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import net.sf.samtools.SAMRecord; + +import java.util.Collection; +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 4/10/11 + * Time: 8:49 AM + * To change this template use File | Settings | File Templates. + */ +public interface ConsensusReadCompressor extends Iterable { + void addAlignment(SAMRecord read); + + Collection consensusReads(); + + @Override + Iterator iterator(); +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java new file mode 100644 index 000000000..97e1fa70a --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSite.java @@ -0,0 +1,101 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +/** +* Created by IntelliJ IDEA. +* User: depristo +* Date: 4/8/11 +* Time: 3:01 PM +* To change this template use File | Settings | File Templates. +*/ +final class ConsensusSite { + final GenomeLoc loc; + final Set overlappingReads = new HashSet(); + final int offset; + final BaseCounts counts = new BaseCounts(); + + ConsensusType markedType = null; + + public ConsensusSite(GenomeLoc loc, int offset) { + this.loc = loc; + + this.offset = offset; + + } + + public GenomeLoc getLoc() { + return loc; + } + + public Set getOverlappingReads() { + return overlappingReads; + } + + public void addOverlappingRead(PileupElement elt) { + overlappingReads.add(elt); + counts.incr(elt.getBase()); + } + + public boolean isStrongConsensus(final double maxFractionDisagreeingBases) { + int mostCommon = counts.countOfMostCommonBase(); + int total = counts.totalCounts(); + double fractionCommonBase = (1.0 * mostCommon) / total; + return (1 - fractionCommonBase) < maxFractionDisagreeingBases; + } + + public final static class ConsensusBase { + byte base, qual; + + public byte getBase() { + return base; + } + + public byte getQual() { + return qual; + } + + public ConsensusBase(byte base, byte qual) { + this.base = base; + this.qual = qual; + } + } + + public ConsensusBase getConsensus() { + byte base = counts.baseWithMostCounts(); + int qual = 0; + + for ( PileupElement p : overlappingReads ) { + if ( p.getBase() == base ) + qual++; + } + + return new ConsensusBase(base, QualityUtils.boundQual(qual, (byte)64)); + } + + public String toString() { + return counts.toString(); + } + + public void setMarkedType(ConsensusType markedType) { + this.markedType = markedType; + } + + public ConsensusType getMarkedType() { + if ( markedType == null ) throw new ReviewedStingException("markedType not yet set!"); + return markedType; + } + + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java new file mode 100644 index 000000000..33b196767 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusSpan.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import org.broadinstitute.sting.utils.GenomeLoc; + +/** +* Created by IntelliJ IDEA. +* User: depristo +* Date: 4/8/11 +* Time: 3:01 PM +* To change this template use File | Settings | File Templates. +*/ +final class ConsensusSpan { + final int refStart; // the start position on the reference for relative calculations + final GenomeLoc loc; + final ConsensusType consensusType; + + public ConsensusSpan(final int refStart, GenomeLoc loc, ConsensusType consensusType) { + this.refStart = refStart; + this.loc = loc; + this.consensusType = consensusType; + } + + public int getOffsetFromStartOfSites() { + return loc.getStart() - refStart; + } + + public int getGenomeStart() { + return loc.getStart(); + } + + public int getGenomeStop() { + return loc.getStop(); + } + + public ConsensusType getConsensusType() { + return consensusType; + } + + public int size() { + return getGenomeStop() - getGenomeStart() + 1; + } + + public boolean isConserved() { return getConsensusType() == ConsensusType.CONSERVED; } + public boolean isVariable() { return getConsensusType() == ConsensusType.VARIABLE; } + + public String toString() { + return String.format("%s %s", consensusType, loc); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java new file mode 100644 index 000000000..0e69e7c5d --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ConsensusType.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +/** +* Created by IntelliJ IDEA. +* User: depristo +* Date: 4/9/11 +* Time: 7:52 PM +* To change this template use File | Settings | File Templates. +*/ +public enum ConsensusType { + CONSERVED, VARIABLE; + + public static ConsensusType otherType(ConsensusType t) { + switch ( t ) { + case CONSERVED: return VARIABLE; + case VARIABLE: return CONSERVED; + } + return CONSERVED; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java new file mode 100644 index 000000000..e79a8ad0c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/MultiSampleConsensusReadCompressor.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +//import org.broadinstitute.sting.utils.SimpleTimer; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * + * @author depristo + * @version 0.1 + */ +public class MultiSampleConsensusReadCompressor implements ConsensusReadCompressor { + protected static final Logger logger = Logger.getLogger(MultiSampleConsensusReadCompressor.class); + + Map compressorsPerSample = new HashMap(); + + public MultiSampleConsensusReadCompressor(SAMFileHeader header, + final int readContextSize, + final GenomeLocParser glParser, + final String contig) { + for ( String name : SampleUtils.getSAMFileSamples(header) ) { + compressorsPerSample.put(name, + new SingleSampleConsensusReadCompressor(readContextSize, glParser, contig)); + } + } + + @Override + public void addAlignment(SAMRecord read) { + String sample = read.getReadGroup().getSample(); + SingleSampleConsensusReadCompressor compressor = compressorsPerSample.get(sample); + if ( compressor == null ) + throw new ReviewedStingException("No compressor for sample " + sample); + compressor.addAlignment(read); + } + + @Override + public Collection consensusReads() { + List reads = new LinkedList(); + for ( SingleSampleConsensusReadCompressor comp : compressorsPerSample.values() ) + reads.addAll(comp.consensusReads()); + return reads; + } + + @Override + public Iterator iterator() { + return consensusReads().iterator(); + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java index 78290bd42..6561fe121 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/ReduceReadsWalker.java @@ -25,53 +25,50 @@ package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; -import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMRecord; -import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Set; /** * Created by IntelliJ IDEA. * User: depristo * Date: April 7, 2011 */ -public class ReduceReadsWalker extends ReadWalker { +public class ReduceReadsWalker extends ReadWalker> { @Output protected StingSAMFileWriter out; - @Argument(fullName = "SNPContextSize", shortName = "SCS", doc = "", required = true) - protected int SNPContextSize; + @Output(fullName="bedOut", shortName = "bedOut", doc="BED output", required = false) + protected PrintStream bedOut = null; - @Argument(fullName = "IndelContextSize", shortName = "ICS", doc = "", required = true) - protected int IndelContextSize; + @Argument(fullName = "contextSize", shortName = "CS", doc = "", required = false) + protected int contextSize = 10; + + @Argument(fullName = "INCLUDE_RAW_READS", shortName = "IRR", doc = "", required = false) + protected boolean INCLUDE_RAW_READS = false; + + @Argument(fullName = "useRead", shortName = "UR", doc = "", required = false) + protected Set readNamesToUse; - protected ReducingSAMFileWriter reducingOut; protected int totalReads = 0; @Override public void initialize() { - reducingOut = new ReducingSAMFileWriter(out, SNPContextSize, IndelContextSize); + super.initialize(); + out.setPresorted(false); } @Override public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { - for ( GATKFeature feature : metaDataTracker.getAllCoveringRods() ) { - if ( feature.getUnderlyingObject() instanceof VariantContext ) { - VariantContext vc = (VariantContext)feature.getUnderlyingObject(); - reducingOut.addVariant(vc); - } - } - totalReads++; return read; // all the work is done in the reduce step for this walker } @@ -83,26 +80,45 @@ public class ReduceReadsWalker extends ReadWalker { * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise */ @Override - public SAMFileWriter reduceInit() { - return reducingOut; + public Collection reduceInit() { + return new ArrayList(); } /** * given a read and a output location, reduce by emitting the read * @param read the read itself - * @param output the output source * @return the SAMFileWriter, so that the next reduce can emit to the same source */ - public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { - output.addAlignment(read); - return output; + public Collection reduce( SAMRecord read, Collection reads ) { + if ( readNamesToUse == null || readNamesToUse.contains(read.getReadName()) ) + reads.add(read); + return reads; } - public void onTraversalDone( SAMFileWriter reduceResult ) { - logger.info("Compressed reads: " + reducingOut.getNCompressedReads()); - logger.info("Total reads : " + totalReads); - // todo -- fixme - //reducingOut.close(); + @Override + public void onTraversalDone( Collection reads ) { + String contig = reads.iterator().next().getReferenceName(); + ConsensusReadCompressor compressor = + new MultiSampleConsensusReadCompressor(getToolkit().getSAMFileHeader(), + contextSize, getToolkit().getGenomeLocParser(), contig); + + // add all of the reads to the compressor + for ( SAMRecord read : reads ) { + if ( INCLUDE_RAW_READS ) + out.addAlignment(read); + compressor.addAlignment(read); + } + + // compress the reads + //compressor.writeConsensusBed(bedOut); + int nCompressedReads = 0; + for ( SAMRecord read : compressor ) { + out.addAlignment(read); + nCompressedReads++; + } + + logger.info("Compressed reads : " + nCompressedReads); + logger.info("Total reads : " + totalReads); } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/RefPileupElement.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/RefPileupElement.java new file mode 100755 index 000000000..f3e820a3c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/RefPileupElement.java @@ -0,0 +1,74 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 14, 2009 + * Time: 8:54:05 AM + * To change this template use File | Settings | File Templates. + */ +public class RefPileupElement extends PileupElement { + final int refOffset; + + public RefPileupElement(SAMRecord read, int offset, int refOffset) { + super(read, offset); + this.refOffset = refOffset; + } + + public int getRefOffset() { + return refOffset; + } + + public static Iterable walkRead(SAMRecord read) { + return walkRead(read, 0); + } + + public static Iterable walkRead(final SAMRecord read, final int refIStart) { + return new Iterable() { + public Iterator iterator() { + List elts = new ArrayList(); + + int readI = 0, refI = read.getAlignmentStart() - refIStart; + for ( CigarElement elt : read.getCigar().getCigarElements() ) { + int l = elt.getLength(); + switch (elt.getOperator()) { + case N: // cannot handle these + break; + case H : case P : // ignore pads and hard clips + break; + case S : + //refI += l; // move the reference too, in addition to I + readI += l; + break; + case I : + for ( int i = 0; i < l; i++) + elts.add(new RefPileupElement(read, readI++, refI)); + break; + case D : + for ( int i = 0; i < l; i++) + elts.add(new RefPileupElement(read, -1, refI++)); + break; + case M : + for ( int i = 0; i < l; i++) + elts.add(new RefPileupElement(read, readI++, refI++)); + break; + default: + throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); + } + } + + return elts.iterator(); + } + }; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/SingleSampleConsensusReadCompressor.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/SingleSampleConsensusReadCompressor.java new file mode 100644 index 000000000..3697f3f6c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/reducereads/SingleSampleConsensusReadCompressor.java @@ -0,0 +1,294 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.reducereads; + +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.clipreads.ClippingOp; +import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; +import org.broadinstitute.sting.utils.clipreads.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; + +import java.io.PrintStream; +import java.util.*; + +//import org.broadinstitute.sting.utils.SimpleTimer; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * + * @author depristo + * @version 0.1 + */ +public class SingleSampleConsensusReadCompressor implements ConsensusReadCompressor { + protected static final Logger logger = Logger.getLogger(SingleSampleConsensusReadCompressor.class); + private static final boolean DEBUG = false; + private static final boolean INVERT = false; + private static final boolean PRINT_CONSENSUS_READS = false; + private static final double MAX_FRACTION_DISAGREEING_BASES = 0.1; + private static final ClippingRepresentation VARIABLE_READ_REPRESENTATION = ClippingRepresentation.SOFTCLIP_BASES; + + /** The place where we ultimately write out our records */ + Queue waitingReads = new LinkedList(); + + final int readContextSize; + final int maxReadsAtVariableSites = 50; + long nCompressedReads = 0; + + final String contig; + final GenomeLocParser glParser; + SAMFileHeader header; + + // todo -- require minimum read size of 2 bp in variable region + + public SingleSampleConsensusReadCompressor(final int readContextSize, + final GenomeLocParser glParser, + final String contig) { + this.readContextSize = readContextSize; + this.glParser = glParser; + this.contig = contig; + } + + /** + * @{inheritDoc} + */ + public void addAlignment( SAMRecord read ) { + if ( header == null ) + header = read.getHeader(); + + if ( ! read.getDuplicateReadFlag() && ! read.getNotPrimaryAlignmentFlag() && ! read.getReadUnmappedFlag() ) + waitingReads.add(read); + } + + public void writeConsensusBed(PrintStream bedOut) { + for ( ConsensusSite site : calculateConsensusSites(waitingReads) ) { + GenomeLoc loc = site.getLoc(); + bedOut.printf("%s\t%d\t%d\t%s%n", loc.getContig(), loc.getStart()-1, loc.getStop(), site.counts); + } + } + + @Override + public Iterator iterator() { + return consensusReads().iterator(); + } + + public Collection consensusReads() { + if ( ! waitingReads.isEmpty() ) { + List sites = calculateConsensusSites(waitingReads); + List spans = calculateSpans(sites); + return consensusReadsFromSitesAndSpans(sites, spans); + } else { + return Collections.EMPTY_LIST; + } + } + + private List expandVariableSites(List sites) { + for ( ConsensusSite site : sites ) + site.setMarkedType(ConsensusType.CONSERVED); + + for ( int i = 0; i < sites.size(); i++ ) { + ConsensusSite site = sites.get(i); + if ( ! site.isStrongConsensus(MAX_FRACTION_DISAGREEING_BASES) ) { + int start = Math.max(i - readContextSize, 0); + int stop = Math.min(sites.size(), i + readContextSize + 1); + for ( int j = start; j < stop; j++ ) { + // aggressive tagging -- you are only conserved if you are never variable + sites.get(j).setMarkedType(ConsensusType.VARIABLE); + } + } + } + + return sites; + } + + private List calculateSpans(List rawSites) { + List sites = expandVariableSites(rawSites); + List spans = new ArrayList(); + int start = 0; + + // our first span type is the type of the first site + ConsensusType consensusType = sites.get(0).getMarkedType(); + while ( start < sites.size() ) { + ConsensusSpan span = findSpan(sites, start, consensusType); + + if ( span == null ) // we are done + return spans; + else { + spans.add(span); + start += span.size(); + } + + consensusType = ConsensusType.otherType(consensusType); + } + + return spans; + } + + private ConsensusSpan findSpan(List sites, int start, ConsensusType consensusType) { + int refStart = sites.get(0).getLoc().getStart(); + + for ( int end = start + 1; end < sites.size(); end++ ) { + ConsensusSite site = sites.get(end); + boolean conserved = site.getMarkedType() == ConsensusType.CONSERVED; + if ( (consensusType == ConsensusType.CONSERVED && ! conserved) || + (consensusType == ConsensusType.VARIABLE && conserved) || + end + 1 == sites.size() ) { // we are done with the complete interval + GenomeLoc loc = glParser.createGenomeLoc(contig, start+refStart, end+refStart-1); + return new ConsensusSpan(refStart, loc, consensusType); + } + } + + return null; // couldn't find anything + } + + + private List calculateConsensusSites(Collection reads) { + SAMRecord firstRead = reads.iterator().next(); + int refStart = firstRead.getAlignmentStart(); + int refEnd = furtherestEnd(reads); + + // set up the consensus site array + List consensusSites = new ArrayList(); + int len = refEnd - refStart + 1; + for ( int i = 0; i < len; i++ ) { + int l = refStart + i; + GenomeLoc loc = glParser.createGenomeLoc(contig, l, l); + consensusSites.add(new ConsensusSite(loc, i)); + } + + for ( SAMRecord read : reads ) { + for ( RefPileupElement p : RefPileupElement.walkRead(read, refStart) ) { + // add to the consensus at this site + consensusSites.get(p.getRefOffset()).addOverlappingRead(p); + } + } + + return consensusSites; + } + + private List consensusReadsFromSitesAndSpans(List sites, List spans) { + List reads = new ArrayList(); + + for ( ConsensusSpan span : spans ) { + logger.info("Span is " + span); + if ( span.isConserved() ) + reads.addAll(conservedSpanReads(sites, span)); + else + reads.addAll(downsample(variableSpanReads(sites, span))); + } + + return reads; + } + + private Collection downsample(Collection reads) { + if ( reads.size() > maxReadsAtVariableSites ) { + List readArray = new ArrayList(reads); + Collections.shuffle(readArray, GenomeAnalysisEngine.getRandomGenerator()); + return readArray.subList(0, maxReadsAtVariableSites); + } else { + return reads; + } + } + + private List conservedSpanReads(List sites, ConsensusSpan span) { + byte[] bases = new byte[span.size()]; + byte[] quals = new byte[span.size()]; + + for ( int i = 0; i < span.size(); i++ ) { + int refI = i + span.getOffsetFromStartOfSites(); + ConsensusSite site = sites.get(refI); + if ( site.getMarkedType() == ConsensusType.VARIABLE ) + throw new ReviewedStingException("Variable site included in consensus: " + site); + final int count = site.counts.countOfMostCommonBase(); + final byte base = count == 0 ? (byte)'N' : site.counts.baseWithMostCounts(); + bases[i] = base; + quals[i] = QualityUtils.boundQual(count, (byte)64); + } + + SAMRecord consensus = new SAMRecord(header); + consensus.setReferenceName(contig); + consensus.setReadName("Mark"); + consensus.setCigarString(String.format("%dM", span.size())); + consensus.setReadPairedFlag(false); + consensus.setAlignmentStart(span.getGenomeStart()); + consensus.setReadBases(bases); + consensus.setBaseQualities(quals); + consensus.setMappingQuality(60); + +// if ( INVERT && PRINT_CONSENSUS_READS ) +// for ( SAMRecord read : consensusReads ) +// finalDestination.addAlignment(read); + + return Collections.singletonList(consensus); + } + + private Collection variableSpanReads(List sites, ConsensusSpan span) { + Set reads = new HashSet(); + + for ( int i = 0; i < span.size(); i++ ) { + int refI = i + span.getOffsetFromStartOfSites(); + ConsensusSite site = sites.get(refI); + for ( PileupElement p : site.getOverlappingReads() ) { +// if ( reads.contains(p.getRead())) +// logger.info("Skipping already added read: " + p.getRead().getReadName()); + reads.add(clipReadToSpan(p.getRead(), span)); + } + } + + return reads; + } + + private SAMRecord clipReadToSpan(SAMRecord read, ConsensusSpan span) { + ReadClipper clipper = new ReadClipper(read); + int spanStart = span.getGenomeStart(); + int spanEnd = span.getGenomeStop(); + int readLen = read.getReadLength(); + + for ( RefPileupElement p : RefPileupElement.walkRead(read) ) { + if ( p.getRefOffset() == spanStart && p.getOffset() != 0 ) { + clipper.addOp(new ClippingOp(0, p.getOffset())); + } + + if ( p.getRefOffset() == spanEnd && p.getOffset() != readLen - 1 ) { + clipper.addOp(new ClippingOp(p.getOffset() + 1, readLen - 1)); + } + } + + return clipper.clipRead(VARIABLE_READ_REPRESENTATION); + } + + private static int furtherestEnd(Collection reads) { + int end = -1; + for ( SAMRecord read : reads ) { + end = Math.max(end, read.getAlignmentEnd()); + } + return end; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java new file mode 100644 index 000000000..3f2d5469d --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -0,0 +1,273 @@ +package org.broadinstitute.sting.utils.clipreads; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Vector; + +/** + * Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases + * of the read, plus an option extraInfo (useful for carrying info where needed). + *

+ * Also holds the critical apply function that actually execute the clipping operation on a provided read, + * according to the wishes of the supplid ClippingAlgorithm enum. + */ +public class ClippingOp { + public final ClippingType type; + public final int start, stop; // inclusive + public final Object extraInfo; + + public ClippingOp(int start, int stop) { + this(null, start, stop, null); + } + + public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) { + // todo -- remove type and extra info + this.type = type; + this.start = start; + this.stop = stop; + this.extraInfo = extraInfo; + } + + public int getLength() { + return stop - start + 1; + } + + /** + * Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping + * representation used is the one provided by algorithm argument. + * + * @param algorithm + * @param clippedRead + */ + public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) { + //clippedRead.setReferenceIndex(1); + byte[] quals = clippedRead.getBaseQualities(); + byte[] bases = clippedRead.getReadBases(); + + switch (algorithm) { + // important note: + // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 + // because you're not guaranteed to get a pointer to the actual array of bytes in the SAMRecord + case WRITE_NS: + for (int i = start; i <= stop; i++) + bases[i] = 'N'; + clippedRead.setReadBases(bases); + break; + case WRITE_Q0S: + for (int i = start; i <= stop; i++) + quals[i] = 0; + clippedRead.setBaseQualities(quals); + break; + case WRITE_NS_Q0S: + for (int i = start; i <= stop; i++) { + bases[i] = 'N'; + quals[i] = 0; + } + clippedRead.setReadBases(bases); + clippedRead.setBaseQualities(quals); + break; + case SOFTCLIP_BASES: + if ( ! clippedRead.getReadUnmappedFlag() ) { + // we can't process unmapped reads + + //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); + int myStop = stop; + if ( (stop + 1 - start) == clippedRead.getReadLength() ) { + // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); + //break; + myStop--; // just decrement stop + } + + if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", + clippedRead.getReadName(), start, myStop)); + + Cigar oldCigar = clippedRead.getCigar(); + + int scLeft = 0, scRight = clippedRead.getReadLength(); + if ( start == 0 ) + scLeft = myStop + 1; + else + scRight = start; + +// if ( clippedRead.getReadNegativeStrandFlag() ) { +// if ( start == 0 ) +// scLeft = myStop + 1; +// else +// scRight = start; +// } else { +// if ( start == 0 ) +// scLeft = myStop; +// else +// scRight = start; +// } + + Cigar newCigar = softClip(oldCigar, scLeft, scRight); + clippedRead.setCigar(newCigar); + + int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = clippedRead.getAlignmentStart() + newClippedStart; + clippedRead.setAlignmentStart(newStart); + + //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + } + + break; + //throw new RuntimeException("Softclipping of bases not yet implemented."); + default: + throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); + } + } + + /** + * What is the type of a ClippingOp? + */ + public enum ClippingType { + LOW_Q_SCORES, + WITHIN_CLIP_RANGE, + MATCHES_CLIP_SEQ + } + + /** + * Given a cigar string, get the number of bases hard or soft clipped at the start + */ + private int getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) { + int num = 0; + for (CigarElement e : __cigar.getCigarElements()) { + if (!e.getOperator().consumesReferenceBases()) { + if (e.getOperator().consumesReadBases()) { + num += e.getLength(); + } + } else { + break; + } + } + + int oldNum = 0; + int curReadCounter = 0; + + for (CigarElement e : __oldCigar.getCigarElements()) { + int curRefLength = e.getLength(); + int curReadLength = e.getLength(); + if (!e.getOperator().consumesReadBases()) { + curReadLength = 0; + } + + boolean truncated = false; + if (curReadCounter + curReadLength > num) { + curReadLength = num - curReadCounter; + curRefLength = num - curReadCounter; + truncated = true; + } + + if (!e.getOperator().consumesReferenceBases()) { + curRefLength = 0; + } + + curReadCounter += curReadLength; + oldNum += curRefLength; + + if (curReadCounter > num || truncated) { + break; + } + } + + return oldNum; + } + + /** + * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin + */ + private Cigar softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { + if (__endClipBegin <= __startClipEnd) { + //whole thing should be soft clipped + int cigarLength = 0; + for (CigarElement e : __cigar.getCigarElements()) { + cigarLength += e.getLength(); + } + + Cigar newCigar = new Cigar(); + newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } + + int curLength = 0; + Vector newElements = new Vector(); + for (CigarElement curElem : __cigar.getCigarElements()) { + if (!curElem.getOperator().consumesReadBases()) { + if (curLength > __startClipEnd && curLength < __endClipBegin) { + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } + continue; + } + + int s = curLength; + int e = curLength + curElem.getLength(); + if (e <= __startClipEnd || s >= __endClipBegin) { + //must turn this entire thing into a clip + newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); + } else if (s >= __startClipEnd && e <= __endClipBegin) { + //same thing + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } else { + //we are clipping in the middle of this guy + CigarElement newStart = null; + CigarElement newMid = null; + CigarElement newEnd = null; + + int midLength = curElem.getLength(); + if (s < __startClipEnd) { + newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); + midLength -= newStart.getLength(); + } + + if (e > __endClipBegin) { + newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP); + midLength -= newEnd.getLength(); + } + assert midLength >= 0; + if (midLength > 0) { + newMid = new CigarElement(midLength, curElem.getOperator()); + } + if (newStart != null) { + newElements.add(newStart); + } + if (newMid != null) { + newElements.add(newMid); + } + if (newEnd != null) { + newElements.add(newEnd); + } + } + curLength += curElem.getLength(); + } + + Vector finalNewElements = new Vector(); + CigarElement lastElement = null; + for (CigarElement elem : newElements) { + if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { + if (lastElement != null) { + finalNewElements.add(lastElement); + } + lastElement = elem; + } else { + lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); + } + } + if (lastElement != null) { + finalNewElements.add(lastElement); + } + + Cigar newCigar = new Cigar(finalNewElements); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java new file mode 100644 index 000000000..3a35244e5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java @@ -0,0 +1,11 @@ +package org.broadinstitute.sting.utils.clipreads; + +/** + * How should we represent a clipped bases in a read? + */ +public enum ClippingRepresentation { + WRITE_NS, // change the bases to Ns + WRITE_Q0S, // change the quality scores to Q0 + WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns + SOFTCLIP_BASES // change cigar string to S, but keep bases +} diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java new file mode 100644 index 000000000..977074b38 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -0,0 +1,69 @@ +package org.broadinstitute.sting.utils.clipreads; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker; + +import java.util.ArrayList; +import java.util.List; + +/** + * A simple collection of the clipping operations to apply to a read along with its read + */ +public class ReadClipper { + SAMRecord read; + List ops = null; + + /** + * We didn't do any clipping work on this read, just leave everything as a default + * + * @param read + */ + public ReadClipper(final SAMRecord read) { + this.read = read; + } + + /** + * Add another clipping operation to apply to this read + * + * @param op + */ + public void addOp(ClippingOp op) { + if (ops == null) ops = new ArrayList(); + ops.add(op); + } + + public List getOps() { + return ops; + } + + public boolean wasClipped() { + return ops != null; + } + + public SAMRecord getRead() { + return read; + } + + + /** + * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. + * + * @param algorithm + * @return + */ + public SAMRecord clipRead(ClippingRepresentation algorithm) { + if (ops == null) + return getRead(); + else { + try { + SAMRecord clippedRead = (SAMRecord) read.clone(); + for (ClippingOp op : getOps()) { + op.apply(algorithm, clippedRead); + } + return clippedRead; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); // this should never happen + } + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup2/Notes b/java/src/org/broadinstitute/sting/utils/pileup2/Notes new file mode 100644 index 000000000..0d989faad --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup2/Notes @@ -0,0 +1,34 @@ +Features needed: + +- intrinsic support for samples. Pileups are tree based data structures whose leaves are actual +pile ups of a single "sample" and whose nodes are collections of multiple sub pileups. This will +make join and split operations very cheap. + +- should be light-weight to create, and hold only minimal cached data to avoid unnecessary overhead. +Things like the number of deletions, insertions, etc shouldn't be required information. Size will +continue to be a key cached value. Could create a simple caching data structure that calculations lots of metrics about the pileup and was somehow +cached internally, via a "CachedRBP" structure. This will make it very cheap and easy to filter +pileups on the fly, costing O(N) to create the filtered context. + +- immutable + +- support for holding neighboring reads to the left and right of the pileup itself + +- unified picture for "regular" and "extended" events. ExtendedEvents are really a special +call from the engine and have nothing to do with the data itself. + +- Where should algorithms operating on the pileups go? Two options are in the interface itself, +making it very heavy-weight but easy to access, vs. in an associated PileupOps static methods, a +la Collections. + +- The Pileup2 should support in the fly filtering, so that read filters can be added at the top level +and applied at all levels of the tree. Basically a filtering pileup would just create a new +mirrored tree with filtering applied to each node. Very low overhead. + +- Sizes could be cached as needed, so that only one pass is ever needed over the size of any pileup + +- Fundamentally pileups are just collections of read-backed data. The PileupElements contain +all of the smarts -- regular, indel, fragment-based. We need to be able to create pileups containing +multiple subtype elements, which by necessity will need to declare their own static consensusType. How is it +best to do this in Java? Have a single global ENUM that enumerates all of the possible types at +compile time? Perhaps something more dynamic? \ No newline at end of file