Softclipping support in clip reads walker. Minor improvement to WalkerTest -- now can specify file extensions for tmp files. Matt -- I couldn't easily create non-presorted SAM file. The softclipper has an impact on this.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1878 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
055a99fb05
commit
2a26bb42dd
|
|
@ -1,8 +1,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
|
@ -15,10 +13,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
|
|||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
|
|
@ -58,29 +53,31 @@ import net.sf.samtools.util.StringUtil;
|
|||
*/
|
||||
@Requires({DataSource.READS})
|
||||
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, ClipReadsWalker.ClippingData> {
|
||||
/** an optional argument to dump the reads out to a BAM file */
|
||||
/**
|
||||
* an optional argument to dump the reads out to a BAM file
|
||||
*/
|
||||
@Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
|
||||
SAMFileWriter outputBam = null;
|
||||
String outputBamFile = null;
|
||||
|
||||
@Argument(fullName = "", shortName = "STD", doc="FOR DEBUGGING ONLY", required = false)
|
||||
boolean toStandardOut = false;
|
||||
@Argument(fullName = "", shortName = "STD", doc = "FOR DEBUGGING ONLY", required = false)
|
||||
boolean toStandardOut = false;
|
||||
|
||||
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc="", required = false)
|
||||
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
|
||||
int qTrimmingThreshold = -1;
|
||||
|
||||
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc="String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
|
||||
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
|
||||
String cyclesToClipArg = null;
|
||||
|
||||
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc="Remove sequences within reads matching these sequences", required = false)
|
||||
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
|
||||
String clipSequenceFile = null;
|
||||
|
||||
@Argument(fullName = "clipSequence", shortName = "X", doc="Remove sequences within reads matching this sequence", required = false)
|
||||
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
|
||||
String[] clipSequencesArgs = null;
|
||||
|
||||
// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
|
||||
// boolean onlyClipFirstSeqMatch = false;
|
||||
|
||||
@Argument(fullName = "clipRepresentation", shortName = "CR", doc="How should we actually clip the bases?", required = false)
|
||||
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
|
||||
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
|
||||
|
||||
/**
|
||||
|
|
@ -97,28 +94,28 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* The initialize function.
|
||||
*/
|
||||
public void initialize() {
|
||||
if ( qTrimmingThreshold >= 0 ) {
|
||||
if (qTrimmingThreshold >= 0) {
|
||||
logger.info(String.format("Creating Q-score clipper with threshold %d", qTrimmingThreshold));
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize the sequences to clip
|
||||
//
|
||||
if ( clipSequencesArgs != null ) {
|
||||
if (clipSequencesArgs != null) {
|
||||
int i = 0;
|
||||
for ( String toClip : clipSequencesArgs ) {
|
||||
for (String toClip : clipSequencesArgs) {
|
||||
i++;
|
||||
ReferenceSequence rs = new ReferenceSequence("CMDLINE-"+i, -1, StringUtil.stringToBytes(toClip));
|
||||
ReferenceSequence rs = new ReferenceSequence("CMDLINE-" + i, -1, StringUtil.stringToBytes(toClip));
|
||||
addSeqToClip(rs.getName(), rs.getBases());
|
||||
}
|
||||
}
|
||||
|
||||
if ( clipSequenceFile != null ) {
|
||||
if (clipSequenceFile != null) {
|
||||
ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(clipSequenceFile));
|
||||
|
||||
while ( true ) {
|
||||
|
||||
while (true) {
|
||||
ReferenceSequence rs = rsf.nextSequence();
|
||||
if ( rs == null )
|
||||
if (rs == null)
|
||||
break;
|
||||
else {
|
||||
addSeqToClip(rs.getName(), rs.getBases());
|
||||
|
|
@ -127,24 +124,23 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Initialize the cycle ranges to clip
|
||||
//
|
||||
if ( cyclesToClipArg != null ) {
|
||||
if (cyclesToClipArg != null) {
|
||||
cyclesToClip = new ArrayList<Pair<Integer, Integer>>();
|
||||
for ( String range : cyclesToClipArg.split(",") ) {
|
||||
for (String range : cyclesToClipArg.split(",")) {
|
||||
try {
|
||||
String[] elts = range.split("-");
|
||||
int start = Integer.parseInt(elts[0]) - 1;
|
||||
int stop = Integer.parseInt(elts[1]) - 1;
|
||||
|
||||
if ( start < 0 ) throw new Exception();
|
||||
if ( stop < start ) throw new Exception();
|
||||
if (start < 0) throw new Exception();
|
||||
if (stop < start) throw new Exception();
|
||||
|
||||
logger.info(String.format("Creating cycle clipper %d-%d", start, stop));
|
||||
cyclesToClip.add(new Pair<Integer, Integer>(start, stop));
|
||||
} catch ( Exception e ) {
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Badly formatted cyclesToClip argument: " + cyclesToClipArg);
|
||||
}
|
||||
}
|
||||
|
|
@ -153,7 +149,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
/**
|
||||
* Helper function that adds a seq with name and bases (as bytes) to the list of sequences to be clipped
|
||||
*
|
||||
*
|
||||
* @param name
|
||||
* @param bases
|
||||
*/
|
||||
|
|
@ -165,11 +161,12 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
/**
|
||||
* The reads map function.
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
*
|
||||
* @param ref the reference bases that correspond to our read, if a reference was provided
|
||||
* @param read the read itself, as a SAMRecord
|
||||
* @return the ReadClipper object describing what should be done to clip this read
|
||||
*/
|
||||
public ReadClipper map( char[] ref, SAMRecord read ) {
|
||||
public ReadClipper map(char[] ref, SAMRecord read) {
|
||||
ReadClipper clipper = new ReadClipper(read);
|
||||
|
||||
//
|
||||
|
|
@ -189,10 +186,10 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* @param clipper
|
||||
*/
|
||||
private void clipSequences(ReadClipper clipper) {
|
||||
if ( sequencesToClip != null ) { // don't bother if we don't have any sequences to clip
|
||||
if (sequencesToClip != null) { // don't bother if we don't have any sequences to clip
|
||||
SAMRecord read = clipper.getRead();
|
||||
|
||||
for ( SeqToClip stc : sequencesToClip ) {
|
||||
for (SeqToClip stc : sequencesToClip) {
|
||||
// we have a pattern for both the forward and the reverse strands
|
||||
Pattern pattern = read.getReadNegativeStrandFlag() ? stc.revPat : stc.fwdPat;
|
||||
String bases = read.getReadString();
|
||||
|
|
@ -200,10 +197,10 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
// keep clipping until match.find() says it can't find anything else
|
||||
boolean found = true; // go through at least once
|
||||
while ( found ) {
|
||||
while (found) {
|
||||
found = match.find();
|
||||
//System.out.printf("Matching %s against %s/%s => %b%n", bases, stc.seq, stc.revSeq, found);
|
||||
if ( found ) {
|
||||
if (found) {
|
||||
int start = match.start();
|
||||
int stop = match.end() - 1;
|
||||
ClippingOp op = new ClippingOp(ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq);
|
||||
|
|
@ -223,9 +220,9 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* @param stop
|
||||
* @return
|
||||
*/
|
||||
private Pair<Integer, Integer> strandAwarePositions( SAMRecord read, int start, int stop ) {
|
||||
if ( read.getReadNegativeStrandFlag() )
|
||||
return new Pair<Integer, Integer>( read.getReadLength() - stop - 1, read.getReadLength() - start - 1 );
|
||||
private Pair<Integer, Integer> strandAwarePositions(SAMRecord read, int start, int stop) {
|
||||
if (read.getReadNegativeStrandFlag())
|
||||
return new Pair<Integer, Integer>(read.getReadLength() - stop - 1, read.getReadLength() - start - 1);
|
||||
else
|
||||
return new Pair<Integer, Integer>(start, stop);
|
||||
}
|
||||
|
|
@ -236,20 +233,20 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* @param clipper
|
||||
*/
|
||||
private void clipCycles(ReadClipper clipper) {
|
||||
if ( cyclesToClip != null ) {
|
||||
if (cyclesToClip != null) {
|
||||
SAMRecord read = clipper.getRead();
|
||||
|
||||
for ( Pair<Integer, Integer> p : cyclesToClip ) { // iterate over each cycle range
|
||||
for (Pair<Integer, Integer> p : cyclesToClip) { // iterate over each cycle range
|
||||
int cycleStart = p.first;
|
||||
int cycleStop = p.second;
|
||||
|
||||
if ( cycleStart < read.getReadLength() ) {
|
||||
if (cycleStart < read.getReadLength()) {
|
||||
// only try to clip if the cycleStart is less than the read's length
|
||||
if ( cycleStop >= read.getReadLength() )
|
||||
if (cycleStop >= read.getReadLength())
|
||||
// we do tolerate (for convenience) clipping when the stop is beyond the end of the read
|
||||
cycleStop = read.getReadLength() - 1;
|
||||
|
||||
Pair<Integer, Integer> startStop = strandAwarePositions( read, cycleStart, cycleStop );
|
||||
Pair<Integer, Integer> startStop = strandAwarePositions(read, cycleStart, cycleStop);
|
||||
int start = startStop.first;
|
||||
int stop = startStop.second;
|
||||
|
||||
|
|
@ -262,11 +259,11 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
/**
|
||||
* Clip bases from the read in clipper from
|
||||
*
|
||||
* <p/>
|
||||
* argmax_x{ \sum_{i = x + 1}^{l} (qTrimmingThreshold - qual_i) }
|
||||
*
|
||||
* <p/>
|
||||
* to the end of the read. This is blatantly stolen from BWA.
|
||||
*
|
||||
* <p/>
|
||||
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
|
||||
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
|
||||
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
|
||||
|
|
@ -281,17 +278,17 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
|
||||
int clipSum = 0, lastMax = -1, clipPoint = -1; // -1 means no clip
|
||||
for ( int i = readLen - 1; i >= 0; i-- ) {
|
||||
for (int i = readLen - 1; i >= 0; i--) {
|
||||
int baseIndex = read.getReadNegativeStrandFlag() ? readLen - i - 1 : i;
|
||||
byte qual = quals[baseIndex];
|
||||
clipSum += (qTrimmingThreshold - qual);
|
||||
if ( clipSum >= 0 && ( clipSum >= lastMax ) ) {
|
||||
if (clipSum >= 0 && (clipSum >= lastMax)) {
|
||||
lastMax = clipSum;
|
||||
clipPoint = baseIndex;
|
||||
}
|
||||
}
|
||||
|
||||
if ( clipPoint != -1 ) {
|
||||
if (clipPoint != -1) {
|
||||
int start = read.getReadNegativeStrandFlag() ? 0 : clipPoint;
|
||||
int stop = read.getReadNegativeStrandFlag() ? clipPoint : readLen - 1;
|
||||
clipper.addOp(new ClippingOp(ClippingType.LOW_Q_SCORES, start, stop, null));
|
||||
|
|
@ -301,33 +298,41 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
/**
|
||||
* reduceInit is called once before any calls to the map function. We use it here to setup the output
|
||||
* bam file, if it was specified on the command line
|
||||
* @return
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public ClippingData reduceInit() {
|
||||
// stays null unless a BAM output file is opened below; reduce() and onTraversalDone() null-check data.output
SAMFileWriter outputBam = null;
|
||||
|
||||
// only open a writer when an output file was named and the STD debugging flag is off
if ( outputBamFile != null && ! toStandardOut ) {
|
||||
SAMFileHeader header = this.getToolkit().getSAMFileHeader();
|
||||
// NOTE(review): the `false` and `5` arguments are presumably "presorted" and the compression level -- confirm against Utils
outputBam = Utils.createSAMFileWriterWithCompression(header, false, outputBamFile, 5);
|
||||
}
|
||||
|
||||
// the accumulator carries the (possibly null) writer plus the sequences whose clip counts are tracked
return new ClippingData(outputBam, sequencesToClip);
|
||||
}
|
||||
|
||||
public ClippingData reduce( ReadClipper clipper, ClippingData data ) {
|
||||
public ClippingData reduce(ReadClipper clipper, ClippingData data) {
|
||||
if (data.output != null) {
|
||||
data.output.addAlignment(clipper.clipRead(clippingRepresentation));
|
||||
} else if ( toStandardOut ) {
|
||||
} else if (toStandardOut) {
|
||||
out.println(clipper.clipRead(clippingRepresentation).format());
|
||||
}
|
||||
|
||||
data.nTotalReads++;
|
||||
data.nTotalBases += clipper.getRead().getReadLength();
|
||||
if ( clipper.wasClipped() ) {
|
||||
if (clipper.wasClipped()) {
|
||||
data.nClippedReads++;
|
||||
for ( ClippingOp op : clipper.getOps() ) {
|
||||
switch ( op.type ) {
|
||||
for (ClippingOp op : clipper.getOps()) {
|
||||
switch (op.type) {
|
||||
case LOW_Q_SCORES:
|
||||
data.incNQClippedBases( op.getLength() );
|
||||
data.incNQClippedBases(op.getLength());
|
||||
break;
|
||||
case WITHIN_CLIP_RANGE:
|
||||
data.incNRangeClippedBases( op.getLength() );
|
||||
data.incNRangeClippedBases(op.getLength());
|
||||
break;
|
||||
case MATCHES_CLIP_SEQ:
|
||||
data.incSeqClippedBases( (String)op.extraInfo, op.getLength() );
|
||||
data.incSeqClippedBases((String) op.extraInfo, op.getLength());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -336,7 +341,10 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
return data;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ClippingData data ) {
|
||||
public void onTraversalDone(ClippingData data) {
|
||||
if (data.output != null)
|
||||
data.output.close();
|
||||
|
||||
out.printf(data.toString());
|
||||
}
|
||||
|
||||
|
|
@ -372,7 +380,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
/**
|
||||
* Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases
|
||||
* of the read, plus an option extraInfo (useful for carrying info where needed).
|
||||
*
|
||||
* <p/>
|
||||
* Also holds the critical apply function that actually executes the clipping operation on a provided read,
|
||||
* according to the wishes of the supplied ClippingRepresentation enum.
|
||||
*/
|
||||
|
|
@ -381,14 +389,16 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
public int start, stop; // inclusive
|
||||
public Object extraInfo = null;
|
||||
|
||||
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo ) {
|
||||
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
|
||||
this.type = type;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.extraInfo = extraInfo;
|
||||
}
|
||||
|
||||
public int getLength() { return stop - start + 1; }
|
||||
public int getLength() {
|
||||
return stop - start + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping
|
||||
|
|
@ -398,17 +408,57 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* @param clippedRead
|
||||
*/
|
||||
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
||||
switch ( algorithm ) {
|
||||
//clippedRead.setReferenceIndex(1);
|
||||
switch (algorithm) {
|
||||
case WRITE_NS:
|
||||
for ( int i = start; i <= stop; i++ )
|
||||
for (int i = start; i <= stop; i++)
|
||||
clippedRead.getReadBases()[i] = 'N';
|
||||
break;
|
||||
case WRITE_Q0S:
|
||||
for ( int i = start; i <= stop; i++ )
|
||||
for (int i = start; i <= stop; i++)
|
||||
clippedRead.getBaseQualities()[i] = 0;
|
||||
break;
|
||||
case WRITE_NS_Q0S:
|
||||
for (int i = start; i <= stop; i++) {
|
||||
clippedRead.getReadBases()[i] = 'N';
|
||||
clippedRead.getBaseQualities()[i] = 0;
|
||||
}
|
||||
break;
|
||||
case SOFTCLIP_BASES:
|
||||
throw new RuntimeException("Softclipping of bases not yet implemented.");
|
||||
if ( ! clippedRead.getReadUnmappedFlag() ) {
|
||||
// we can't process unmapped reads
|
||||
|
||||
if ( start > 0 && stop != clippedRead.getReadLength() - 1 )
|
||||
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
|
||||
clippedRead.getReadName(), start, stop));
|
||||
|
||||
Cigar oldCigar = clippedRead.getCigar();
|
||||
|
||||
int scLeft = 0, scRight = clippedRead.getReadLength();
|
||||
if ( clippedRead.getReadNegativeStrandFlag() ) {
|
||||
if ( start == 0 )
|
||||
scLeft = stop + 1;
|
||||
else
|
||||
scRight = start + 1;
|
||||
} else {
|
||||
if ( start == 0 )
|
||||
scLeft = stop;
|
||||
else
|
||||
scRight = start;
|
||||
}
|
||||
|
||||
Cigar newCigar = _softClip(oldCigar, scLeft, scRight);
|
||||
clippedRead.setCigar(newCigar);
|
||||
|
||||
int newClippedStart = _getNewAlignmentStartOffset(newCigar, oldCigar);
|
||||
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
|
||||
clippedRead.setAlignmentStart(newStart);
|
||||
|
||||
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
|
||||
}
|
||||
|
||||
break;
|
||||
//throw new RuntimeException("Softclipping of bases not yet implemented.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -419,12 +469,12 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
private enum ClippingRepresentation {
|
||||
WRITE_NS, // change the bases to Ns
|
||||
WRITE_Q0S, // change the quality scores to Q0
|
||||
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
||||
SOFTCLIP_BASES // change cigar string to S, but keep bases
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple collection of the clipping operations to apply to a read along with its read
|
||||
*
|
||||
*/
|
||||
public class ReadClipper {
|
||||
SAMRecord read;
|
||||
|
|
@ -432,6 +482,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
/**
|
||||
* We didn't do any clipping work on this read, just leave everything as a default
|
||||
*
|
||||
* @param read
|
||||
*/
|
||||
public ReadClipper(final SAMRecord read) {
|
||||
|
|
@ -440,16 +491,25 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
/**
|
||||
* Add another clipping operation to apply to this read
|
||||
*
|
||||
* @param op
|
||||
*/
|
||||
public void addOp( ClippingOp op ) {
|
||||
if ( ops == null ) ops = new ArrayList<ClippingOp>();
|
||||
public void addOp(ClippingOp op) {
|
||||
if (ops == null) ops = new ArrayList<ClippingOp>();
|
||||
ops.add(op);
|
||||
}
|
||||
|
||||
public List<ClippingOp> getOps() { return ops; }
|
||||
public boolean wasClipped() { return ops != null; }
|
||||
public SAMRecord getRead() { return read; }
|
||||
public List<ClippingOp> getOps() {
|
||||
return ops;
|
||||
}
|
||||
|
||||
public boolean wasClipped() {
|
||||
return ops != null;
|
||||
}
|
||||
|
||||
public SAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -459,12 +519,12 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
* @return
|
||||
*/
|
||||
public SAMRecord clipRead(ClippingRepresentation algorithm) {
|
||||
if ( ops == null )
|
||||
if (ops == null)
|
||||
return getRead();
|
||||
else {
|
||||
try {
|
||||
SAMRecord clippedRead = (SAMRecord)read.clone();
|
||||
for ( ClippingOp op : getOps() ) {
|
||||
SAMRecord clippedRead = (SAMRecord) read.clone();
|
||||
for (ClippingOp op : getOps()) {
|
||||
op.apply(algorithm, clippedRead);
|
||||
}
|
||||
return clippedRead;
|
||||
|
|
@ -490,22 +550,22 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
|
||||
public ClippingData(SAMFileWriter output, List<SeqToClip> clipSeqs) {
|
||||
this.output = output;
|
||||
for ( SeqToClip clipSeq : clipSeqs ) {
|
||||
for (SeqToClip clipSeq : clipSeqs) {
|
||||
seqClipCounts.put(clipSeq.seq, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
public void incNQClippedBases( int n ) {
|
||||
public void incNQClippedBases(int n) {
|
||||
nQClippedBases += n;
|
||||
nClippedBases += n;
|
||||
}
|
||||
|
||||
public void incNRangeClippedBases( int n ) {
|
||||
public void incNRangeClippedBases(int n) {
|
||||
nRangeClippedBases += n;
|
||||
nClippedBases += n;
|
||||
}
|
||||
|
||||
public void incSeqClippedBases( final String seq, int n ) {
|
||||
public void incSeqClippedBases(final String seq, int n) {
|
||||
nSeqClippedBases += n;
|
||||
nClippedBases += n;
|
||||
seqClipCounts.put(seq, seqClipCounts.get(seq) + n);
|
||||
|
|
@ -517,20 +577,156 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
|
|||
s.append(Utils.dupString('-', 80) + "\n");
|
||||
s.append(String.format("Number of examined reads %d%n", nTotalReads));
|
||||
s.append(String.format("Number of clipped reads %d%n", nClippedReads));
|
||||
s.append(String.format("Percent of clipped reads %.2f%n", (100.0*nClippedReads) / nTotalReads));
|
||||
s.append(String.format("Percent of clipped reads %.2f%n", (100.0 * nClippedReads) / nTotalReads));
|
||||
s.append(String.format("Number of examined bases %d%n", nTotalBases));
|
||||
s.append(String.format("Number of clipped bases %d%n", nClippedBases));
|
||||
s.append(String.format("Percent of clipped bases %.2f%n", (100.0*nClippedBases) / nTotalBases));
|
||||
s.append(String.format("Percent of clipped bases %.2f%n", (100.0 * nClippedBases) / nTotalBases));
|
||||
s.append(String.format("Number of quality-score clipped bases %d%n", nQClippedBases));
|
||||
s.append(String.format("Number of range clipped bases %d%n", nRangeClippedBases));
|
||||
s.append(String.format("Number of sequence clipped bases %d%n", nSeqClippedBases));
|
||||
|
||||
for ( Map.Entry<String, Long> elt : seqClipCounts.entrySet() ) {
|
||||
s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey() ));
|
||||
for (Map.Entry<String, Long> elt : seqClipCounts.entrySet()) {
|
||||
s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey()));
|
||||
}
|
||||
|
||||
s.append(Utils.dupString('-', 80) + "\n");
|
||||
return s.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes how far the alignment start must move after clipping. It first counts the read bases at the head of the new cigar that consume no reference (e.g. a leading soft clip), then returns the number of reference bases the old cigar consumed over that many leading read bases.
|
||||
* @param __cigar the cigar after clipping
|
||||
* @param __oldCigar the cigar before clipping
|
||||
* @return the number of reference bases to add to the old alignment start
|
||||
*/
|
||||
private int _getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
|
||||
// num = read bases at the head of the new cigar that precede its first reference-consuming element
int num = 0;
|
||||
for (CigarElement e : __cigar.getCigarElements()) {
|
||||
if (!e.getOperator().consumesReferenceBases()) {
|
||||
// elements that consume neither read nor reference bases contribute nothing
if (e.getOperator().consumesReadBases()) {
|
||||
num += e.getLength();
|
||||
}
|
||||
} else {
|
||||
// first element that touches the reference ends the leading clipped region
break;
|
||||
}
|
||||
}
|
||||
|
||||
// oldNum = reference bases covered by the old cigar; curReadCounter = read bases walked so far in the old cigar
int oldNum = 0;
|
||||
int curReadCounter = 0;
|
||||
|
||||
for (CigarElement e : __oldCigar.getCigarElements()) {
|
||||
// start by assuming the element consumes both read and reference, then zero out whichever it does not consume
int curRefLength = e.getLength();
|
||||
int curReadLength = e.getLength();
|
||||
if (!e.getOperator().consumesReadBases()) {
|
||||
curReadLength = 0;
|
||||
}
|
||||
|
||||
// if this element would run past the num-th read base, count only the part up to it and stop afterwards
boolean truncated = false;
|
||||
if (curReadCounter + curReadLength > num) {
|
||||
curReadLength = num - curReadCounter;
|
||||
curRefLength = num - curReadCounter;
|
||||
truncated = true;
|
||||
}
|
||||
|
||||
if (!e.getOperator().consumesReferenceBases()) {
|
||||
curRefLength = 0;
|
||||
}
|
||||
|
||||
// note: an element that consumes reference but no read bases is never truncated above, so it is added in full
curReadCounter += curReadLength;
|
||||
oldNum += curRefLength;
|
||||
|
||||
if (curReadCounter > num || truncated) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return oldNum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar, builds a new cigar in which read bases [0, startClipEnd) and [endClipBegin, end of read) are soft clipped; positions are 0-based offsets in read bases. Adjacent elements with the same operator are merged in the result.
|
||||
* @param __cigar the cigar to clip; it is only read, not modified
|
||||
* @param __startClipEnd exclusive end of the leading clip (0 means no leading clip)
|
||||
* @param __endClipBegin first read base of the trailing clip
|
||||
* @return the newly built cigar
|
||||
*/
|
||||
private Cigar _softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) {
|
||||
if (__endClipBegin <= __startClipEnd) {
|
||||
// the two clipped regions meet or overlap, so the whole read is soft clipped
|
||||
// NOTE(review): this sums the length of every element, including ones that consume no read bases -- confirm that is intended
int cigarLength = 0;
|
||||
for (CigarElement e : __cigar.getCigarElements()) {
|
||||
cigarLength += e.getLength();
|
||||
}
|
||||
|
||||
Cigar newCigar = new Cigar();
|
||||
newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP));
|
||||
// only evaluated when assertions are enabled
assert newCigar.isValid(null, -1) == null;
|
||||
return newCigar;
|
||||
}
|
||||
|
||||
// curLength = number of read bases consumed by the elements processed so far
int curLength = 0;
|
||||
Vector<CigarElement> newElements = new Vector<CigarElement>();
|
||||
for (CigarElement curElem : __cigar.getCigarElements()) {
|
||||
if (!curElem.getOperator().consumesReadBases()) {
|
||||
// elements that consume no read bases are kept only when strictly inside the unclipped window; otherwise dropped
if (curLength > __startClipEnd && curLength < __endClipBegin) {
|
||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// [s, e) is the span of read bases covered by this element
int s = curLength;
|
||||
int e = curLength + curElem.getLength();
|
||||
if (e <= __startClipEnd || s >= __endClipBegin) {
|
||||
// element lies entirely inside a clipped region: replace it with a soft clip of the same length
|
||||
newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP));
|
||||
} else if (s >= __startClipEnd && e <= __endClipBegin) {
|
||||
// element lies entirely inside the unclipped window: copy it unchanged
|
||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||
} else {
|
||||
// element straddles a clip boundary: split it into up to three pieces (clip / original operator / clip)
|
||||
CigarElement newStart = null;
|
||||
CigarElement newMid = null;
|
||||
CigarElement newEnd = null;
|
||||
|
||||
int midLength = curElem.getLength();
|
||||
if (s < __startClipEnd) {
|
||||
// leading part of the element falls in the start clip
newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP);
|
||||
midLength -= newStart.getLength();
|
||||
}
|
||||
|
||||
if (e > __endClipBegin) {
|
||||
// trailing part of the element falls in the end clip
newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP);
|
||||
midLength -= newEnd.getLength();
|
||||
}
|
||||
assert midLength >= 0;
|
||||
if (midLength > 0) {
|
||||
// whatever remains keeps the element's original operator
newMid = new CigarElement(midLength, curElem.getOperator());
|
||||
}
|
||||
if (newStart != null) {
|
||||
newElements.add(newStart);
|
||||
}
|
||||
if (newMid != null) {
|
||||
newElements.add(newMid);
|
||||
}
|
||||
if (newEnd != null) {
|
||||
newElements.add(newEnd);
|
||||
}
|
||||
}
|
||||
curLength += curElem.getLength();
|
||||
}
|
||||
|
||||
// merge runs of consecutive elements that share an operator into a single element
Vector<CigarElement> finalNewElements = new Vector<CigarElement>();
|
||||
CigarElement lastElement = null;
|
||||
for (CigarElement elem : newElements) {
|
||||
if (lastElement == null || lastElement.getOperator() != elem.getOperator()) {
|
||||
// operator changed: flush the pending element and start a new run
if (lastElement != null) {
|
||||
finalNewElements.add(lastElement);
|
||||
}
|
||||
lastElement = elem;
|
||||
} else {
|
||||
// same operator as the pending element: extend the run
lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator());
|
||||
}
|
||||
}
|
||||
// flush the final run, if any
if (lastElement != null) {
|
||||
finalNewElements.add(lastElement);
|
||||
}
|
||||
|
||||
Cigar newCigar = new Cigar(finalNewElements);
|
||||
// only evaluated when assertions are enabled
assert newCigar.isValid(null, -1) == null;
|
||||
return newCigar;
|
||||
}
|
||||
}
|
||||
|
|
@ -85,12 +85,20 @@ public class WalkerTest extends BaseTest {
|
|||
String args = "";
|
||||
int nOutputFiles = -1;
|
||||
List<String> md5s = null;
|
||||
List<String> exts = null;
|
||||
|
||||
// Builds a spec that leaves exts null, so temporary output files get the default ".tmp" extension.
// args: walker argument string; nOutputFiles: number of temporary output files to create;
// md5s: expected MD5 strings (presumably one per output file -- confirm against the test runner).
public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s) {
|
||||
this.args = args;
|
||||
this.nOutputFiles = nOutputFiles;
|
||||
this.md5s = md5s;
|
||||
}
|
||||
|
||||
// Builds a spec with explicit extensions for the temporary output files.
// exts: looked up by output-file index and prefixed with "." when the temp file is created, so give entries
// without a leading dot (e.g. "bam") and supply at least nOutputFiles of them.
// md5s: expected MD5 strings (presumably one per output file -- confirm against the test runner).
public WalkerTestSpec(String args, int nOutputFiles, List<String> exts, List<String> md5s) {
|
||||
this.args = args;
|
||||
this.nOutputFiles = nOutputFiles;
|
||||
this.md5s = md5s;
|
||||
this.exts = exts;
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean parameterize() {
|
||||
|
|
@ -101,7 +109,8 @@ public class WalkerTest extends BaseTest {
|
|||
List<File> tmpFiles = new ArrayList<File>();
|
||||
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
|
||||
try {
|
||||
File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ".tmp" );
|
||||
String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i);
|
||||
File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ext );
|
||||
fl.deleteOnExit();
|
||||
tmpFiles.add( fl );
|
||||
} catch (IOException ex) {
|
||||
|
|
|
|||
|
|
@ -18,12 +18,14 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
|
|||
"-o %s " +
|
||||
"-ob %s " + args,
|
||||
2, // two output files: the -o output and the -ob BAM
|
||||
Arrays.asList("tmp", "bam"),
|
||||
Arrays.asList(md51, md52));
|
||||
List<File> result = executeTest(name, spec).getFirst();
|
||||
}
|
||||
|
||||
final static String Q10ClipOutput = "b29c5bc1cb9006ed9306d826a11d444f";
|
||||
@Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "2465660bcd975a1dc6dfbf40a21bf6ad"); }
|
||||
@Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); }
|
||||
@Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); }
|
||||
@Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); }
|
||||
@Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); }
|
||||
// NOTE(review): named "30" but passes "-QT 20" with the same expected MD5s as testQClip20 -- confirm whether "-QT 30" was intended.
@Test public void testQClip30() { testClipper("clipQSum30", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); }
|
||||
|
|
@ -36,8 +38,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF /humgen/gsa-scr1/GATK_Data/Validation_Data/seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "4a1153d6f0600cf53ff7959a043e57cc"); }
|
||||
|
||||
@Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); }
|
||||
@Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", "b29c5bc1cb9006ed9306d826a11d444f", "24053a87b00c0bc2ddf420975e9fea4d"); }
|
||||
@Test (expected = Exception.class)
|
||||
public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", "", ""); }
|
||||
@Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); }
|
||||
@Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", Q10ClipOutput, "24053a87b00c0bc2ddf420975e9fea4d"); }
|
||||
@Test public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", Q10ClipOutput, "aeb67cca75285a68af8a965faa547e7f"); }
|
||||
}
|
||||
Loading…
Reference in New Issue