Softclipping support in clip reads walker. Minor improvement to WalkerTest -- now can specify file extensions for tmp files. Matt -- I couldn't easily create non-presorted SAM file. The softclipper has an impact on this.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1878 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-10-19 21:54:53 +00:00
parent 055a99fb05
commit 2a26bb42dd
3 changed files with 297 additions and 91 deletions

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers; package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMFileWriter; import net.sf.samtools.*;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequence;
@ -15,10 +13,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import java.util.ArrayList; import java.util.*;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.io.File; import java.io.File;
@ -58,9 +53,11 @@ import net.sf.samtools.util.StringUtil;
*/ */
@Requires({DataSource.READS}) @Requires({DataSource.READS})
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, ClipReadsWalker.ClippingData> { public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, ClipReadsWalker.ClippingData> {
/** an optional argument to dump the reads out to a BAM file */ /**
* an optional argument to dump the reads out to a BAM file
*/
@Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false) @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
SAMFileWriter outputBam = null; String outputBamFile = null;
@Argument(fullName = "", shortName = "STD", doc = "FOR DEBUGGING ONLY", required = false) @Argument(fullName = "", shortName = "STD", doc = "FOR DEBUGGING ONLY", required = false)
boolean toStandardOut = false; boolean toStandardOut = false;
@ -127,7 +124,6 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
} }
// //
// Initialize the cycle ranges to clip // Initialize the cycle ranges to clip
// //
@ -165,6 +161,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* The reads map function. * The reads map function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided * @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a SAMRecord * @param read the read itself, as a SAMRecord
* @return the ReadClipper object describing what should be done to clip this read * @return the ReadClipper object describing what should be done to clip this read
@ -262,11 +259,11 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* Clip bases from the read in clipper from * Clip bases from the read in clipper from
* * <p/>
* argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual) * argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
* * <p/>
* to the end of the read. This is blatantly stolen from BWA. * to the end of the read. This is blatantly stolen from BWA.
* * <p/>
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the * Walk through the read from the end (in machine cycle order) to the beginning, calculating the
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
@ -301,9 +298,17 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* reduceInit is called once before any calls to the map function. We use it here to setup the output * reduceInit is called once before any calls to the map function. We use it here to setup the output
* bam file, if it was specified on the command line * bam file, if it was specified on the command line
*
* @return * @return
*/ */
public ClippingData reduceInit() { public ClippingData reduceInit() {
SAMFileWriter outputBam = null;
if ( outputBamFile != null && ! toStandardOut ) {
SAMFileHeader header = this.getToolkit().getSAMFileHeader();
outputBam = Utils.createSAMFileWriterWithCompression(header, false, outputBamFile, 5);
}
return new ClippingData(outputBam, sequencesToClip); return new ClippingData(outputBam, sequencesToClip);
} }
@ -337,6 +342,9 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
} }
public void onTraversalDone(ClippingData data) { public void onTraversalDone(ClippingData data) {
if (data.output != null)
data.output.close();
out.printf(data.toString()); out.printf(data.toString());
} }
@ -372,7 +380,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases * Represents a clip on a read. It has a type (see the enum) along with a start and stop in the bases
* of the read, plus an option extraInfo (useful for carrying info where needed). * of the read, plus an option extraInfo (useful for carrying info where needed).
* * <p/>
* Also holds the critical apply function that actually execute the clipping operation on a provided read, * Also holds the critical apply function that actually execute the clipping operation on a provided read,
* according to the wishes of the supplid ClippingAlgorithm enum. * according to the wishes of the supplid ClippingAlgorithm enum.
*/ */
@ -388,7 +396,9 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
this.extraInfo = extraInfo; this.extraInfo = extraInfo;
} }
public int getLength() { return stop - start + 1; } public int getLength() {
return stop - start + 1;
}
/** /**
* Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping * Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping
@ -398,6 +408,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
* @param clippedRead * @param clippedRead
*/ */
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) { public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
//clippedRead.setReferenceIndex(1);
switch (algorithm) { switch (algorithm) {
case WRITE_NS: case WRITE_NS:
for (int i = start; i <= stop; i++) for (int i = start; i <= stop; i++)
@ -407,8 +418,47 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
for (int i = start; i <= stop; i++) for (int i = start; i <= stop; i++)
clippedRead.getBaseQualities()[i] = 0; clippedRead.getBaseQualities()[i] = 0;
break; break;
case WRITE_NS_Q0S:
for (int i = start; i <= stop; i++) {
clippedRead.getReadBases()[i] = 'N';
clippedRead.getBaseQualities()[i] = 0;
}
break;
case SOFTCLIP_BASES: case SOFTCLIP_BASES:
throw new RuntimeException("Softclipping of bases not yet implemented."); if ( ! clippedRead.getReadUnmappedFlag() ) {
// we can't process unmapped reads
if ( start > 0 && stop != clippedRead.getReadLength() - 1 )
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
clippedRead.getReadName(), start, stop));
Cigar oldCigar = clippedRead.getCigar();
int scLeft = 0, scRight = clippedRead.getReadLength();
if ( clippedRead.getReadNegativeStrandFlag() ) {
if ( start == 0 )
scLeft = stop + 1;
else
scRight = start + 1;
} else {
if ( start == 0 )
scLeft = stop;
else
scRight = start;
}
Cigar newCigar = _softClip(oldCigar, scLeft, scRight);
clippedRead.setCigar(newCigar);
int newClippedStart = _getNewAlignmentStartOffset(newCigar, oldCigar);
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
clippedRead.setAlignmentStart(newStart);
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
}
break;
//throw new RuntimeException("Softclipping of bases not yet implemented.");
} }
} }
} }
@ -419,12 +469,12 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
private enum ClippingRepresentation { private enum ClippingRepresentation {
WRITE_NS, // change the bases to Ns WRITE_NS, // change the bases to Ns
WRITE_Q0S, // change the quality scores to Q0 WRITE_Q0S, // change the quality scores to Q0
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
SOFTCLIP_BASES // change cigar string to S, but keep bases SOFTCLIP_BASES // change cigar string to S, but keep bases
} }
/** /**
* A simple collection of the clipping operations to apply to a read along with its read * A simple collection of the clipping operations to apply to a read along with its read
*
*/ */
public class ReadClipper { public class ReadClipper {
SAMRecord read; SAMRecord read;
@ -432,6 +482,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* We didn't do any clipping work on this read, just leave everything as a default * We didn't do any clipping work on this read, just leave everything as a default
*
* @param read * @param read
*/ */
public ReadClipper(final SAMRecord read) { public ReadClipper(final SAMRecord read) {
@ -440,6 +491,7 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
/** /**
* Add another clipping operation to apply to this read * Add another clipping operation to apply to this read
*
* @param op * @param op
*/ */
public void addOp(ClippingOp op) { public void addOp(ClippingOp op) {
@ -447,9 +499,17 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
ops.add(op); ops.add(op);
} }
public List<ClippingOp> getOps() { return ops; } public List<ClippingOp> getOps() {
public boolean wasClipped() { return ops != null; } return ops;
public SAMRecord getRead() { return read; } }
public boolean wasClipped() {
return ops != null;
}
public SAMRecord getRead() {
return read;
}
/** /**
@ -533,4 +593,140 @@ public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipper, Cli
return s.toString(); return s.toString();
} }
} }
/**
* Given a cigar string, get the number of bases hard or soft clipped at the start
*/
private int _getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) {
int num = 0;
for (CigarElement e : __cigar.getCigarElements()) {
if (!e.getOperator().consumesReferenceBases()) {
if (e.getOperator().consumesReadBases()) {
num += e.getLength();
}
} else {
break;
}
}
int oldNum = 0;
int curReadCounter = 0;
for (CigarElement e : __oldCigar.getCigarElements()) {
int curRefLength = e.getLength();
int curReadLength = e.getLength();
if (!e.getOperator().consumesReadBases()) {
curReadLength = 0;
}
boolean truncated = false;
if (curReadCounter + curReadLength > num) {
curReadLength = num - curReadCounter;
curRefLength = num - curReadCounter;
truncated = true;
}
if (!e.getOperator().consumesReferenceBases()) {
curRefLength = 0;
}
curReadCounter += curReadLength;
oldNum += curRefLength;
if (curReadCounter > num || truncated) {
break;
}
}
return oldNum;
}
/**
* Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin
*/
private Cigar _softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) {
if (__endClipBegin <= __startClipEnd) {
//whole thing should be soft clipped
int cigarLength = 0;
for (CigarElement e : __cigar.getCigarElements()) {
cigarLength += e.getLength();
}
Cigar newCigar = new Cigar();
newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP));
assert newCigar.isValid(null, -1) == null;
return newCigar;
}
int curLength = 0;
Vector<CigarElement> newElements = new Vector<CigarElement>();
for (CigarElement curElem : __cigar.getCigarElements()) {
if (!curElem.getOperator().consumesReadBases()) {
if (curLength > __startClipEnd && curLength < __endClipBegin) {
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
}
continue;
}
int s = curLength;
int e = curLength + curElem.getLength();
if (e <= __startClipEnd || s >= __endClipBegin) {
//must turn this entire thing into a clip
newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP));
} else if (s >= __startClipEnd && e <= __endClipBegin) {
//same thing
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
} else {
//we are clipping in the middle of this guy
CigarElement newStart = null;
CigarElement newMid = null;
CigarElement newEnd = null;
int midLength = curElem.getLength();
if (s < __startClipEnd) {
newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP);
midLength -= newStart.getLength();
}
if (e > __endClipBegin) {
newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP);
midLength -= newEnd.getLength();
}
assert midLength >= 0;
if (midLength > 0) {
newMid = new CigarElement(midLength, curElem.getOperator());
}
if (newStart != null) {
newElements.add(newStart);
}
if (newMid != null) {
newElements.add(newMid);
}
if (newEnd != null) {
newElements.add(newEnd);
}
}
curLength += curElem.getLength();
}
Vector<CigarElement> finalNewElements = new Vector<CigarElement>();
CigarElement lastElement = null;
for (CigarElement elem : newElements) {
if (lastElement == null || lastElement.getOperator() != elem.getOperator()) {
if (lastElement != null) {
finalNewElements.add(lastElement);
}
lastElement = elem;
} else {
lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator());
}
}
if (lastElement != null) {
finalNewElements.add(lastElement);
}
Cigar newCigar = new Cigar(finalNewElements);
assert newCigar.isValid(null, -1) == null;
return newCigar;
}
} }

View File

@ -85,12 +85,20 @@ public class WalkerTest extends BaseTest {
String args = ""; String args = "";
int nOutputFiles = -1; int nOutputFiles = -1;
List<String> md5s = null; List<String> md5s = null;
List<String> exts = null;
public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s) { public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s) {
this.args = args; this.args = args;
this.nOutputFiles = nOutputFiles; this.nOutputFiles = nOutputFiles;
this.md5s = md5s; this.md5s = md5s;
} }
public WalkerTestSpec(String args, int nOutputFiles, List<String> exts, List<String> md5s) {
this.args = args;
this.nOutputFiles = nOutputFiles;
this.md5s = md5s;
this.exts = exts;
}
} }
protected boolean parameterize() { protected boolean parameterize() {
@ -101,7 +109,8 @@ public class WalkerTest extends BaseTest {
List<File> tmpFiles = new ArrayList<File>(); List<File> tmpFiles = new ArrayList<File>();
for ( int i = 0; i < spec.nOutputFiles; i++ ) { for ( int i = 0; i < spec.nOutputFiles; i++ ) {
try { try {
File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ".tmp" ); String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i);
File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ext );
fl.deleteOnExit(); fl.deleteOnExit();
tmpFiles.add( fl ); tmpFiles.add( fl );
} catch (IOException ex) { } catch (IOException ex) {

View File

@ -18,12 +18,14 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
"-o %s " + "-o %s " +
"-ob %s " + args, "-ob %s " + args,
2, // just one output file 2, // just one output file
Arrays.asList("tmp", "bam"),
Arrays.asList(md51, md52)); Arrays.asList(md51, md52));
List<File> result = executeTest(name, spec).getFirst(); List<File> result = executeTest(name, spec).getFirst();
} }
final static String Q10ClipOutput = "b29c5bc1cb9006ed9306d826a11d444f";
@Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "2465660bcd975a1dc6dfbf40a21bf6ad"); } @Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "2465660bcd975a1dc6dfbf40a21bf6ad"); }
@Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } @Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); }
@Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } @Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); }
@Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); } @Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); }
@Test public void testQClip30() { testClipper("clipQSum30", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); } @Test public void testQClip30() { testClipper("clipQSum30", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); }
@ -36,8 +38,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
@Test public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF /humgen/gsa-scr1/GATK_Data/Validation_Data/seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "4a1153d6f0600cf53ff7959a043e57cc"); } @Test public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF /humgen/gsa-scr1/GATK_Data/Validation_Data/seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "4a1153d6f0600cf53ff7959a043e57cc"); }
@Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } @Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); }
@Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", "b29c5bc1cb9006ed9306d826a11d444f", "24053a87b00c0bc2ddf420975e9fea4d"); } @Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", Q10ClipOutput, "24053a87b00c0bc2ddf420975e9fea4d"); }
@Test (expected = Exception.class) @Test public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", Q10ClipOutput, "aeb67cca75285a68af8a965faa547e7f"); }
public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", "", ""); }
} }