From 2a26bb42dd88ca26024c6b3e12f1e77c5935cf41 Mon Sep 17 00:00:00 2001 From: depristo Date: Mon, 19 Oct 2009 21:54:53 +0000 Subject: [PATCH] Softclipping support in clip reads walker. Minor improvement to WalkerTest -- now can specify file extensions for tmp files. Matt -- I couldn't easily create non-presorted SAM file. The softclipper has an impact on this. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1878 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/ClipReadsWalker.java | 366 ++++++++++++++---- .../org/broadinstitute/sting/WalkerTest.java | 11 +- .../ClipReadsWalkersIntegrationTest.java | 11 +- 3 files changed, 297 insertions(+), 91 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalker.java index de3cc795d..a4f281d9d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalker.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.playground.gatk.walkers; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequence; @@ -15,10 +13,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import java.util.ArrayList; -import java.util.List; -import java.util.HashMap; -import java.util.Map; +import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.io.File; @@ -58,29 +53,31 @@ import net.sf.samtools.util.StringUtil; */ @Requires({DataSource.READS}) public class ClipReadsWalker extends ReadWalker { - /** an optional argument to dump the reads out to a BAM file */ + /** + * an optional argument to dump the reads out to a BAM file + */ @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false) - SAMFileWriter outputBam = null; + String outputBamFile = null; - @Argument(fullName = "", shortName = "STD", doc="FOR DEBUGGING ONLY", required = false) - boolean toStandardOut = false; + @Argument(fullName = "", shortName = "STD", doc = "FOR DEBUGGING ONLY", required = false) + boolean toStandardOut = false; - @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc="", required = false) + @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false) int qTrimmingThreshold = -1; - @Argument(fullName = "cyclesToTrim", shortName = "CT", doc="String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false) + @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false) String cyclesToClipArg = null; - @Argument(fullName = "clipSequencesFile", shortName = "XF", doc="Remove sequences within reads matching these sequences", required = false) + @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false) String clipSequenceFile = null; - @Argument(fullName = "clipSequence", shortName = "X", doc="Remove sequences within reads matching this sequence", required = false) + @Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false) String[] clipSequencesArgs = null; // @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false) // boolean onlyClipFirstSeqMatch = false; - @Argument(fullName = "clipRepresentation", shortName = "CR", doc="How should we actually clip the bases?", required = false) + @Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false) ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; /** @@ -97,28 +94,28 @@ public class ClipReadsWalker extends ReadWalker= 0 ) { + if (qTrimmingThreshold >= 0) { logger.info(String.format("Creating Q-score clipper with threshold %d", qTrimmingThreshold)); } // // Initialize the sequences to clip // - if ( clipSequencesArgs != null ) { + if (clipSequencesArgs != null) { int i = 0; - for ( String toClip : clipSequencesArgs ) { + for (String toClip : clipSequencesArgs) { i++; - ReferenceSequence rs = new ReferenceSequence("CMDLINE-"+i, -1, StringUtil.stringToBytes(toClip)); + ReferenceSequence rs = new ReferenceSequence("CMDLINE-" + i, -1, StringUtil.stringToBytes(toClip)); addSeqToClip(rs.getName(), rs.getBases()); } } - if ( clipSequenceFile != null ) { + if (clipSequenceFile != null) { ReferenceSequenceFile rsf = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(clipSequenceFile)); - - while ( true ) { + + while (true) { ReferenceSequence rs = rsf.nextSequence(); - if ( rs == null ) + if (rs == null) break; else { addSeqToClip(rs.getName(), rs.getBases()); @@ -127,24 +124,23 @@ public class ClipReadsWalker extends ReadWalker>(); - for ( String range : cyclesToClipArg.split(",") ) { + for (String range : cyclesToClipArg.split(",")) { try { String[] elts = range.split("-"); int start = Integer.parseInt(elts[0]) - 1; int stop = Integer.parseInt(elts[1]) - 1; - if ( start < 0 ) throw new Exception(); - if ( stop < start ) throw new Exception(); + if (start < 0) throw new Exception(); + if (stop < start) throw new Exception(); logger.info(String.format("Creating cycle clipper %d-%d", start, stop)); cyclesToClip.add(new Pair(start, stop)); - } catch ( Exception e ) { + } catch (Exception e) { throw new RuntimeException("Badly formatted cyclesToClip argument: " + cyclesToClipArg); } } @@ -153,7 +149,7 @@ public class ClipReadsWalker extends ReadWalker %b%n", bases, stc.seq, stc.revSeq, found); - if ( found ) { + if (found) { int start = match.start(); int stop = match.end() - 1; ClippingOp op = new ClippingOp(ClippingType.MATCHES_CLIP_SEQ, start, stop, stc.seq); @@ -223,9 +220,9 @@ public class ClipReadsWalker extends ReadWalker strandAwarePositions( SAMRecord read, int start, int stop ) { - if ( read.getReadNegativeStrandFlag() ) - return new Pair( read.getReadLength() - stop - 1, read.getReadLength() - start - 1 ); + private Pair strandAwarePositions(SAMRecord read, int start, int stop) { + if (read.getReadNegativeStrandFlag()) + return new Pair(read.getReadLength() - stop - 1, read.getReadLength() - start - 1); else return new Pair(start, stop); } @@ -236,20 +233,20 @@ public class ClipReadsWalker extends ReadWalker p : cyclesToClip ) { // iterate over each cycle range + for (Pair p : cyclesToClip) { // iterate over each cycle range int cycleStart = p.first; int cycleStop = p.second; - if ( cycleStart < read.getReadLength() ) { + if (cycleStart < read.getReadLength()) { // only try to clip if the cycleStart is less than the read's length - if ( cycleStop >= read.getReadLength() ) + if (cycleStop >= read.getReadLength()) // we do tolerate [for convenience) clipping when the stop is beyond the end of the read cycleStop = read.getReadLength() - 1; - Pair startStop = strandAwarePositions( read, cycleStart, cycleStop ); + Pair startStop = strandAwarePositions(read, cycleStart, cycleStop); int start = startStop.first; int stop = startStop.second; @@ -262,11 +259,11 @@ public class ClipReadsWalker extends ReadWalker * argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual) - * + *

* to the end of the read. This is blatantly stolen from BWA. - * + *

* Walk through the read from the end (in machine cycle order) to the beginning, calculating the * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the @@ -281,17 +278,17 @@ public class ClipReadsWalker extends ReadWalker= 0; i-- ) { + for (int i = readLen - 1; i >= 0; i--) { int baseIndex = read.getReadNegativeStrandFlag() ? readLen - i - 1 : i; byte qual = quals[baseIndex]; clipSum += (qTrimmingThreshold - qual); - if ( clipSum >= 0 && ( clipSum >= lastMax ) ) { + if (clipSum >= 0 && (clipSum >= lastMax)) { lastMax = clipSum; clipPoint = baseIndex; } } - if ( clipPoint != -1 ) { + if (clipPoint != -1) { int start = read.getReadNegativeStrandFlag() ? 0 : clipPoint; int stop = read.getReadNegativeStrandFlag() ? clipPoint : readLen - 1; clipper.addOp(new ClippingOp(ClippingType.LOW_Q_SCORES, start, stop, null)); @@ -301,33 +298,41 @@ public class ClipReadsWalker extends ReadWalker * Also holds the critical apply function that actually execute the clipping operation on a provided read, * according to the wishes of the supplid ClippingAlgorithm enum. */ @@ -381,14 +389,16 @@ public class ClipReadsWalker extends ReadWalker 0 && stop != clippedRead.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", + clippedRead.getReadName(), start, stop)); + + Cigar oldCigar = clippedRead.getCigar(); + + int scLeft = 0, scRight = clippedRead.getReadLength(); + if ( clippedRead.getReadNegativeStrandFlag() ) { + if ( start == 0 ) + scLeft = stop + 1; + else + scRight = start + 1; + } else { + if ( start == 0 ) + scLeft = stop; + else + scRight = start; + } + + Cigar newCigar = _softClip(oldCigar, scLeft, scRight); + clippedRead.setCigar(newCigar); + + int newClippedStart = _getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = clippedRead.getAlignmentStart() + newClippedStart; + clippedRead.setAlignmentStart(newStart); + + //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + } + + break; + //throw new RuntimeException("Softclipping of bases not yet implemented."); } } } @@ -419,12 +469,12 @@ public class ClipReadsWalker extends ReadWalker(); + public void addOp(ClippingOp op) { + if (ops == null) ops = new ArrayList(); ops.add(op); } - public List getOps() { return ops; } - public boolean wasClipped() { return ops != null; } - public SAMRecord getRead() { return read; } + public List getOps() { + return ops; + } + + public boolean wasClipped() { + return ops != null; + } + + public SAMRecord getRead() { + return read; + } /** @@ -459,12 +519,12 @@ public class ClipReadsWalker extends ReadWalker clipSeqs) { this.output = output; - for ( SeqToClip clipSeq : clipSeqs ) { + for (SeqToClip clipSeq : clipSeqs) { seqClipCounts.put(clipSeq.seq, 0L); } } - public void incNQClippedBases( int n ) { + public void incNQClippedBases(int n) { nQClippedBases += n; nClippedBases += n; } - public void incNRangeClippedBases( int n ) { + public void incNRangeClippedBases(int n) { nRangeClippedBases += n; nClippedBases += n; } - public void incSeqClippedBases( final String seq, int n ) { + public void incSeqClippedBases(final String seq, int n) { nSeqClippedBases += n; nClippedBases += n; seqClipCounts.put(seq, seqClipCounts.get(seq) + n); @@ -517,20 +577,156 @@ public class ClipReadsWalker extends ReadWalker elt : seqClipCounts.entrySet() ) { - s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey() )); + for (Map.Entry elt : seqClipCounts.entrySet()) { + s.append(String.format(" %8d clip sites matching %s%n", elt.getValue(), elt.getKey())); } s.append(Utils.dupString('-', 80) + "\n"); return s.toString(); } } + + /** + * Given a cigar string, get the number of bases hard or soft clipped at the start + */ + private int _getNewAlignmentStartOffset(final Cigar __cigar, final Cigar __oldCigar) { + int num = 0; + for (CigarElement e : __cigar.getCigarElements()) { + if (!e.getOperator().consumesReferenceBases()) { + if (e.getOperator().consumesReadBases()) { + num += e.getLength(); + } + } else { + break; + } + } + + int oldNum = 0; + int curReadCounter = 0; + + for (CigarElement e : __oldCigar.getCigarElements()) { + int curRefLength = e.getLength(); + int curReadLength = e.getLength(); + if (!e.getOperator().consumesReadBases()) { + curReadLength = 0; + } + + boolean truncated = false; + if (curReadCounter + curReadLength > num) { + curReadLength = num - curReadCounter; + curRefLength = num - curReadCounter; + truncated = true; + } + + if (!e.getOperator().consumesReferenceBases()) { + curRefLength = 0; + } + + curReadCounter += curReadLength; + oldNum += curRefLength; + + if (curReadCounter > num || truncated) { + break; + } + } + + return oldNum; + } + + /** + * Given a cigar string, soft clip up to startClipEnd and soft clip starting at endClipBegin + */ + private Cigar _softClip(final Cigar __cigar, final int __startClipEnd, final int __endClipBegin) { + if (__endClipBegin <= __startClipEnd) { + //whole thing should be soft clipped + int cigarLength = 0; + for (CigarElement e : __cigar.getCigarElements()) { + cigarLength += e.getLength(); + } + + Cigar newCigar = new Cigar(); + newCigar.add(new CigarElement(cigarLength, CigarOperator.SOFT_CLIP)); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } + + int curLength = 0; + Vector newElements = new Vector(); + for (CigarElement curElem : __cigar.getCigarElements()) { + if (!curElem.getOperator().consumesReadBases()) { + if (curLength > __startClipEnd && curLength < __endClipBegin) { + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } + continue; + } + + int s = curLength; + int e = curLength + curElem.getLength(); + if (e <= __startClipEnd || s >= __endClipBegin) { + //must turn this entire thing into a clip + newElements.add(new CigarElement(curElem.getLength(), CigarOperator.SOFT_CLIP)); + } else if (s >= __startClipEnd && e <= __endClipBegin) { + //same thing + newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); + } else { + //we are clipping in the middle of this guy + CigarElement newStart = null; + CigarElement newMid = null; + CigarElement newEnd = null; + + int midLength = curElem.getLength(); + if (s < __startClipEnd) { + newStart = new CigarElement(__startClipEnd - s, CigarOperator.SOFT_CLIP); + midLength -= newStart.getLength(); + } + + if (e > __endClipBegin) { + newEnd = new CigarElement(e - __endClipBegin, CigarOperator.SOFT_CLIP); + midLength -= newEnd.getLength(); + } + assert midLength >= 0; + if (midLength > 0) { + newMid = new CigarElement(midLength, curElem.getOperator()); + } + if (newStart != null) { + newElements.add(newStart); + } + if (newMid != null) { + newElements.add(newMid); + } + if (newEnd != null) { + newElements.add(newEnd); + } + } + curLength += curElem.getLength(); + } + + Vector finalNewElements = new Vector(); + CigarElement lastElement = null; + for (CigarElement elem : newElements) { + if (lastElement == null || lastElement.getOperator() != elem.getOperator()) { + if (lastElement != null) { + finalNewElements.add(lastElement); + } + lastElement = elem; + } else { + lastElement = new CigarElement(lastElement.getLength() + elem.getLength(), lastElement.getOperator()); + } + } + if (lastElement != null) { + finalNewElements.add(lastElement); + } + + Cigar newCigar = new Cigar(finalNewElements); + assert newCigar.isValid(null, -1) == null; + return newCigar; + } } \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/java/test/org/broadinstitute/sting/WalkerTest.java index 5fec41be9..ed1355a60 100755 --- a/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/java/test/org/broadinstitute/sting/WalkerTest.java @@ -85,12 +85,20 @@ public class WalkerTest extends BaseTest { String args = ""; int nOutputFiles = -1; List md5s = null; + List exts = null; public WalkerTestSpec(String args, int nOutputFiles, List md5s) { this.args = args; this.nOutputFiles = nOutputFiles; this.md5s = md5s; } + + public WalkerTestSpec(String args, int nOutputFiles, List exts, List md5s) { + this.args = args; + this.nOutputFiles = nOutputFiles; + this.md5s = md5s; + this.exts = exts; + } } protected boolean parameterize() { @@ -101,7 +109,8 @@ public class WalkerTest extends BaseTest { List tmpFiles = new ArrayList(); for ( int i = 0; i < spec.nOutputFiles; i++ ) { try { - File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ".tmp" ); + String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i); + File fl = File.createTempFile(String.format("walktest.tmp_param.%d", i), ext ); fl.deleteOnExit(); tmpFiles.add( fl ); } catch (IOException ex) { diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalkersIntegrationTest.java index 7e5622583..4cde739d7 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/ClipReadsWalkersIntegrationTest.java @@ -18,12 +18,14 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { "-o %s " + "-ob %s " + args, 2, // just one output file + Arrays.asList("tmp", "bam"), Arrays.asList(md51, md52)); List result = executeTest(name, spec).getFirst(); } + final static String Q10ClipOutput = "b29c5bc1cb9006ed9306d826a11d444f"; @Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "2465660bcd975a1dc6dfbf40a21bf6ad"); } - @Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } + @Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); } @Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } @Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); } @Test public void testQClip30() { testClipper("clipQSum30", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "9a4b1c83c026ca83db00bb71999246cf"); } @@ -36,8 +38,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { @Test public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF /humgen/gsa-scr1/GATK_Data/Validation_Data/seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "4a1153d6f0600cf53ff7959a043e57cc"); } - @Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", "b29c5bc1cb9006ed9306d826a11d444f", "fb77d3122df468a71e03ca92b69493f4"); } - @Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", "b29c5bc1cb9006ed9306d826a11d444f", "24053a87b00c0bc2ddf420975e9fea4d"); } - @Test (expected = Exception.class) - public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", "", ""); } + @Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", Q10ClipOutput, "fb77d3122df468a71e03ca92b69493f4"); } + @Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", Q10ClipOutput, "24053a87b00c0bc2ddf420975e9fea4d"); } + @Test public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", Q10ClipOutput, "aeb67cca75285a68af8a965faa547e7f"); } } \ No newline at end of file