Merge pull request #167 from broadinstitute/gda_read_adaptor_trimmer_improvements

Several improvements to ReadAdaptorTrimmer so that it can be incorporate...
2013-04-13 10:43:45 -07:00 · 2013-04-13 10:43:45 -07:00 · 6c9360b020
parent a5301e17a2 a971e7ab6d
commit 6c9360b020
2 changed files with 452 additions and 0 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmer.java
@ -0,0 +1,392 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+* 
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+* 
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.readutils;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.samtools.SAMFileWriter;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Hidden;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+/**
+ * Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries
+ * have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present.
+ * <p>
+ * By design, tool will only work for Illumina-like library constructs, where the typical library architecture is:
+ * [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)]
+ * <p>
+ * It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand.
+ * Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases.
+ * By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported.
+ * When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed.
+ * Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair.
+ * If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads.
+ *
+ * Advantages over ReadClipper:
+ * - No previous knowledge of adaptors or library structure is necessary
+ *
+ * Advantages over 3rd party tools like SeqPrep:
+ * - Can do BAM streaming instead of having to convert to fastq
+ * - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used,
+ *   e.g. in variant calling
+ * <p>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam
+ *
+ * <h2>Output</h2>
+ * <p>
+ * A merged BAM file with unaligned reads
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * <pre>
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -T ReadAdaptorTrimmer \
+ *   -I my_reads.bam \
+ *   -R resources/Homo_sapiens_assembly18.fasta \
+ *   -o trimmed_Reads.bam
+ * </pre>
+ */
+
+@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
+@PartitionBy(PartitionType.READ)
+public class ReadAdaptorTrimmer extends ReadWalker<List<GATKSAMRecord>, SAMFileWriter> implements NanoSchedulable {
+    @Output(doc="Write output to this BAM filename instead of STDOUT", required = false)
+    SAMFileWriter out;
+
+     /**
+     * Only prints the first n reads of the file - for short testing
+     */
+     @Hidden
+    @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
+    int nReadsToPrint = -1;
+
+    /**
+     * Argument to control strictness of match between forward and reverse reads - by default, we require 15 matches between them to declare
+     * an overlap.
+     */
+    @Advanced
+    @Argument(fullName = "minMatches", shortName = "minMatches", doc="Minimum number of substring matches to detect pair overlaps", required = false)
+    int minMatchesForOverlap = 15;
+
+
+    /**
+     * If true, this argument will make the walker discard unpaired reads instead of erroring out.
+     */
+    @Advanced
+    @Argument(fullName = "removeUnpairedReads", shortName = "removeUnpairedReads", doc="Remove unpaired reads instead of erroring out", required = false)
+    boolean cleanUnpairedReads = false;
+
+     /**
+     * private class members
+     */
+    private GATKSAMRecord firstReadInPair;
+    private TrimStats trimStats = new TrimStats();
+
+    static class TrimStats {
+        long numReadsProcessed;
+        long numReadsWithAdaptorTrimmed;
+        long numUnpairedReadsFound;
+    }
+
+   /**
+     * The reads filter function.
+     *
+     * @param ref  the reference bases that correspond to our read, if a reference was provided
+     * @param read the read itself, as a GATKSAMRecord
+     * @return true if the read passes the filter, false if it doesn't
+     */
+    public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
+         // check if we've reached the output limit
+        if ( nReadsToPrint == 0 ) {
+            return false;          // n == 0 means we've printed all we needed.
+        }
+        else if (nReadsToPrint > 0) {
+            nReadsToPrint--;       // n > 0 means there are still reads to be printed.
+        }
+        return true;
+    }
+    /**
+     * reduceInit is called once before any calls to the map function.  We use it here to setup the output
+     * bam file, if it was specified on the command line
+     *
+     * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
+     */
+    public SAMFileWriter reduceInit() {
+        return out;
+    }
+
+    public List<GATKSAMRecord> map( final ReferenceContext ref, final GATKSAMRecord readIn, final RefMetaDataTracker metaDataTracker ) {
+
+
+        final List<GATKSAMRecord> readsToEmit = new ArrayList<GATKSAMRecord>();
+
+
+        // cache first read in pair if flag set.
+        if (readIn.getFirstOfPairFlag()) {
+            firstReadInPair = GATKSAMRecord.emptyRead(readIn);
+            firstReadInPair.setReadString(readIn.getReadString());
+            firstReadInPair.setReadName(readIn.getReadName());
+            firstReadInPair.setBaseQualities(readIn.getBaseQualities());
+        }
+        else {
+            if (!readIn.getReadName().equals(firstReadInPair.getReadName()))  {
+                if (cleanUnpairedReads) {
+                    trimStats.numUnpairedReadsFound++;
+                    return readsToEmit;
+                }
+                else // by default require that reads be completely paired
+                    throw new IllegalStateException("Second read in pair must follow first read in pair: data not ordered?");
+            }
+
+            final int oldLength1 = firstReadInPair.getReadLength();
+            final int oldLength2 = readIn.getReadLength();
+            // try to strip any adaptor sequence in read pair
+            final Integer result = trimReads(firstReadInPair, readIn, minMatchesForOverlap, logger);
+
+            if (logger.isDebugEnabled()) {
+                if (result == null)
+                    logger.debug("No overlap found, insert size cannot be computed");
+                else
+                    logger.debug("Insert size estimate = " + result);
+
+            }
+
+
+            readsToEmit.add(firstReadInPair);
+            readsToEmit.add(readIn);
+
+            if (oldLength1 != firstReadInPair.getReadLength())
+                trimStats.numReadsWithAdaptorTrimmed++;
+            if (oldLength2 != readIn.getReadLength())
+                trimStats.numReadsWithAdaptorTrimmed++;
+
+         }
+
+
+        trimStats.numReadsProcessed++;
+        return readsToEmit;
+
+    }
+
+    /**
+     * given a read and a output location, reduce by emitting the read
+     *
+     * @param readsToEmit   the read itself
+     * @param output the output source
+     * @return the SAMFileWriter, so that the next reduce can emit to the same source
+     */
+    public SAMFileWriter reduce( final List<GATKSAMRecord> readsToEmit, final SAMFileWriter output ) {
+        for (final GATKSAMRecord read : readsToEmit)
+             output.addAlignment(read);
+
+        return output;
+    }
+
+    @Override
+    public void onTraversalDone(SAMFileWriter output) {
+
+        logger.info("Finished Trimming:");
+        logger.info("Number of processed reads:                     "+ trimStats.numReadsProcessed);
+        logger.info("Number of reads with adaptor sequence trimmed: "+ trimStats.numReadsWithAdaptorTrimmed);
+        if (cleanUnpairedReads)
+            logger.info("Number of unpaired reads thrown out: "+ trimStats.numUnpairedReadsFound);
+    }
+
+
+    /**
+     *
+     * Workhorse routines...
+     *
+     */
+        /**
+         * Core routine that does most underlying work for walker. Takes two reads and looks for overlaps in them.
+         * An overlap is defined as a contiguous chunk of N bases that matches reverse-complement between reads.
+         * Currently, the only insert structure that it will look for overlaps is as follows:
+         * CASE 1: Insert shorter than read length:
+         * 3' XXXXXXXXXXXXXXXX 5'            (second read)
+         * 5'      YYYYYYYYYYYYYYYY 3'       (first read)
+         *         ***********
+         *
+         * In this case, if X and Y are complements at the 11 positions marked by *, routine will do the following
+         * iff minMatchesForOverlap <= 11:
+         *  a) Cleave adaptor from end of second read (leftmost dangling part in diagram above)
+         *  b) Cleave adaptor from end of first read (rightmost part in diagram).
+         *
+         * CASE 2: Insert size >= read length:
+         * 3'             XXXXXXXXXXXXXXXX 5'           (second read)
+         * 5'      YYYYYYYYYYYYYYYY 3'                  (first read)
+         *                *********                        (overlap)
+         *
+         * In this case, no trimming is done and reads are left unchanged
+         * @param first                      (I/O) First read in pair - read contents (bases/quals) can be modified if adaptor is detected
+         * @param second                     (I/O) Second read in pair - read contents (bases/quals) can be modified if adaptor is detected
+         * @param minMatchesForOverlap       Reads need to match in these # of bases to be joined
+         * @return                           Offset between second and first read.
+         *                                   If there's no detectable offset, return Null
+         */
+    @Requires({"first != null","second != null","minMatchesForOverlap>0"})
+    protected static Integer trimReads(final GATKSAMRecord first,
+                                       final GATKSAMRecord second,
+                                       final int minMatchesForOverlap,
+                                       final Logger logger) {
+
+        final Integer insertSize = estimateInsertSize(first.getReadBases(), second.getReadBases(),
+                minMatchesForOverlap, logger);
+
+        if (insertSize == null)
+            return insertSize;
+        if (insertSize < first.getReadLength()) {
+            // trim adaptor sequence from read
+            first.setReadBases(Arrays.copyOfRange(first.getReadBases(),0,insertSize));
+            first.setBaseQualities(Arrays.copyOfRange(first.getBaseQualities(),0,insertSize));
+        }
+        if (insertSize < second.getReadLength()) {
+            // trim adaptor sequence from read
+            second.setReadBases(Arrays.copyOfRange(second.getReadBases(),0,insertSize));
+            second.setBaseQualities(Arrays.copyOfRange(second.getBaseQualities(),0,insertSize));
+        }
+        return insertSize;
+    }
+
+    /**
+    * Brain-dead implementation of an aligner of two sequences, where it's assumed that there might be an overlap
+    * from the first into the second. From this, an estimate of insert size is performed and returned
+    * Assumes that reads come in reverse direction, so one of the base sequences needs to be reverse-complemented.]
+    *
+    * @param firstRead                           Bytes from first read
+    * @param secondRead                          Bytes from second read (reverse direction)
+    * @return                                  Estimated insert size based on offset between first and second read.
+    *                                          If no overlap can be detected, return null
+    */
+
+    @Requires({"firstRead != null","secondRead != null","minMatches>0","firstRead.length == secondRead.length"})
+    protected static Integer estimateInsertSize(final byte[] firstRead,
+                                                                final byte[] secondRead,
+                                                                final int minMatches,
+                                                                final Logger logger) {
+        final byte[] firstBases = firstRead;
+        final byte[] secondBases = BaseUtils.simpleReverseComplement(secondRead);
+
+        final Pair<Integer,Integer> overlaps = findOverlappingSequence(firstBases, secondBases);
+        final int bestOffset = overlaps.first;
+        final int maxScore = overlaps.second;
+        if ( logger.isDebugEnabled()) {
+            String sb="", s1 = new String(firstBases), s2 = new String(secondBases);
+            for (int k=0; k < Math.abs(bestOffset); k++) sb+=" ";
+            if (maxScore >= minMatches) {
+                logger.debug(String.format("Match, Max Score = %d, best offset = %d\n",maxScore, bestOffset));
+                if (bestOffset>0)
+                    s2 = sb+s2;
+                else
+                    s1 = sb+s1;
+            }
+            else logger.debug("NoMatch:");
+            logger.debug("R1:"+s1);
+            logger.debug("R2:"+s2);
+
+
+        }
+
+        if (maxScore < minMatches)
+            return null; // no overlap detected
+
+        return bestOffset+secondRead.length;
+
+
+    }
+
+
+     /**
+     * Tries to find overlapping sequence between two reads, and computes offset between them
+      * For each possible offset, computes matching score, which is = MATCH_SCORE*Num_matches + MISMATCH_SCORE*num_mismatches
+      * (like SW with infinite gap penalties).
+     * @param first                              First read bytes
+     * @param second                             Second read bytes
+     * @return                                   Pair of integers (x,y). x = best offset between reads, y = corresponding score
+     */
+    @Requires({"first != null","second != null"})
+    @Ensures("result != null")
+    protected static Pair<Integer,Integer> findOverlappingSequence(final byte[] first,
+                                                 final byte[] second) {
+        final int MATCH_SCORE = 1;
+        final int MISMATCH_SCORE = -1;
+        // try every possible offset - O(N^2) algorithm
+
+        // In case of following structure,
+        //      111111111
+        // 222222222
+        // computed offset will be negative (=-5 in this case).
+        // If however,
+        //   111111111
+        //      222222222
+        // then offset will be positive (=3 in this case)
+        int maxScore = 0, bestOffset =0;
+        for (int offset = -second.length; offset < first.length; offset++) {
+            int score = 0;
+            // compute start index for each array
+            int ind1 = (offset<0)?0:offset;
+            int ind2 = (offset<0)?-offset:0;
+            for (int k=0; k < Math.min(first.length, second.length) ; k++) {
+                if (ind1 >= first.length)
+                    break;
+                if (ind2 >= second.length )
+                    break;
+                if (first[ind1] != 'N' && second[ind2] != 'N')  {
+                    if (first[ind1] == second[ind2])
+                        score += MATCH_SCORE;
+                    else
+                        score += MISMATCH_SCORE;
+                }
+                ind1++;
+                ind2++;
+            }
+            if (score > maxScore) {
+                maxScore = score;
+                bestOffset = offset;
+            }
+        }
+        return new Pair<Integer, Integer>(bestOffset,maxScore);
+    }
+
+}
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/readutils/ReadAdaptorTrimmerIntegrationTest.java
@ -0,0 +1,60 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+* 
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+* 
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.readutils;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: delangel
+ * Date: 4/13/13
+ * Time: 7:28 AM
+ * To change this template use File | Settings | File Templates.
+ */
+public class ReadAdaptorTrimmerIntegrationTest extends WalkerTest {
+    private String getBaseCommand(final String BAM) {
+        return  "-T ReadAdaptorTrimmer -R " + b37KGReference +
+                " -I " + privateTestDir + BAM +
+                " -o %s";
+    }
+
+    @Test
+    public void testBasicTrimmer() {
+        WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest.bam"),  1, Arrays.asList("1d42414e12b45d44e6f396d97d0f60fe"));
+        executeTest(String.format("testBasicTrimmer"), spec);
+    }
+
+    @Test
+    public void testSkippingBadPairs() {
+        WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest2.bam")+" -removeUnpairedReads",  1, Arrays.asList("5e796345502fbfc31134f7736ce68868"));
+        executeTest(String.format("testSkippingBadPairs"), spec);
+    }
+
+}