Merge pull request #167 from broadinstitute/gda_read_adaptor_trimmer_improvements

Several improvements to ReadAdaptorTrimmer so that it can be incorporate...
This commit is contained in:
delangel 2013-04-13 10:43:45 -07:00
commit 6c9360b020
2 changed files with 452 additions and 0 deletions

View File

@ -0,0 +1,392 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.readutils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMFileWriter;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Utility tool to blindly strip base adaptors. Main application is for FASTQ/unaligned BAM pre-processing where libraries
* have very short inserts, and hence a substantial part of the sequencing data will have adaptor sequence present.
* <p>
* By design, tool will only work for Illumina-like library constructs, where the typical library architecture is:
* [Adaptor 1]-[Genomic Insert]-[Adaptor 2 (index/barcode)]
* <p>
* It is assumed that when data is paired, one read will span the forward strand and one read will span the reverse strand.
* Hence, when specifying adaptors they should be specified as both forward and reverse-complement to make sure they're removed in all cases.
* By design, as well, "circular" constructions where a read can have an insert, then adaptor, then more genomic insert, are not supported.
* When an adaptor is detected, all bases downstream from it (i.e. in the 3' direction) will be removed.
* Adaptor detection is carried out by looking for overlaps between forward and reverse reads in a pair.
* If a sufficiently high overlap is found, the insert size is computed and if insert size < read lengths adaptor bases are removed from reads.
*
* Advantages over ReadClipper:
* - No previous knowledge of adaptors or library structure is necessary
*
* Advantages over 3rd party tools like SeqPrep:
* - Can do BAM streaming instead of having to convert to fastq
* - No need to merge reads - merging reads can have some advantages, but complicates downstream processing and loses information that can be used,
* e.g. in variant calling
* <p>
*
* <h2>Input</h2>
* <p>
* The input read data in BAM format. Read data MUST be in query name ordering as produced, for example with Picard's FastqToBam
*
* <h2>Output</h2>
* <p>
* A merged BAM file with unaligned reads
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T ReadAdaptorTrimmer \
* -I my_reads.bam \
* -R resources/Homo_sapiens_assembly18.fasta \
* -o trimmed_Reads.bam
* </pre>
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.READ)
public class ReadAdaptorTrimmer extends ReadWalker<List<GATKSAMRecord>, SAMFileWriter> implements NanoSchedulable {
@Output(doc="Write output to this BAM filename instead of STDOUT", required = false)
SAMFileWriter out;
/**
* Only prints the first n reads of the file - for short testing
*/
@Hidden
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
/**
* Argument to control strictness of match between forward and reverse reads - by default, we require 15 matches between them to declare
* an overlap.
*/
@Advanced
@Argument(fullName = "minMatches", shortName = "minMatches", doc="Minimum number of substring matches to detect pair overlaps", required = false)
int minMatchesForOverlap = 15;
/**
* If true, this argument will make the walker discard unpaired reads instead of erroring out.
*/
@Advanced
@Argument(fullName = "removeUnpairedReads", shortName = "removeUnpairedReads", doc="Remove unpaired reads instead of erroring out", required = false)
boolean cleanUnpairedReads = false;
/**
* private class members
*/
private GATKSAMRecord firstReadInPair;
private TrimStats trimStats = new TrimStats();
static class TrimStats {
long numReadsProcessed;
long numReadsWithAdaptorTrimmed;
long numUnpairedReadsFound;
}
/**
* The reads filter function.
*
* @param ref the reference bases that correspond to our read, if a reference was provided
* @param read the read itself, as a GATKSAMRecord
* @return true if the read passes the filter, false if it doesn't
*/
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
// check if we've reached the output limit
if ( nReadsToPrint == 0 ) {
return false; // n == 0 means we've printed all we needed.
}
else if (nReadsToPrint > 0) {
nReadsToPrint--; // n > 0 means there are still reads to be printed.
}
return true;
}
/**
* reduceInit is called once before any calls to the map function. We use it here to setup the output
* bam file, if it was specified on the command line
*
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
*/
public SAMFileWriter reduceInit() {
return out;
}
public List<GATKSAMRecord> map( final ReferenceContext ref, final GATKSAMRecord readIn, final RefMetaDataTracker metaDataTracker ) {
final List<GATKSAMRecord> readsToEmit = new ArrayList<GATKSAMRecord>();
// cache first read in pair if flag set.
if (readIn.getFirstOfPairFlag()) {
firstReadInPair = GATKSAMRecord.emptyRead(readIn);
firstReadInPair.setReadString(readIn.getReadString());
firstReadInPair.setReadName(readIn.getReadName());
firstReadInPair.setBaseQualities(readIn.getBaseQualities());
}
else {
if (!readIn.getReadName().equals(firstReadInPair.getReadName())) {
if (cleanUnpairedReads) {
trimStats.numUnpairedReadsFound++;
return readsToEmit;
}
else // by default require that reads be completely paired
throw new IllegalStateException("Second read in pair must follow first read in pair: data not ordered?");
}
final int oldLength1 = firstReadInPair.getReadLength();
final int oldLength2 = readIn.getReadLength();
// try to strip any adaptor sequence in read pair
final Integer result = trimReads(firstReadInPair, readIn, minMatchesForOverlap, logger);
if (logger.isDebugEnabled()) {
if (result == null)
logger.debug("No overlap found, insert size cannot be computed");
else
logger.debug("Insert size estimate = " + result);
}
readsToEmit.add(firstReadInPair);
readsToEmit.add(readIn);
if (oldLength1 != firstReadInPair.getReadLength())
trimStats.numReadsWithAdaptorTrimmed++;
if (oldLength2 != readIn.getReadLength())
trimStats.numReadsWithAdaptorTrimmed++;
}
trimStats.numReadsProcessed++;
return readsToEmit;
}
/**
* given a read and a output location, reduce by emitting the read
*
* @param readsToEmit the read itself
* @param output the output source
* @return the SAMFileWriter, so that the next reduce can emit to the same source
*/
public SAMFileWriter reduce( final List<GATKSAMRecord> readsToEmit, final SAMFileWriter output ) {
for (final GATKSAMRecord read : readsToEmit)
output.addAlignment(read);
return output;
}
@Override
public void onTraversalDone(SAMFileWriter output) {
logger.info("Finished Trimming:");
logger.info("Number of processed reads: "+ trimStats.numReadsProcessed);
logger.info("Number of reads with adaptor sequence trimmed: "+ trimStats.numReadsWithAdaptorTrimmed);
if (cleanUnpairedReads)
logger.info("Number of unpaired reads thrown out: "+ trimStats.numUnpairedReadsFound);
}
/**
*
* Workhorse routines...
*
*/
/**
* Core routine that does most underlying work for walker. Takes two reads and looks for overlaps in them.
* An overlap is defined as a contiguous chunk of N bases that matches reverse-complement between reads.
* Currently, the only insert structure that it will look for overlaps is as follows:
* CASE 1: Insert shorter than read length:
* 3' XXXXXXXXXXXXXXXX 5' (second read)
* 5' YYYYYYYYYYYYYYYY 3' (first read)
* ***********
*
* In this case, if X and Y are complements at the 11 positions marked by *, routine will do the following
* iff minMatchesForOverlap <= 11:
* a) Cleave adaptor from end of second read (leftmost dangling part in diagram above)
* b) Cleave adaptor from end of first read (rightmost part in diagram).
*
* CASE 2: Insert size >= read length:
* 3' XXXXXXXXXXXXXXXX 5' (second read)
* 5' YYYYYYYYYYYYYYYY 3' (first read)
* ********* (overlap)
*
* In this case, no trimming is done and reads are left unchanged
* @param first (I/O) First read in pair - read contents (bases/quals) can be modified if adaptor is detected
* @param second (I/O) Second read in pair - read contents (bases/quals) can be modified if adaptor is detected
* @param minMatchesForOverlap Reads need to match in these # of bases to be joined
* @return Offset between second and first read.
* If there's no detectable offset, return Null
*/
@Requires({"first != null","second != null","minMatchesForOverlap>0"})
protected static Integer trimReads(final GATKSAMRecord first,
final GATKSAMRecord second,
final int minMatchesForOverlap,
final Logger logger) {
final Integer insertSize = estimateInsertSize(first.getReadBases(), second.getReadBases(),
minMatchesForOverlap, logger);
if (insertSize == null)
return insertSize;
if (insertSize < first.getReadLength()) {
// trim adaptor sequence from read
first.setReadBases(Arrays.copyOfRange(first.getReadBases(),0,insertSize));
first.setBaseQualities(Arrays.copyOfRange(first.getBaseQualities(),0,insertSize));
}
if (insertSize < second.getReadLength()) {
// trim adaptor sequence from read
second.setReadBases(Arrays.copyOfRange(second.getReadBases(),0,insertSize));
second.setBaseQualities(Arrays.copyOfRange(second.getBaseQualities(),0,insertSize));
}
return insertSize;
}
/**
* Brain-dead implementation of an aligner of two sequences, where it's assumed that there might be an overlap
* from the first into the second. From this, an estimate of insert size is performed and returned
* Assumes that reads come in reverse direction, so one of the base sequences needs to be reverse-complemented.]
*
* @param firstRead Bytes from first read
* @param secondRead Bytes from second read (reverse direction)
* @return Estimated insert size based on offset between first and second read.
* If no overlap can be detected, return null
*/
@Requires({"firstRead != null","secondRead != null","minMatches>0","firstRead.length == secondRead.length"})
protected static Integer estimateInsertSize(final byte[] firstRead,
final byte[] secondRead,
final int minMatches,
final Logger logger) {
final byte[] firstBases = firstRead;
final byte[] secondBases = BaseUtils.simpleReverseComplement(secondRead);
final Pair<Integer,Integer> overlaps = findOverlappingSequence(firstBases, secondBases);
final int bestOffset = overlaps.first;
final int maxScore = overlaps.second;
if ( logger.isDebugEnabled()) {
String sb="", s1 = new String(firstBases), s2 = new String(secondBases);
for (int k=0; k < Math.abs(bestOffset); k++) sb+=" ";
if (maxScore >= minMatches) {
logger.debug(String.format("Match, Max Score = %d, best offset = %d\n",maxScore, bestOffset));
if (bestOffset>0)
s2 = sb+s2;
else
s1 = sb+s1;
}
else logger.debug("NoMatch:");
logger.debug("R1:"+s1);
logger.debug("R2:"+s2);
}
if (maxScore < minMatches)
return null; // no overlap detected
return bestOffset+secondRead.length;
}
/**
* Tries to find overlapping sequence between two reads, and computes offset between them
* For each possible offset, computes matching score, which is = MATCH_SCORE*Num_matches + MISMATCH_SCORE*num_mismatches
* (like SW with infinite gap penalties).
* @param first First read bytes
* @param second Second read bytes
* @return Pair of integers (x,y). x = best offset between reads, y = corresponding score
*/
@Requires({"first != null","second != null"})
@Ensures("result != null")
protected static Pair<Integer,Integer> findOverlappingSequence(final byte[] first,
final byte[] second) {
final int MATCH_SCORE = 1;
final int MISMATCH_SCORE = -1;
// try every possible offset - O(N^2) algorithm
// In case of following structure,
// 111111111
// 222222222
// computed offset will be negative (=-5 in this case).
// If however,
// 111111111
// 222222222
// then offset will be positive (=3 in this case)
int maxScore = 0, bestOffset =0;
for (int offset = -second.length; offset < first.length; offset++) {
int score = 0;
// compute start index for each array
int ind1 = (offset<0)?0:offset;
int ind2 = (offset<0)?-offset:0;
for (int k=0; k < Math.min(first.length, second.length) ; k++) {
if (ind1 >= first.length)
break;
if (ind2 >= second.length )
break;
if (first[ind1] != 'N' && second[ind2] != 'N') {
if (first[ind1] == second[ind2])
score += MATCH_SCORE;
else
score += MISMATCH_SCORE;
}
ind1++;
ind2++;
}
if (score > maxScore) {
maxScore = score;
bestOffset = offset;
}
}
return new Pair<Integer, Integer>(bestOffset,maxScore);
}
}

View File

@ -0,0 +1,60 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.readutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
/**
* Created with IntelliJ IDEA.
* User: delangel
* Date: 4/13/13
* Time: 7:28 AM
* To change this template use File | Settings | File Templates.
*/
public class ReadAdaptorTrimmerIntegrationTest extends WalkerTest {
private String getBaseCommand(final String BAM) {
return "-T ReadAdaptorTrimmer -R " + b37KGReference +
" -I " + privateTestDir + BAM +
" -o %s";
}
@Test
public void testBasicTrimmer() {
WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest.bam"), 1, Arrays.asList("1d42414e12b45d44e6f396d97d0f60fe"));
executeTest(String.format("testBasicTrimmer"), spec);
}
@Test
public void testSkippingBadPairs() {
WalkerTestSpec spec = new WalkerTestSpec( getBaseCommand("shortInsertTest2.bam")+" -removeUnpairedReads", 1, Arrays.asList("5e796345502fbfc31134f7736ce68868"));
executeTest(String.format("testSkippingBadPairs"), spec);
}
}