create a new read transformer that refactor NDN cigar elements to one N element.
story: https://www.pivotaltracker.com/story/show/69648104 description: This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). This is intended primarily for users of RNA-Seq data handling programs such as TopHat2. Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset. edit: address review comments - change the tool's name and change the tool to be a readTransformer instead of read filter
This commit is contained in:
parent
385fe5fb56
commit
13dd755468
|
|
@ -55,6 +55,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
|
@ -69,6 +71,8 @@ import org.broadinstitute.sting.utils.sam.CigarUtils;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -132,12 +136,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
|
|||
* It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
|
||||
*/
|
||||
protected OverhangFixingManager overhangManager;
|
||||
private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
final GenomeAnalysisEngine toolkit = getToolkit();
|
||||
|
||||
if ( getToolkit() != null ) {
|
||||
for (final ReadTransformer transformer: getToolkit().getReadTransformers())
|
||||
if(transformer instanceof RNAReadTransformer) // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
|
||||
rnaReadTransformers.add((RNAReadTransformer)transformer);
|
||||
}
|
||||
if ( !NO_PG_TAG ) {
|
||||
// we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work
|
||||
Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
|
||||
|
|
@ -150,11 +161,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
|
|||
catch (FileNotFoundException ex) {
|
||||
throw new UserException.CouldNotReadInputFile(toolkit.getArguments().referenceFile, ex);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) {
|
||||
return read;
|
||||
GATKSAMRecord workingRead = read;
|
||||
|
||||
for ( final RNAReadTransformer transformer : rnaReadTransformers ) {
|
||||
workingRead = transformer.apply(workingRead); // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
|
||||
}
|
||||
|
||||
return workingRead;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -234,6 +234,19 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// refactor NDN cigar string arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
|
||||
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
|
||||
* You should only use this if you know that your reads have that problem.
|
||||
*/
|
||||
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
|
||||
public boolean REFACTOR_NDN_CIGAR_READS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// quality encoding checking arguments
|
||||
|
|
|
|||
|
|
@ -0,0 +1,118 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* A read transformer that refactor NDN cigar elements to one N element.
|
||||
*
|
||||
* <p>
|
||||
* This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
|
||||
* This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
|
||||
* Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
|
||||
* those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
|
||||
*
|
||||
* NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
|
||||
*
|
||||
* The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
*
|
||||
* @author ami
|
||||
* @since 04/22/14
|
||||
*/
|
||||
|
||||
public class NDNCigarReadTransformer extends RNAReadTransformer {
|
||||
|
||||
private boolean refactorReads;
|
||||
|
||||
@Override
|
||||
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS;
|
||||
|
||||
return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord apply(final GATKSAMRecord read) {
|
||||
if(read == null)
|
||||
throw new UserException.BadInput("try to transform a null GATKSAMRecord");
|
||||
final Cigar originalCigar = read.getCigar();
|
||||
if (originalCigar.isValid(read.getReadName(),-1) != null)
|
||||
throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar);
|
||||
read.setCigar(refactorNDNtoN(originalCigar));
|
||||
return read;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean enabled() {
|
||||
return refactorReads;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected Cigar refactorNDNtoN(final Cigar originalCigar) {
|
||||
final Cigar refactoredCigar = new Cigar();
|
||||
final int cigarLength = originalCigar.numCigarElements();
|
||||
for(int i = 0; i < cigarLength; i++){
|
||||
final CigarElement element = originalCigar.getCigarElement(i);
|
||||
if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){
|
||||
final CigarElement nextElement = originalCigar.getCigarElement(i+1);
|
||||
final CigarElement nextNextElement = originalCigar.getCigarElement(i+2);
|
||||
|
||||
// if it is N-D-N replace with N (with the total length) otherwise just add the first N.
|
||||
if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){
|
||||
final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength();
|
||||
final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N);
|
||||
refactoredCigar.add(refactoredElement);
|
||||
i += 2; //skip the elements that were refactored
|
||||
}
|
||||
else
|
||||
refactoredCigar.add(element); // add only the first N
|
||||
}
|
||||
else
|
||||
refactoredCigar.add(element); // add any non-N element
|
||||
}
|
||||
return refactoredCigar;
|
||||
}
|
||||
|
||||
private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){
|
||||
return index < cigarLength - 2;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
/**
|
||||
*
|
||||
* A baseclass for RNAseq read transformer
|
||||
*
|
||||
* @author ami
|
||||
* @since 4/28/14.
|
||||
*/
|
||||
public abstract class RNAReadTransformer extends ReadTransformer{
|
||||
public boolean isRNAReadTransformer(){return true;}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import org.broadinstitute.sting.utils.sam.CigarUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* @author ami
|
||||
* @since 04/22/14
|
||||
*/
|
||||
public class NDNCigarReadTransformerUnitTest {
|
||||
|
||||
|
||||
@DataProvider(name = "filteringIteratorTestData")
|
||||
public String[][] getFilteringIteratorTestData() {
|
||||
return new String[][] {
|
||||
{"1M1N1N1M","1M1N1N1M"}, // NN elements
|
||||
{"1M1N1D4M","1M1N1D4M"}, // ND
|
||||
{"1M1N3M","1M1N3M"}, // N
|
||||
{"1M1N2I1N3M","1M1N2I1N3M"}, // NIN
|
||||
{"1M1N3D2N1M","1M6N1M"},
|
||||
{"1M2N2D2N1M1D3N1D1N1M2H","1M6N1M1D5N1M2H"},
|
||||
{"1H2S1M1N3D2N1M","1H2S1M6N1M"},
|
||||
{"10M628N2D203N90M","10M833N90M"}
|
||||
};
|
||||
}
|
||||
|
||||
NDNCigarReadTransformer filter;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
filter = new NDNCigarReadTransformer();
|
||||
}
|
||||
|
||||
@Test(dataProvider = "filteringIteratorTestData")
|
||||
public void testCigarRefactoring (final String originalCigarString, final String expectedString) {
|
||||
Cigar originalCigar = CigarUtils.cigarFromString(originalCigarString);
|
||||
String actualString = filter.refactorNDNtoN(originalCigar).toString();
|
||||
Assert.assertEquals(actualString, expectedString, "ciagr string "+ originalCigarString+" should become: "+expectedString+" but got: "+actualString);
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue