diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java index 6b9fca312..b21a133c9 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java @@ -55,6 +55,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -69,6 +71,8 @@ import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.FileNotFoundException; +import java.util.Collections; +import java.util.List; /** * @@ -132,12 +136,19 @@ public class SplitNCigarReads extends ReadWalker rnaReadTransformers = Collections.emptyList(); + @Override public void initialize() { final GenomeAnalysisEngine toolkit = getToolkit(); + if ( getToolkit() != null ) { + for (final ReadTransformer transformer: getToolkit().getReadTransformers()) + if(transformer instanceof RNAReadTransformer) // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more + rnaReadTransformers.add((RNAReadTransformer)transformer); + } if ( !NO_PG_TAG ) { // we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME); @@ -150,11 +161,19 @@ public class SplitNCigarReads extends ReadWalker + * This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). + * This is intended primarily for users of RNA-Seq data handling programs such as TopHat2. + * Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of + * those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset. + * + * NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. + * + * The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN + *

+ * + * + * + * @author ami + * @since 04/22/14 + */ + +public class NDNCigarReadTransformer extends RNAReadTransformer { + + private boolean refactorReads; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS; + + return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + if(read == null) + throw new UserException.BadInput("try to transform a null GATKSAMRecord"); + final Cigar originalCigar = read.getCigar(); + if (originalCigar.isValid(read.getReadName(),-1) != null) + throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar); + read.setCigar(refactorNDNtoN(originalCigar)); + return read; + } + + @Override + public boolean enabled() { + return refactorReads; + } + + + + protected Cigar refactorNDNtoN(final Cigar originalCigar) { + final Cigar refactoredCigar = new Cigar(); + final int cigarLength = originalCigar.numCigarElements(); + for(int i = 0; i < cigarLength; i++){ + final CigarElement element = originalCigar.getCigarElement(i); + if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){ + final CigarElement nextElement = originalCigar.getCigarElement(i+1); + final CigarElement nextNextElement = originalCigar.getCigarElement(i+2); + + // if it is N-D-N replace with N (with the total length) otherwise just add the first N. + if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){ + final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength(); + final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N); + refactoredCigar.add(refactoredElement); + i += 2; //skip the elements that were refactored + } + else + refactoredCigar.add(element); // add only the first N + } + else + refactoredCigar.add(element); // add any non-N element + } + return refactoredCigar; + } + + private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){ + return index < cigarLength - 2; + } + +} + diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/RNAReadTransformer.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/RNAReadTransformer.java new file mode 100644 index 000000000..a412ed701 --- /dev/null +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/iterators/RNAReadTransformer.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.iterators; + +/** + * + * A baseclass for RNAseq read transformer + * + * @author ami + * @since 4/28/14. + */ +public abstract class RNAReadTransformer extends ReadTransformer{ + public boolean isRNAReadTransformer(){return true;} +} diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/NDNCigarReadTransformerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/NDNCigarReadTransformerUnitTest.java new file mode 100644 index 000000000..90bc3870c --- /dev/null +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/filters/NDNCigarReadTransformerUnitTest.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.Cigar; +import org.broadinstitute.sting.utils.sam.CigarUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * @author ami + * @since 04/22/14 + */ +public class NDNCigarReadTransformerUnitTest { + + + @DataProvider(name = "filteringIteratorTestData") + public String[][] getFilteringIteratorTestData() { + return new String[][] { + {"1M1N1N1M","1M1N1N1M"}, // NN elements + {"1M1N1D4M","1M1N1D4M"}, // ND + {"1M1N3M","1M1N3M"}, // N + {"1M1N2I1N3M","1M1N2I1N3M"}, // NIN + {"1M1N3D2N1M","1M6N1M"}, + {"1M2N2D2N1M1D3N1D1N1M2H","1M6N1M1D5N1M2H"}, + {"1H2S1M1N3D2N1M","1H2S1M6N1M"}, + {"10M628N2D203N90M","10M833N90M"} + }; + } + + NDNCigarReadTransformer filter; + + @BeforeClass + public void init() { + filter = new NDNCigarReadTransformer(); + } + + @Test(dataProvider = "filteringIteratorTestData") + public void testCigarRefactoring (final String originalCigarString, final String expectedString) { + Cigar originalCigar = CigarUtils.cigarFromString(originalCigarString); + String actualString = filter.refactorNDNtoN(originalCigar).toString(); + Assert.assertEquals(actualString, expectedString, "ciagr string "+ originalCigarString+" should become: "+expectedString+" but got: "+actualString); + } + +}