Merge pull request #615 from broadinstitute/ami-createCigarDNFilter
create a new read filter (transformer) that refactor NDN cigar elements ...
This commit is contained in:
commit
8d5a7d412b
|
|
@ -55,6 +55,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
|
|||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
|
|
@ -69,6 +71,8 @@ import org.broadinstitute.sting.utils.sam.CigarUtils;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -132,12 +136,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
|
|||
* It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
|
||||
*/
|
||||
protected OverhangFixingManager overhangManager;
|
||||
private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
final GenomeAnalysisEngine toolkit = getToolkit();
|
||||
|
||||
if ( getToolkit() != null ) {
|
||||
for (final ReadTransformer transformer: getToolkit().getReadTransformers())
|
||||
if(transformer instanceof RNAReadTransformer) // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
|
||||
rnaReadTransformers.add((RNAReadTransformer)transformer);
|
||||
}
|
||||
if ( !NO_PG_TAG ) {
|
||||
// we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work
|
||||
Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
|
||||
|
|
@ -150,11 +161,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
|
|||
catch (FileNotFoundException ex) {
|
||||
throw new UserException.CouldNotReadInputFile(toolkit.getArguments().referenceFile, ex);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) {
|
||||
return read;
|
||||
GATKSAMRecord workingRead = read;
|
||||
|
||||
for ( final RNAReadTransformer transformer : rnaReadTransformers ) {
|
||||
workingRead = transformer.apply(workingRead); // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
|
||||
}
|
||||
|
||||
return workingRead;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -234,6 +234,19 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0)
|
||||
public double BAQGOP = BAQ.DEFAULT_GOP;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// refactor NDN cigar string arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
|
||||
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
|
||||
* You should only use this if you know that your reads have that problem.
|
||||
*/
|
||||
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
|
||||
public boolean REFACTOR_NDN_CIGAR_READS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// quality encoding checking arguments
|
||||
|
|
|
|||
|
|
@ -0,0 +1,118 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* A read transformer that refactor NDN cigar elements to one N element.
|
||||
*
|
||||
* <p>
|
||||
* This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
|
||||
* This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
|
||||
* Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
|
||||
* those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
|
||||
*
|
||||
* NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
|
||||
*
|
||||
* The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
*
|
||||
* @author ami
|
||||
* @since 04/22/14
|
||||
*/
|
||||
|
||||
public class NDNCigarReadTransformer extends RNAReadTransformer {
|
||||
|
||||
private boolean refactorReads;
|
||||
|
||||
@Override
|
||||
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS;
|
||||
|
||||
return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
|
||||
}
|
||||
|
||||
@Override
|
||||
public GATKSAMRecord apply(final GATKSAMRecord read) {
|
||||
if(read == null)
|
||||
throw new UserException.BadInput("try to transform a null GATKSAMRecord");
|
||||
final Cigar originalCigar = read.getCigar();
|
||||
if (originalCigar.isValid(read.getReadName(),-1) != null)
|
||||
throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar);
|
||||
read.setCigar(refactorNDNtoN(originalCigar));
|
||||
return read;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean enabled() {
|
||||
return refactorReads;
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected Cigar refactorNDNtoN(final Cigar originalCigar) {
|
||||
final Cigar refactoredCigar = new Cigar();
|
||||
final int cigarLength = originalCigar.numCigarElements();
|
||||
for(int i = 0; i < cigarLength; i++){
|
||||
final CigarElement element = originalCigar.getCigarElement(i);
|
||||
if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){
|
||||
final CigarElement nextElement = originalCigar.getCigarElement(i+1);
|
||||
final CigarElement nextNextElement = originalCigar.getCigarElement(i+2);
|
||||
|
||||
// if it is N-D-N replace with N (with the total length) otherwise just add the first N.
|
||||
if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){
|
||||
final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength();
|
||||
final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N);
|
||||
refactoredCigar.add(refactoredElement);
|
||||
i += 2; //skip the elements that were refactored
|
||||
}
|
||||
else
|
||||
refactoredCigar.add(element); // add only the first N
|
||||
}
|
||||
else
|
||||
refactoredCigar.add(element); // add any non-N element
|
||||
}
|
||||
return refactoredCigar;
|
||||
}
|
||||
|
||||
private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){
|
||||
return index < cigarLength - 2;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
/**
|
||||
*
|
||||
* A baseclass for RNAseq read transformer
|
||||
*
|
||||
* @author ami
|
||||
* @since 4/28/14.
|
||||
*/
|
||||
public abstract class RNAReadTransformer extends ReadTransformer{
|
||||
public boolean isRNAReadTransformer(){return true;}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import org.broadinstitute.sting.utils.sam.CigarUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* @author ami
|
||||
* @since 04/22/14
|
||||
*/
|
||||
public class NDNCigarReadTransformerUnitTest {
|
||||
|
||||
|
||||
@DataProvider(name = "filteringIteratorTestData")
|
||||
public String[][] getFilteringIteratorTestData() {
|
||||
return new String[][] {
|
||||
{"1M1N1N1M","1M1N1N1M"}, // NN elements
|
||||
{"1M1N1D4M","1M1N1D4M"}, // ND
|
||||
{"1M1N3M","1M1N3M"}, // N
|
||||
{"1M1N2I1N3M","1M1N2I1N3M"}, // NIN
|
||||
{"1M1N3D2N1M","1M6N1M"},
|
||||
{"1M2N2D2N1M1D3N1D1N1M2H","1M6N1M1D5N1M2H"},
|
||||
{"1H2S1M1N3D2N1M","1H2S1M6N1M"},
|
||||
{"10M628N2D203N90M","10M833N90M"}
|
||||
};
|
||||
}
|
||||
|
||||
NDNCigarReadTransformer filter;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
filter = new NDNCigarReadTransformer();
|
||||
}
|
||||
|
||||
@Test(dataProvider = "filteringIteratorTestData")
|
||||
public void testCigarRefactoring (final String originalCigarString, final String expectedString) {
|
||||
Cigar originalCigar = CigarUtils.cigarFromString(originalCigarString);
|
||||
String actualString = filter.refactorNDNtoN(originalCigar).toString();
|
||||
Assert.assertEquals(actualString, expectedString, "ciagr string "+ originalCigarString+" should become: "+expectedString+" but got: "+actualString);
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue