Merge pull request #615 from broadinstitute/ami-createCigarDNFilter

create a new read filter (transformer) that refactor NDN cigar elements ...
This commit is contained in:
Ryan Poplin 2014-04-28 13:31:04 -04:00
commit 8d5a7d412b
5 changed files with 258 additions and 1 deletions

View File

@ -55,6 +55,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
@ -69,6 +71,8 @@ import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.FileNotFoundException;
import java.util.Collections;
import java.util.List;
/**
*
@ -132,12 +136,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
* It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
*/
protected OverhangFixingManager overhangManager;
private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
@Override
public void initialize() {
final GenomeAnalysisEngine toolkit = getToolkit();
if ( getToolkit() != null ) {
for (final ReadTransformer transformer: getToolkit().getReadTransformers())
if(transformer instanceof RNAReadTransformer) // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
rnaReadTransformers.add((RNAReadTransformer)transformer);
}
if ( !NO_PG_TAG ) {
// we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work
Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
@ -150,11 +161,19 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
catch (FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(toolkit.getArguments().referenceFile, ex);
}
}
@Override
public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) {
return read;
GATKSAMRecord workingRead = read;
for ( final RNAReadTransformer transformer : rnaReadTransformers ) {
workingRead = transformer.apply(workingRead); // TODO: when a read transformer can be called directly from the command line we won't need that mechanism any more
}
return workingRead;
}
@Override

View File

@ -234,6 +234,19 @@ public class GATKArgumentCollection {
@Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0)
public double BAQGOP = BAQ.DEFAULT_GOP;
// --------------------------------------------------------------------------------------------------------------
//
// refactor NDN cigar string arguments
//
// --------------------------------------------------------------------------------------------------------------
/**
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
* You should only use this if you know that your reads have that problem.
*/
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
public boolean REFACTOR_NDN_CIGAR_READS = false;
// --------------------------------------------------------------------------------------------------------------
//
// quality encoding checking arguments

View File

@ -0,0 +1,118 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.iterators.RNAReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* A read transformer that refactor NDN cigar elements to one N element.
*
* <p>
* This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
* This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
* Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
* those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
*
* NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
*
* The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
* </p>
*
*
*
* @author ami
* @since 04/22/14
*/
public class NDNCigarReadTransformer extends RNAReadTransformer {
private boolean refactorReads;
@Override
public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS;
return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
}
@Override
public GATKSAMRecord apply(final GATKSAMRecord read) {
if(read == null)
throw new UserException.BadInput("try to transform a null GATKSAMRecord");
final Cigar originalCigar = read.getCigar();
if (originalCigar.isValid(read.getReadName(),-1) != null)
throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar);
read.setCigar(refactorNDNtoN(originalCigar));
return read;
}
@Override
public boolean enabled() {
return refactorReads;
}
protected Cigar refactorNDNtoN(final Cigar originalCigar) {
final Cigar refactoredCigar = new Cigar();
final int cigarLength = originalCigar.numCigarElements();
for(int i = 0; i < cigarLength; i++){
final CigarElement element = originalCigar.getCigarElement(i);
if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){
final CigarElement nextElement = originalCigar.getCigarElement(i+1);
final CigarElement nextNextElement = originalCigar.getCigarElement(i+2);
// if it is N-D-N replace with N (with the total length) otherwise just add the first N.
if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){
final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength();
final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N);
refactoredCigar.add(refactoredElement);
i += 2; //skip the elements that were refactored
}
else
refactoredCigar.add(element); // add only the first N
}
else
refactoredCigar.add(element); // add any non-N element
}
return refactoredCigar;
}
private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){
return index < cigarLength - 2;
}
}

View File

@ -0,0 +1,37 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.iterators;
/**
*
* A baseclass for RNAseq read transformer
*
* @author ami
* @since 4/28/14.
*/
public abstract class RNAReadTransformer extends ReadTransformer{
public boolean isRNAReadTransformer(){return true;}
}

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.Cigar;
import org.broadinstitute.sting.utils.sam.CigarUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* @author ami
* @since 04/22/14
*/
public class NDNCigarReadTransformerUnitTest {
@DataProvider(name = "filteringIteratorTestData")
public String[][] getFilteringIteratorTestData() {
return new String[][] {
{"1M1N1N1M","1M1N1N1M"}, // NN elements
{"1M1N1D4M","1M1N1D4M"}, // ND
{"1M1N3M","1M1N3M"}, // N
{"1M1N2I1N3M","1M1N2I1N3M"}, // NIN
{"1M1N3D2N1M","1M6N1M"},
{"1M2N2D2N1M1D3N1D1N1M2H","1M6N1M1D5N1M2H"},
{"1H2S1M1N3D2N1M","1H2S1M6N1M"},
{"10M628N2D203N90M","10M833N90M"}
};
}
NDNCigarReadTransformer filter;
@BeforeClass
public void init() {
filter = new NDNCigarReadTransformer();
}
@Test(dataProvider = "filteringIteratorTestData")
public void testCigarRefactoring (final String originalCigarString, final String expectedString) {
Cigar originalCigar = CigarUtils.cigarFromString(originalCigarString);
String actualString = filter.refactorNDNtoN(originalCigar).toString();
Assert.assertEquals(actualString, expectedString, "ciagr string "+ originalCigarString+" should become: "+expectedString+" but got: "+actualString);
}
}