From 573ed07ad06ec022b4bf9896384cd452162ca116 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 14 Mar 2013 11:06:45 -0400 Subject: [PATCH] Fixed reported bug in BQSR for RNA seq alignments with Ns. * ClippingOp updated to incorporate Ns in the hard clips. * ReadUtils.getReadCoordinateForReferenceCoordinate() updated to account for Ns. * Added test that covers the BQSR case we saw. * Created GSA-856 (for Mauricio) to add lots of tests to ReadUtils. * It will require refactoring code and not in the scope of what I was willing to do to fix this. --- .../sting/utils/clipping/ClippingOp.java | 4 ++-- .../sting/utils/sam/ReadUtils.java | 2 +- .../sting/utils/sam/ReadUtilsUnitTest.java | 22 +++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index fe1a386fb..ad6f05563 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -581,8 +581,8 @@ public class ClippingOp { if (cigarElement.getOperator() == CigarOperator.INSERTION) return -clippedLength; - // Deletions should be added to the total hard clip count - else if (cigarElement.getOperator() == CigarOperator.DELETION) + // Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start) + else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) return cigarElement.getLength(); // There is no shift if we are not clipping an indel diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 95e0d55f3..c84e4245d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -524,7 +524,7 @@ public class ReadUtils { // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need // to add the shift of the current cigar element but go back to it's last element to return the last // base before the deletion (see warning in function contracts) - else if (fallsInsideDeletion && !endsWithinCigar) + else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases()) readBases += shift - 1; // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index baad67d53..331121c55 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -25,13 +25,19 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; @@ -179,4 +185,20 @@ public class ReadUtilsUnitTest extends BaseTest { final List reads = new LinkedList(); Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); } + + @Test (enabled = true) + public void testReadWithNs() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 3); + } }