Fixed reported bug in BQSR for RNA seq alignments with Ns.

* ClippingOp updated to incorporate Ns in the hard clips.
  * ReadUtils.getReadCoordinateForReferenceCoordinate() updated to account for Ns.
  * Added test that covers the BQSR case we saw.
  * Created GSA-856 (for Mauricio) to add lots of tests to ReadUtils.
    * It will require refactoring code and not in the scope of what I was willing to do to fix this.
This commit is contained in:
Eric Banks 2013-03-14 11:06:45 -04:00
parent be729410b9
commit 573ed07ad0
3 changed files with 25 additions and 3 deletions

View File

@ -581,8 +581,8 @@ public class ClippingOp {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return -clippedLength;
// Deletions should be added to the total hard clip count
else if (cigarElement.getOperator() == CigarOperator.DELETION)
// Deletions and Ns should be added to the total hard clip count (because we want to maintain the original alignment start)
else if (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION)
return cigarElement.getLength();
// There is no shift if we are not clipping an indel

View File

@ -524,7 +524,7 @@ public class ReadUtils {
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
else if (fallsInsideDeletion && !endsWithinCigar && cigarElement.getOperator().consumesReadBases())
readBases += shift - 1;
// If we reached our goal inside a deletion then we must backtrack to the last base before the deletion

View File

@ -25,13 +25,19 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
@ -179,4 +185,20 @@ public class ReadUtilsUnitTest extends BaseTest {
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero");
}
@Test (enabled = true)
public void testReadWithNs() throws FileNotFoundException {
final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
final int readLength = 76;
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
read.setCigarString("3M414N1D73M");
final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL);
Assert.assertEquals(result, 3);
}
}