Expanded unit tests for AlignmentUtils
-- Added JIRA entries for the remaining capabilities to be fixed up and unit tested
This commit is contained in:
parent
d82b855c4c
commit
63e68a725f
|
|
@ -40,9 +40,16 @@ import org.broadinstitute.sting.utils.recalibration.EventType;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class AlignmentUtils {
|
||||
public final class AlignmentUtils {
|
||||
private final static EnumSet<CigarOperator> ALIGNED_TO_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X);
|
||||
private final static EnumSet<CigarOperator> ALIGNED_TO_GENOME_PLUS_SOFTCLIPS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
|
||||
|
||||
// cannot be instantiated
|
||||
private AlignmentUtils() { }
|
||||
|
||||
public static class MismatchCount {
|
||||
public int numMismatches = 0;
|
||||
|
|
@ -118,103 +125,6 @@ public class AlignmentUtils {
|
|||
return mc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mismatches in the pileup within the given reference context.
|
||||
*
|
||||
* @param pileup the pileup with reads
|
||||
* @param ref the reference context
|
||||
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||
* @return the number of mismatches
|
||||
*/
|
||||
public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||
int mismatches = 0;
|
||||
for (PileupElement p : pileup)
|
||||
mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite);
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mismatches in the pileup element within the given reference context.
|
||||
*
|
||||
* @param p the pileup element
|
||||
* @param ref the reference context
|
||||
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||
* @return the number of mismatches
|
||||
*/
|
||||
public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) {
|
||||
return mismatchesInRefWindow(p, ref, ignoreTargetSite, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mismatches in the pileup element within the given reference context.
|
||||
*
|
||||
* @param p the pileup element
|
||||
* @param ref the reference context
|
||||
* @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window)
|
||||
* @param qualitySumInsteadOfMismatchCount
|
||||
* if true, return the quality score sum of the mismatches rather than the count
|
||||
* @return the number of mismatches
|
||||
*/
|
||||
public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) {
|
||||
int sum = 0;
|
||||
|
||||
int windowStart = ref.getWindow().getStart();
|
||||
int windowStop = ref.getWindow().getStop();
|
||||
byte[] refBases = ref.getBases();
|
||||
byte[] readBases = p.getRead().getReadBases();
|
||||
byte[] readQualities = p.getRead().getBaseQualities();
|
||||
Cigar c = p.getRead().getCigar();
|
||||
|
||||
int readIndex = 0;
|
||||
int currentPos = p.getRead().getAlignmentStart();
|
||||
int refIndex = Math.max(0, currentPos - windowStart);
|
||||
|
||||
for (int i = 0; i < c.numCigarElements(); i++) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
int cigarElementLength = ce.getLength();
|
||||
switch (ce.getOperator()) {
|
||||
case EQ:
|
||||
case X:
|
||||
case M:
|
||||
for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
|
||||
// are we past the ref window?
|
||||
if (currentPos > windowStop)
|
||||
break;
|
||||
|
||||
// are we before the ref window?
|
||||
if (currentPos < windowStart)
|
||||
continue;
|
||||
|
||||
byte refChr = refBases[refIndex++];
|
||||
|
||||
// do we need to skip the target site?
|
||||
if (ignoreTargetSite && ref.getLocus().getStart() == currentPos)
|
||||
continue;
|
||||
|
||||
byte readChr = readBases[readIndex];
|
||||
if (readChr != refChr)
|
||||
sum += (qualitySumInsteadOfMismatchCount) ? readQualities[readIndex] : 1;
|
||||
}
|
||||
break;
|
||||
case I:
|
||||
case S:
|
||||
readIndex += cigarElementLength;
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
currentPos += cigarElementLength;
|
||||
if (currentPos > windowStart)
|
||||
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
|
||||
break;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment.
|
||||
* This method follows closely the SAMRecord::getAlignmentBlocks() implemented in samtools library, but
|
||||
|
|
@ -225,31 +135,54 @@ public class AlignmentUtils {
|
|||
* @param r alignment
|
||||
* @return number of continuous alignment blocks (i.e. 'M' elements of the cigar; all indel and clipping elements are ignored).
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public static int getNumAlignmentBlocks(final SAMRecord r) {
|
||||
int n = 0;
|
||||
if ( r == null ) throw new IllegalArgumentException("read cannot be null");
|
||||
final Cigar cigar = r.getCigar();
|
||||
if (cigar == null) return 0;
|
||||
|
||||
int n = 0;
|
||||
for (final CigarElement e : cigar.getCigarElements()) {
|
||||
if (e.getOperator() == CigarOperator.M) n++;
|
||||
if (ALIGNED_TO_GENOME_OPERATORS.contains(e.getOperator()))
|
||||
n++;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
public static int getNumAlignedBasesCountingSoftClips(final SAMRecord r) {
|
||||
|
||||
/**
|
||||
* Get the number of bases aligned to the genome, including soft clips
|
||||
*
|
||||
* If read is not mapped (i.e., doesn't have a cigar) returns 0
|
||||
*
|
||||
* @param r a non-null GATKSAMRecord
|
||||
* @return the number of bases aligned to the genome in R, including soft clipped bases
|
||||
*/
|
||||
public static int getNumAlignedBasesCountingSoftClips(final GATKSAMRecord r) {
|
||||
int n = 0;
|
||||
final Cigar cigar = r.getCigar();
|
||||
if (cigar == null) return 0;
|
||||
|
||||
for (final CigarElement e : cigar.getCigarElements())
|
||||
if (e.getOperator() == CigarOperator.M || e.getOperator() == CigarOperator.S)
|
||||
if (ALIGNED_TO_GENOME_PLUS_SOFTCLIPS.contains(e.getOperator()))
|
||||
n += e.getLength();
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count the number of bases hard clipped from read
|
||||
*
|
||||
* If read's cigar is null, return 0
|
||||
*
|
||||
* @param r a non-null read
|
||||
* @return a positive integer
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public static int getNumHardClippedBases(final SAMRecord r) {
|
||||
if ( r == null ) throw new IllegalArgumentException("Read cannot be null");
|
||||
|
||||
int n = 0;
|
||||
final Cigar cigar = r.getCigar();
|
||||
if (cigar == null) return 0;
|
||||
|
|
@ -264,7 +197,9 @@ public class AlignmentUtils {
|
|||
/**
|
||||
* Calculate the number of bases that are soft clipped in read with quality score greater than threshold
|
||||
*
|
||||
* @param read a non-null GATKSAMRecord
|
||||
* Handles the case where the cigar is null (i.e., the read is unmapped), returning 0
|
||||
*
|
||||
* @param read a non-null GATKSAMRecord.
|
||||
* @param qualThreshold consider bases with quals > this value as high quality. Must be >= 0
|
||||
* @return positive integer
|
||||
*/
|
||||
|
|
@ -273,6 +208,9 @@ public class AlignmentUtils {
|
|||
if ( read == null ) throw new IllegalArgumentException("Read cannot be null");
|
||||
if ( qualThreshold < 0 ) throw new IllegalArgumentException("Expected qualThreshold to be a positive byte but saw " + qualThreshold);
|
||||
|
||||
if ( read.getCigar() == null ) // the read is unmapped
|
||||
return 0;
|
||||
|
||||
final byte[] qual = read.getBaseQualities( EventType.BASE_SUBSTITUTION );
|
||||
|
||||
int numHQSoftClips = 0;
|
||||
|
|
@ -300,16 +238,12 @@ public class AlignmentUtils {
|
|||
}
|
||||
|
||||
public static int calcAlignmentByteArrayOffset(final Cigar cigar, final PileupElement pileupElement, final int alignmentStart, final int refLocus) {
|
||||
return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), false, pileupElement.isDeletion(), alignmentStart, refLocus );
|
||||
return calcAlignmentByteArrayOffset( cigar, pileupElement.getOffset(), pileupElement.isDeletion(), alignmentStart, refLocus );
|
||||
}
|
||||
|
||||
public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isInsertionAtBeginningOfRead, final boolean isDeletion, final int alignmentStart, final int refLocus) {
|
||||
public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) {
|
||||
int pileupOffset = offset;
|
||||
|
||||
// Special case for reads starting with insertion
|
||||
if (isInsertionAtBeginningOfRead)
|
||||
return 0;
|
||||
|
||||
// Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases
|
||||
if (isDeletion) {
|
||||
pileupOffset = refLocus - alignmentStart;
|
||||
|
|
@ -462,47 +396,27 @@ public class AlignmentUtils {
|
|||
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and
|
||||
* alignment reference index/start.
|
||||
*
|
||||
* @param r record
|
||||
* Our life would be so much easier if all sam files followed the specs. In reality,
|
||||
* sam files (including those generated by maq or bwa) miss headers altogether. When
|
||||
* reading such a SAM file, reference name is set, but since there is no sequence dictionary,
|
||||
* null is always returned for referenceIndex. Let's be paranoid here, and make sure that
|
||||
* we do not call the read "unmapped" when it has only reference name set with ref. index missing
|
||||
* or vice versa.
|
||||
*
|
||||
* @param r a non-null record
|
||||
* @return true if read is unmapped
|
||||
*/
|
||||
public static boolean isReadUnmapped(final SAMRecord r) {
|
||||
if ( r == null ) throw new IllegalArgumentException("Read cannot be null");
|
||||
|
||||
if (r.getReadUnmappedFlag()) return true;
|
||||
|
||||
// our life would be so much easier if all sam files followed the specs. In reality,
|
||||
// sam files (including those generated by maq or bwa) miss headers altogether. When
|
||||
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
|
||||
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
|
||||
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
|
||||
// or vice versa.
|
||||
if ((r.getReferenceIndex() != null && r.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
|
||||
|| r.getReferenceName() != null && !r.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME))
|
||||
&& r.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Due to (unfortunate) multiple ways to indicate that read/mate is unmapped allowed by SAM format
|
||||
* specification, one may need this convenience shortcut. Checks both 'mate unmapped' flag and
|
||||
* alignment reference index/start of the mate.
|
||||
*
|
||||
* @param r sam record for the read
|
||||
* @return true if read's mate is unmapped
|
||||
*/
|
||||
public static boolean isMateUnmapped(final SAMRecord r) {
|
||||
if (r.getMateUnmappedFlag()) return true;
|
||||
|
||||
// our life would be so much easier if all sam files followed the specs. In reality,
|
||||
// sam files (including those generated by maq or bwa) miss headers altogether. When
|
||||
// reading such a SAM file, reference name is set, but since there is no sequence dictionary,
|
||||
// null is always returned for referenceIndex. Let's be paranoid here, and make sure that
|
||||
// we do not call the read "unmapped" when it has only reference name set with ref. index missing
|
||||
// or vice versa.
|
||||
if ((r.getMateReferenceIndex() != null && r.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX
|
||||
|| r.getMateReferenceName() != null && !r.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME))
|
||||
&& r.getMateAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Need a well-formed, consolidated Cigar string so that the left aligning code works properly.
|
||||
* For example, 1M1M1M1D2M1M --> 3M1D3M
|
||||
|
|
@ -592,22 +506,41 @@ public class AlignmentUtils {
|
|||
return cigar;
|
||||
}
|
||||
|
||||
private static boolean cigarHasZeroSizeElement(Cigar c) {
|
||||
for (CigarElement ce : c.getCigarElements()) {
|
||||
/**
|
||||
* Does one of the elements in cigar have a 0 length?
|
||||
*
|
||||
* @param c a non-null cigar
|
||||
* @return true if any element has 0 size
|
||||
*/
|
||||
@Requires("c != null")
|
||||
protected static boolean cigarHasZeroSizeElement(final Cigar c) {
|
||||
for (final CigarElement ce : c.getCigarElements()) {
|
||||
if (ce.getLength() == 0)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static Cigar cleanUpCigar(Cigar c) {
|
||||
ArrayList<CigarElement> elements = new ArrayList<CigarElement>(c.numCigarElements() - 1);
|
||||
for (CigarElement ce : c.getCigarElements()) {
|
||||
if (ce.getLength() != 0 &&
|
||||
(elements.size() != 0 || ce.getOperator() != CigarOperator.D)) {
|
||||
/**
|
||||
* Clean up the incoming cigar
|
||||
*
|
||||
* Removes elements with zero size
|
||||
* Clips away beginning deletion operators
|
||||
*
|
||||
* @param c the cigar string we want to clean up
|
||||
* @return a newly allocated, cleaned up Cigar
|
||||
*/
|
||||
@Requires("c != null")
|
||||
@Ensures("result != null")
|
||||
private static Cigar cleanUpCigar(final Cigar c) {
|
||||
final List<CigarElement> elements = new ArrayList<CigarElement>(c.numCigarElements() - 1);
|
||||
|
||||
for (final CigarElement ce : c.getCigarElements()) {
|
||||
if (ce.getLength() != 0 && (! elements.isEmpty() || ce.getOperator() != CigarOperator.D)) {
|
||||
elements.add(ce);
|
||||
}
|
||||
}
|
||||
|
||||
return new Cigar(elements);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,9 +33,7 @@ import org.testng.annotations.BeforeClass;
|
|||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
public class AlignmentUtilsUnitTest {
|
||||
private SAMFileHeader header;
|
||||
|
|
@ -127,6 +125,114 @@ public class AlignmentUtilsUnitTest {
|
|||
ArtificialSAMUtils.DEFAULT_READ_LENGTH);
|
||||
}
|
||||
|
||||
private final List<List<CigarElement>> makeCigarElementCombinations() {
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
final List<CigarElement> cigarElements = new LinkedList<CigarElement>();
|
||||
for ( final int size : Arrays.asList(0, 10) ) {
|
||||
for ( final CigarOperator op : CigarOperator.values() ) {
|
||||
cigarElements.add(new CigarElement(size, op));
|
||||
}
|
||||
}
|
||||
|
||||
final List<List<CigarElement>> combinations = new LinkedList<List<CigarElement>>();
|
||||
for ( final int nElements : Arrays.asList(1, 2, 3) ) {
|
||||
combinations.addAll(Utils.makePermutations(cigarElements, nElements, true));
|
||||
}
|
||||
|
||||
return combinations;
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "NumAlignedBasesCountingSoftClips")
|
||||
public Object[][] makeNumAlignedBasesCountingSoftClips() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final EnumSet<CigarOperator> alignedToGenome = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.S);
|
||||
for ( final List<CigarElement> elements : makeCigarElementCombinations() ) {
|
||||
int n = 0;
|
||||
for ( final CigarElement elt : elements ) n += alignedToGenome.contains(elt.getOperator()) ? elt.getLength() : 0;
|
||||
tests.add(new Object[]{new Cigar(elements), n});
|
||||
}
|
||||
|
||||
tests.add(new Object[]{null, 0});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "NumAlignedBasesCountingSoftClips")
|
||||
public void testNumAlignedBasesCountingSoftClips(final Cigar cigar, final int expected) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength());
|
||||
read.setCigar(cigar);
|
||||
Assert.assertEquals(AlignmentUtils.getNumAlignedBasesCountingSoftClips(read), expected, "Cigar " + cigar + " failed NumAlignedBasesCountingSoftClips");
|
||||
}
|
||||
|
||||
@DataProvider(name = "CigarHasZeroElement")
|
||||
public Object[][] makeCigarHasZeroElement() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final List<CigarElement> elements : makeCigarElementCombinations() ) {
|
||||
boolean hasZero = false;
|
||||
for ( final CigarElement elt : elements ) hasZero = hasZero || elt.getLength() == 0;
|
||||
tests.add(new Object[]{new Cigar(elements), hasZero});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CigarHasZeroElement")
|
||||
public void testCigarHasZeroSize(final Cigar cigar, final boolean hasZero) {
|
||||
Assert.assertEquals(AlignmentUtils.cigarHasZeroSizeElement(cigar), hasZero, "Cigar " + cigar.toString() + " failed cigarHasZeroSizeElement");
|
||||
}
|
||||
|
||||
@DataProvider(name = "NumHardClipped")
|
||||
public Object[][] makeNumHardClipped() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final List<CigarElement> elements : makeCigarElementCombinations() ) {
|
||||
int n = 0;
|
||||
for ( final CigarElement elt : elements ) n += elt.getOperator() == CigarOperator.H ? elt.getLength() : 0;
|
||||
tests.add(new Object[]{new Cigar(elements), n});
|
||||
}
|
||||
|
||||
tests.add(new Object[]{null, 0});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "NumHardClipped")
|
||||
public void testNumHardClipped(final Cigar cigar, final int expected) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength());
|
||||
read.setCigar(cigar);
|
||||
Assert.assertEquals(AlignmentUtils.getNumHardClippedBases(read), expected, "Cigar " + cigar + " failed num hard clips");
|
||||
}
|
||||
|
||||
@DataProvider(name = "NumAlignedBlocks")
|
||||
public Object[][] makeNumAlignedBlocks() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final List<CigarElement> elements : makeCigarElementCombinations() ) {
|
||||
int n = 0;
|
||||
for ( final CigarElement elt : elements ) {
|
||||
switch ( elt.getOperator() ) {
|
||||
case M:case X:case EQ: n++; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
tests.add(new Object[]{new Cigar(elements), n});
|
||||
}
|
||||
|
||||
tests.add(new Object[]{null, 0});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "NumAlignedBlocks")
|
||||
public void testNumAlignedBlocks(final Cigar cigar, final int expected) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, cigar == null ? 10 : cigar.getReadLength());
|
||||
read.setCigar(cigar);
|
||||
Assert.assertEquals(AlignmentUtils.getNumAlignmentBlocks(read), expected, "Cigar " + cigar + " failed NumAlignedBlocks");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConsolidateCigar() {
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue