Cleanup of FragmentUtils
-- Code was undocumented, big, and not well tested. All three things fixed. -- Currently not passing, but the framework works well for testing -- Added concat(byte[] ... arrays) to utils
This commit is contained in:
parent
8ed78b453f
commit
925846c65f
|
|
@ -415,6 +415,24 @@ public class Utils {
|
|||
return C;
|
||||
}
|
||||
|
||||
/**
|
||||
* Concatenates byte arrays
|
||||
* @return a concat of all bytes in allBytes in order
|
||||
*/
|
||||
public static byte[] concat(final byte[] ... allBytes) {
|
||||
int size = 0;
|
||||
for ( final byte[] bytes : allBytes ) size += bytes.length;
|
||||
|
||||
final byte[] c = new byte[size];
|
||||
int offset = 0;
|
||||
for ( final byte[] bytes : allBytes ) {
|
||||
System.arraycopy(bytes, 0, c, offset, bytes.length);
|
||||
offset += bytes.length;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends String(s) B to array A.
|
||||
* @param A First array.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.fragments;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
|
|
@ -56,7 +58,8 @@ import java.util.*;
|
|||
* Date: 3/26/11
|
||||
* Time: 10:09 PM
|
||||
*/
|
||||
public class FragmentUtils {
|
||||
public final class FragmentUtils {
|
||||
protected final static byte MIN_QUAL_BAD_OVERLAP = 16;
|
||||
private FragmentUtils() {} // private constructor
|
||||
|
||||
/**
|
||||
|
|
@ -65,18 +68,28 @@ public class FragmentUtils {
|
|||
* Allows us to write a generic T -> Fragment algorithm that works with any object containing
|
||||
* a read.
|
||||
*
|
||||
* @param <T>
|
||||
* @param <T> The type of the object that contains a GATKSAMRecord
|
||||
*/
|
||||
public interface ReadGetter<T> {
|
||||
/**
|
||||
* Get the GATKSAMRecord associated with object
|
||||
*
|
||||
* @param object the thing that contains the read
|
||||
* @return a non-null GATKSAMRecord read
|
||||
*/
|
||||
public GATKSAMRecord get(T object);
|
||||
}
|
||||
|
||||
/** Identify getter for SAMRecords themselves */
|
||||
/**
|
||||
* Identify getter for SAMRecords themselves
|
||||
*/
|
||||
private final static ReadGetter<GATKSAMRecord> SamRecordGetter = new ReadGetter<GATKSAMRecord>() {
|
||||
@Override public GATKSAMRecord get(final GATKSAMRecord object) { return object; }
|
||||
};
|
||||
|
||||
/** Gets the SAMRecord in a PileupElement */
|
||||
/**
|
||||
* Gets the SAMRecord in a PileupElement
|
||||
*/
|
||||
private final static ReadGetter<PileupElement> PileupElementGetter = new ReadGetter<PileupElement>() {
|
||||
@Override public GATKSAMRecord get(final PileupElement object) { return object.getRead(); }
|
||||
};
|
||||
|
|
@ -87,13 +100,20 @@ public class FragmentUtils {
|
|||
* and returns a FragmentCollection that contains the T objects whose underlying reads either overlap (or
|
||||
* not) with their mate pairs.
|
||||
*
|
||||
* @param readContainingObjects
|
||||
* @param nElements
|
||||
* @param getter
|
||||
* @param readContainingObjects An iterator of objects that contain GATKSAMRecords
|
||||
* @param nElements the number of elements to be provided by the iterator, which is usually known upfront and
|
||||
* greatly improves the efficiency of the fragment calculation
|
||||
* @param getter a helper function that takes an object of type T and returns is associated GATKSAMRecord
|
||||
* @param <T>
|
||||
* @return
|
||||
* @return a fragment collection
|
||||
*/
|
||||
private final static <T> FragmentCollection<T> create(Iterable<T> readContainingObjects, int nElements, ReadGetter<T> getter) {
|
||||
@Requires({
|
||||
"readContainingObjects != null",
|
||||
"nElements >= 0",
|
||||
"getter != null"
|
||||
})
|
||||
@Ensures("result != null")
|
||||
private static <T> FragmentCollection<T> create(final Iterable<T> readContainingObjects, final int nElements, final ReadGetter<T> getter) {
|
||||
Collection<T> singletons = null;
|
||||
Collection<List<T>> overlapping = null;
|
||||
Map<String, T> nameMap = null;
|
||||
|
|
@ -145,30 +165,69 @@ public class FragmentUtils {
|
|||
return new FragmentCollection<T>(singletons, overlapping);
|
||||
}
|
||||
|
||||
public final static FragmentCollection<PileupElement> create(ReadBackedPileup rbp) {
|
||||
/**
|
||||
* Create a FragmentCollection containing PileupElements from the ReadBackedPileup rbp
|
||||
* @param rbp a non-null read-backed pileup. The elements in this ReadBackedPileup must be ordered
|
||||
* @return a non-null FragmentCollection
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public static FragmentCollection<PileupElement> create(final ReadBackedPileup rbp) {
|
||||
if ( rbp == null ) throw new IllegalArgumentException("Pileup cannot be null");
|
||||
return create(rbp, rbp.getNumberOfElements(), PileupElementGetter);
|
||||
}
|
||||
|
||||
public final static FragmentCollection<GATKSAMRecord> create(List<GATKSAMRecord> reads) {
|
||||
/**
|
||||
* Create a FragmentCollection containing GATKSAMRecords from a list of reads
|
||||
*
|
||||
* @param reads a non-null list of reads, ordered by their start location
|
||||
* @return a non-null FragmentCollection
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public static FragmentCollection<GATKSAMRecord> create(final List<GATKSAMRecord> reads) {
|
||||
if ( reads == null ) throw new IllegalArgumentException("Pileup cannot be null");
|
||||
return create(reads, reads.size(), SamRecordGetter);
|
||||
}
|
||||
|
||||
public final static List<GATKSAMRecord> mergeOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
|
||||
final byte MIN_QUAL_BAD_OVERLAP = 16;
|
||||
public static List<GATKSAMRecord> mergeOverlappingPairedFragments( final List<GATKSAMRecord> overlappingPair ) {
|
||||
if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); }
|
||||
|
||||
GATKSAMRecord firstRead = overlappingPair.get(0);
|
||||
GATKSAMRecord secondRead = overlappingPair.get(1);
|
||||
final GATKSAMRecord firstRead = overlappingPair.get(0);
|
||||
final GATKSAMRecord secondRead = overlappingPair.get(1);
|
||||
|
||||
final GATKSAMRecord merged;
|
||||
if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
|
||||
merged = mergeOverlappingPairedFragments(secondRead, firstRead);
|
||||
} else {
|
||||
merged = mergeOverlappingPairedFragments(firstRead, secondRead);
|
||||
}
|
||||
|
||||
return merged == null ? overlappingPair : Collections.singletonList(merged);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge two overlapping reads from the same fragment into a single super read, if possible
|
||||
*
|
||||
* firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks
|
||||
* at the bases and alignment, and tries its best to create a meaningful synthetic single super read
|
||||
* that represents the entire sequenced fragment.
|
||||
*
|
||||
* Assumes that firstRead starts before secondRead (according to their soft clipped starts)
|
||||
*
|
||||
* @param firstRead the left most read
|
||||
* @param firstRead the right most read
|
||||
*
|
||||
* @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one
|
||||
*/
|
||||
public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord firstRead, final GATKSAMRecord secondRead) {
|
||||
if ( firstRead == null ) throw new IllegalArgumentException("firstRead cannot be null");
|
||||
if ( secondRead == null ) throw new IllegalArgumentException("secondRead cannot be null");
|
||||
if ( ! firstRead.getReadName().equals(secondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + firstRead + " and " + secondRead);
|
||||
|
||||
if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
|
||||
firstRead = overlappingPair.get(1); // swap them
|
||||
secondRead = overlappingPair.get(0);
|
||||
}
|
||||
if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) {
|
||||
return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
|
||||
return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A
|
||||
}
|
||||
if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) {
|
||||
return overlappingPair; // fragments contain indels so don't merge them
|
||||
return null; // fragments contain indels so don't merge them
|
||||
}
|
||||
|
||||
final Pair<Integer, Boolean> pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart());
|
||||
|
|
@ -190,10 +249,10 @@ public class FragmentUtils {
|
|||
}
|
||||
for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) {
|
||||
if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) {
|
||||
return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them
|
||||
return null; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them
|
||||
}
|
||||
if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) {
|
||||
return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on
|
||||
return null; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on
|
||||
}
|
||||
bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] );
|
||||
quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] );
|
||||
|
|
@ -237,8 +296,6 @@ public class FragmentUtils {
|
|||
returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION );
|
||||
}
|
||||
|
||||
final ArrayList<GATKSAMRecord> returnList = new ArrayList<GATKSAMRecord>();
|
||||
returnList.add(returnRead);
|
||||
return returnList;
|
||||
return returnRead;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,6 +112,19 @@ public class UtilsUnitTest extends BaseTest {
|
|||
Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConcat() {
|
||||
final String s1 = "A";
|
||||
final String s2 = "CC";
|
||||
final String s3 = "TTT";
|
||||
final String s4 = "GGGG";
|
||||
Assert.assertEquals(new String(Utils.concat()), "");
|
||||
Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1);
|
||||
Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2);
|
||||
Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3);
|
||||
Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEscapeExpressions() {
|
||||
String[] expected, actual;
|
||||
|
|
|
|||
|
|
@ -27,23 +27,30 @@ package org.broadinstitute.sting.utils.fragments;
|
|||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Test routines for read-backed pileup.
|
||||
*/
|
||||
public class FragmentUtilsUnitTest extends BaseTest {
|
||||
private static SAMFileHeader header;
|
||||
private static GATKSAMReadGroupRecord rgForMerged;
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
private class FragmentUtilsTest extends TestDataProvider {
|
||||
List<TestState> statesForPileup = new ArrayList<TestState>();
|
||||
|
|
@ -119,7 +126,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
|
|||
return FragmentUtilsTest.getTests(FragmentUtilsTest.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "fragmentUtilsTest")
|
||||
@Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest")
|
||||
public void testAsPileup(FragmentUtilsTest test) {
|
||||
for ( TestState testState : test.statesForPileup ) {
|
||||
ReadBackedPileup rbp = testState.pileup;
|
||||
|
|
@ -129,7 +136,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "fragmentUtilsTest")
|
||||
@Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest")
|
||||
public void testAsListOfReadsFromPileup(FragmentUtilsTest test) {
|
||||
for ( TestState testState : test.statesForPileup ) {
|
||||
FragmentCollection<GATKSAMRecord> fp = FragmentUtils.create(testState.pileup.getReads());
|
||||
|
|
@ -138,7 +145,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "fragmentUtilsTest")
|
||||
@Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest")
|
||||
public void testAsListOfReads(FragmentUtilsTest test) {
|
||||
for ( TestState testState : test.statesForReads ) {
|
||||
FragmentCollection<GATKSAMRecord> fp = FragmentUtils.create(testState.rawReads);
|
||||
|
|
@ -147,7 +154,7 @@ public class FragmentUtilsUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = IllegalArgumentException.class)
|
||||
@Test(enabled = !DEBUG, expectedExceptions = IllegalArgumentException.class)
|
||||
public void testOutOfOrder() {
|
||||
final List<GATKSAMRecord> pair = ArtificialSAMUtils.createPair(header, "readpair", 100, 1, 50, true, true);
|
||||
final GATKSAMRecord left = pair.get(0);
|
||||
|
|
@ -161,5 +168,75 @@ public class FragmentUtilsUnitTest extends BaseTest {
|
|||
@BeforeTest
|
||||
public void setup() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);
|
||||
rgForMerged = new GATKSAMReadGroupRecord("RG1");
|
||||
}
|
||||
|
||||
@DataProvider(name = "MergeFragmentsTest")
|
||||
public Object[][] createMergeFragmentsTest() throws Exception {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final String leftFlank = "CCC";
|
||||
final String rightFlank = "AAA";
|
||||
final String allOverlappingBases = "ACGTACGTGGAACCTTAG";
|
||||
for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) {
|
||||
final String overlappingBases = allOverlappingBases.substring(0, overlapSize);
|
||||
final byte[] overlappingBaseQuals = new byte[overlapSize];
|
||||
for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = (byte)(i + 30);
|
||||
final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, "", 30, 1);
|
||||
final GATKSAMRecord read2 = makeOverlappingRead("", 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, leftFlank.length() + 1);
|
||||
final GATKSAMRecord merged = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, 1);
|
||||
tests.add(new Object[]{"equalQuals", read1, read2, merged});
|
||||
|
||||
// test that the merged read base quality is the
|
||||
tests.add(new Object[]{"lowQualLeft", modifyBaseQualities(read1, leftFlank.length(), overlapSize), read2, merged});
|
||||
tests.add(new Object[]{"lowQualRight", read1, modifyBaseQualities(read2, 0, overlapSize), merged});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private GATKSAMRecord modifyBaseQualities(final GATKSAMRecord read, final int startOffset, final int length) throws Exception {
|
||||
final GATKSAMRecord readWithLowQuals = (GATKSAMRecord)read.clone();
|
||||
final byte[] withLowQuals = Arrays.copyOf(read.getBaseQualities(), read.getBaseQualities().length);
|
||||
for ( int i = startOffset; i < startOffset + length; i++ )
|
||||
withLowQuals[i] = (byte)(read.getBaseQualities()[i] + (i % 2 == 0 ? -1 : 0));
|
||||
readWithLowQuals.setBaseQualities(withLowQuals);
|
||||
return readWithLowQuals;
|
||||
}
|
||||
|
||||
private GATKSAMRecord makeOverlappingRead(final String leftFlank, final int leftQual, final String overlapBases,
|
||||
final byte[] overlapQuals, final String rightFlank, final int rightQual,
|
||||
final int alignmentStart) {
|
||||
final String bases = leftFlank + overlapBases + rightFlank;
|
||||
final int readLength = bases.length();
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, alignmentStart, readLength);
|
||||
final byte[] leftQuals = Utils.dupBytes((byte) leftQual, leftFlank.length());
|
||||
final byte[] rightQuals = Utils.dupBytes((byte) rightQual, rightFlank.length());
|
||||
final byte[] quals = Utils.concat(leftQuals, overlapQuals, rightQuals);
|
||||
read.setCigarString(readLength + "M");
|
||||
read.setReadBases(bases.getBytes());
|
||||
for ( final EventType type : EventType.values() )
|
||||
read.setBaseQualities(quals, type);
|
||||
read.setReadGroup(rgForMerged);
|
||||
read.setMappingQuality(60);
|
||||
return read;
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "MergeFragmentsTest")
|
||||
public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) {
|
||||
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
|
||||
|
||||
if ( expectedMerged == null ) {
|
||||
Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging");
|
||||
} else {
|
||||
Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging");
|
||||
// I really care about the bases, the quals, the CIGAR, and the read group tag
|
||||
Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString());
|
||||
Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases());
|
||||
Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup());
|
||||
Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality());
|
||||
for ( final EventType type : EventType.values() )
|
||||
Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue