diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 45a2fa58d..ff64133a7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -415,6 +415,24 @@ public class Utils { return C; } + /** + * Concatenates byte arrays + * @return a concat of all bytes in allBytes in order + */ + public static byte[] concat(final byte[] ... allBytes) { + int size = 0; + for ( final byte[] bytes : allBytes ) size += bytes.length; + + final byte[] c = new byte[size]; + int offset = 0; + for ( final byte[] bytes : allBytes ) { + System.arraycopy(bytes, 0, c, offset, bytes.length); + offset += bytes.length; + } + + return c; + } + /** * Appends String(s) B to array A. * @param A First array. diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 76ccede62..fa0187728 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.fragments; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -56,7 +58,8 @@ import java.util.*; * Date: 3/26/11 * Time: 10:09 PM */ -public class FragmentUtils { +public final class FragmentUtils { + protected final static byte MIN_QUAL_BAD_OVERLAP = 16; private FragmentUtils() {} // private constructor /** @@ -65,18 +68,28 @@ public class FragmentUtils { * Allows us to write a generic T -> Fragment algorithm that works with any object containing * a read. * - * @param + * @param The type of the object that contains a GATKSAMRecord */ public interface ReadGetter { + /** + * Get the GATKSAMRecord associated with object + * + * @param object the thing that contains the read + * @return a non-null GATKSAMRecord read + */ public GATKSAMRecord get(T object); } - /** Identify getter for SAMRecords themselves */ + /** + * Identify getter for SAMRecords themselves + */ private final static ReadGetter SamRecordGetter = new ReadGetter() { @Override public GATKSAMRecord get(final GATKSAMRecord object) { return object; } }; - /** Gets the SAMRecord in a PileupElement */ + /** + * Gets the SAMRecord in a PileupElement + */ private final static ReadGetter PileupElementGetter = new ReadGetter() { @Override public GATKSAMRecord get(final PileupElement object) { return object.getRead(); } }; @@ -87,13 +100,20 @@ public class FragmentUtils { * and returns a FragmentCollection that contains the T objects whose underlying reads either overlap (or * not) with their mate pairs. * - * @param readContainingObjects - * @param nElements - * @param getter + * @param readContainingObjects An iterator of objects that contain GATKSAMRecords + * @param nElements the number of elements to be provided by the iterator, which is usually known upfront and + * greatly improves the efficiency of the fragment calculation + * @param getter a helper function that takes an object of type T and returns is associated GATKSAMRecord * @param - * @return + * @return a fragment collection */ - private final static FragmentCollection create(Iterable readContainingObjects, int nElements, ReadGetter getter) { + @Requires({ + "readContainingObjects != null", + "nElements >= 0", + "getter != null" + }) + @Ensures("result != null") + private static FragmentCollection create(final Iterable readContainingObjects, final int nElements, final ReadGetter getter) { Collection singletons = null; Collection> overlapping = null; Map nameMap = null; @@ -145,30 +165,69 @@ public class FragmentUtils { return new FragmentCollection(singletons, overlapping); } - public final static FragmentCollection create(ReadBackedPileup rbp) { + /** + * Create a FragmentCollection containing PileupElements from the ReadBackedPileup rbp + * @param rbp a non-null read-backed pileup. The elements in this ReadBackedPileup must be ordered + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final ReadBackedPileup rbp) { + if ( rbp == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(rbp, rbp.getNumberOfElements(), PileupElementGetter); } - public final static FragmentCollection create(List reads) { + /** + * Create a FragmentCollection containing GATKSAMRecords from a list of reads + * + * @param reads a non-null list of reads, ordered by their start location + * @return a non-null FragmentCollection + */ + @Ensures("result != null") + public static FragmentCollection create(final List reads) { + if ( reads == null ) throw new IllegalArgumentException("Pileup cannot be null"); return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { - final byte MIN_QUAL_BAD_OVERLAP = 16; + public static List mergeOverlappingPairedFragments( final List overlappingPair ) { if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } - GATKSAMRecord firstRead = overlappingPair.get(0); - GATKSAMRecord secondRead = overlappingPair.get(1); + final GATKSAMRecord firstRead = overlappingPair.get(0); + final GATKSAMRecord secondRead = overlappingPair.get(1); + + final GATKSAMRecord merged; + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { + merged = mergeOverlappingPairedFragments(secondRead, firstRead); + } else { + merged = mergeOverlappingPairedFragments(firstRead, secondRead); + } + + return merged == null ? overlappingPair : Collections.singletonList(merged); + } + + /** + * Merge two overlapping reads from the same fragment into a single super read, if possible + * + * firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks + * at the bases and alignment, and tries its best to create a meaningful synthetic single super read + * that represents the entire sequenced fragment. + * + * Assumes that firstRead starts before secondRead (according to their soft clipped starts) + * + * @param firstRead the left most read + * @param firstRead the right most read + * + * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one + */ + public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord firstRead, final GATKSAMRecord secondRead) { + if ( firstRead == null ) throw new IllegalArgumentException("firstRead cannot be null"); + if ( secondRead == null ) throw new IllegalArgumentException("secondRead cannot be null"); + if ( ! firstRead.getReadName().equals(secondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + firstRead + " and " + secondRead); if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - firstRead = overlappingPair.get(1); // swap them - secondRead = overlappingPair.get(0); - } - if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { - return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { - return overlappingPair; // fragments contain indels so don't merge them + return null; // fragments contain indels so don't merge them } final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); @@ -190,10 +249,10 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return null; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { - return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on + return null; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on } bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); @@ -237,8 +296,6 @@ public class FragmentUtils { returnRead.setBaseQualities( deletionQuals, EventType.BASE_DELETION ); } - final ArrayList returnList = new ArrayList(); - returnList.add(returnRead); - return returnList; + return returnRead; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index 705db6f85..154b000ce 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -112,6 +112,19 @@ public class UtilsUnitTest extends BaseTest { Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined)); } + @Test + public void testConcat() { + final String s1 = "A"; + final String s2 = "CC"; + final String s3 = "TTT"; + final String s4 = "GGGG"; + Assert.assertEquals(new String(Utils.concat()), ""); + Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4); + } + @Test public void testEscapeExpressions() { String[] expected, actual; diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 15d69c400..89d192f9e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -27,23 +27,30 @@ package org.broadinstitute.sting.utils.fragments; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** * Test routines for read-backed pileup. */ public class FragmentUtilsUnitTest extends BaseTest { private static SAMFileHeader header; + private static GATKSAMReadGroupRecord rgForMerged; + private final static boolean DEBUG = false; private class FragmentUtilsTest extends TestDataProvider { List statesForPileup = new ArrayList(); @@ -119,7 +126,7 @@ public class FragmentUtilsUnitTest extends BaseTest { return FragmentUtilsTest.getTests(FragmentUtilsTest.class); } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { ReadBackedPileup rbp = testState.pileup; @@ -129,7 +136,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReadsFromPileup(FragmentUtilsTest test) { for ( TestState testState : test.statesForPileup ) { FragmentCollection fp = FragmentUtils.create(testState.pileup.getReads()); @@ -138,7 +145,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "fragmentUtilsTest") + @Test(enabled = !DEBUG, dataProvider = "fragmentUtilsTest") public void testAsListOfReads(FragmentUtilsTest test) { for ( TestState testState : test.statesForReads ) { FragmentCollection fp = FragmentUtils.create(testState.rawReads); @@ -147,7 +154,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } } - @Test(enabled = true, expectedExceptions = IllegalArgumentException.class) + @Test(enabled = !DEBUG, expectedExceptions = IllegalArgumentException.class) public void testOutOfOrder() { final List pair = ArtificialSAMUtils.createPair(header, "readpair", 100, 1, 50, true, true); final GATKSAMRecord left = pair.get(0); @@ -161,5 +168,75 @@ public class FragmentUtilsUnitTest extends BaseTest { @BeforeTest public void setup() { header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + rgForMerged = new GATKSAMReadGroupRecord("RG1"); + } + + @DataProvider(name = "MergeFragmentsTest") + public Object[][] createMergeFragmentsTest() throws Exception { + List tests = new ArrayList(); + + final String leftFlank = "CCC"; + final String rightFlank = "AAA"; + final String allOverlappingBases = "ACGTACGTGGAACCTTAG"; + for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) { + final String overlappingBases = allOverlappingBases.substring(0, overlapSize); + final byte[] overlappingBaseQuals = new byte[overlapSize]; + for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = (byte)(i + 30); + final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, "", 30, 1); + final GATKSAMRecord read2 = makeOverlappingRead("", 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, leftFlank.length() + 1); + final GATKSAMRecord merged = makeOverlappingRead(leftFlank, 20, overlappingBases, overlappingBaseQuals, rightFlank, 30, 1); + tests.add(new Object[]{"equalQuals", read1, read2, merged}); + + // test that the merged read base quality is the + tests.add(new Object[]{"lowQualLeft", modifyBaseQualities(read1, leftFlank.length(), overlapSize), read2, merged}); + tests.add(new Object[]{"lowQualRight", read1, modifyBaseQualities(read2, 0, overlapSize), merged}); + } + + return tests.toArray(new Object[][]{}); + } + + private GATKSAMRecord modifyBaseQualities(final GATKSAMRecord read, final int startOffset, final int length) throws Exception { + final GATKSAMRecord readWithLowQuals = (GATKSAMRecord)read.clone(); + final byte[] withLowQuals = Arrays.copyOf(read.getBaseQualities(), read.getBaseQualities().length); + for ( int i = startOffset; i < startOffset + length; i++ ) + withLowQuals[i] = (byte)(read.getBaseQualities()[i] + (i % 2 == 0 ? -1 : 0)); + readWithLowQuals.setBaseQualities(withLowQuals); + return readWithLowQuals; + } + + private GATKSAMRecord makeOverlappingRead(final String leftFlank, final int leftQual, final String overlapBases, + final byte[] overlapQuals, final String rightFlank, final int rightQual, + final int alignmentStart) { + final String bases = leftFlank + overlapBases + rightFlank; + final int readLength = bases.length(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, alignmentStart, readLength); + final byte[] leftQuals = Utils.dupBytes((byte) leftQual, leftFlank.length()); + final byte[] rightQuals = Utils.dupBytes((byte) rightQual, rightFlank.length()); + final byte[] quals = Utils.concat(leftQuals, overlapQuals, rightQuals); + read.setCigarString(readLength + "M"); + read.setReadBases(bases.getBytes()); + for ( final EventType type : EventType.values() ) + read.setBaseQualities(quals, type); + read.setReadGroup(rgForMerged); + read.setMappingQuality(60); + return read; + } + + @Test(enabled = true, dataProvider = "MergeFragmentsTest") + public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + + if ( expectedMerged == null ) { + Assert.assertNull(actual, "Expected reads not to merge, but got non-null result from merging"); + } else { + Assert.assertNotNull(actual, "Expected reads to merge, but got null result from merging"); + // I really care about the bases, the quals, the CIGAR, and the read group tag + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } } }