diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java index e811db355..4eda7c7cd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java @@ -14,9 +14,6 @@ import java.util.*; * * Based on the original code by E. Banks * - * TODO -- technically we could generalize this code to support a pseudo-duplicate marking - * TODO -- algorithm that could collect all duplicates into a single super pileup element - * * Oct 21: note that the order of the oneReadPileup and twoReadPileups are not * defined. The algorithms that produce these lists are in fact producing * lists of Pileup elements *NOT* sorted by alignment start position of the underlying @@ -30,11 +27,9 @@ public class FragmentPileup { Collection oneReadPile = null; Collection twoReadPile = null; - public enum FragmentMatchingAlgorithm { + protected enum FragmentMatchingAlgorithm { ORIGINAL, - FAST_V1, skipNonOverlapping, - skipNonOverlappingNotLazy } /** @@ -42,16 +37,14 @@ public class FragmentPileup { * @param pileup */ public FragmentPileup(ReadBackedPileup pileup) { - //oldSlowCalculation(pileup); - fastNewCalculation(pileup); + skipNonOverlapping(pileup); } + /** For performance testing only */ protected FragmentPileup(ReadBackedPileup pileup, FragmentMatchingAlgorithm algorithm) { switch ( algorithm ) { case ORIGINAL: oldSlowCalculation(pileup); break; - case FAST_V1: fastNewCalculation(pileup); break; case skipNonOverlapping: skipNonOverlapping(pileup); break; - case skipNonOverlappingNotLazy: skipNonOverlappingNotLazy(pileup); break; } } @@ -76,58 +69,36 @@ public class FragmentPileup { oneReadPile = nameMap.values(); } - /** - * @param pileup - */ - private final void fastNewCalculation(final ReadBackedPileup pileup) { - Map nameMap = null; // lazy initialization + private final void skipNonOverlapping(final ReadBackedPileup pileup) { + Map nameMap = null; + // build an initial map, grabbing all of the multi-read fragments for ( final PileupElement p : pileup ) { final SAMRecord read = p.getRead(); + final int mateStart = read.getMateAlignmentStart(); - switch (ReadUtils.readMightOverlapMate(read) ) { - // we know for certain this read doesn't have an overlapping mate - case NO: { - addToOnePile(p); - break; - } - - // we know that we overlap our mate, so put the read in the nameMap in - // case our mate shows up - case LEFT_YES: { - nameMap = addToNameMap(nameMap, p); - break; - } - - // read starts at the same position, so we are looking at either the first or - // the second read. In the first, add it to the map, in the second grab it - // from the map and create a fragment - case SAME_START: { - final PileupElement pe1 = getFromNameMap(nameMap, p); - if ( pe1 != null ) { - addToTwoPile(pe1, p); - nameMap.remove(p.getRead().getReadName()); - } else { - nameMap = addToNameMap(nameMap, p); - } - break; - } - - // in this case we need to see if our mate is already present, and if so - // grab the read from the list - case RIGHT_MAYBE: { - final PileupElement pe1 = getFromNameMap(nameMap, p); - if ( pe1 != null ) { - addToTwoPile(pe1, p); - nameMap.remove(p.getRead().getReadName()); - } else { - addToOnePile(p); - } - break; + if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) { + // if we know that this read won't overlap its mate, or doesn't have one, jump out early + if ( oneReadPile == null ) oneReadPile = new ArrayList(pileup.size()); // lazy init + oneReadPile.add(p); + } else { + // the read might overlap it's mate, or is the rightmost read of a pair + final String readName = p.getRead().getReadName(); + final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName); + if ( pe1 != null ) { + // assumes we have at most 2 reads per fragment + if ( twoReadPile == null ) twoReadPile = new ArrayList(); // lazy init + twoReadPile.add(new TwoReadPileupElement(pe1, p)); + nameMap.remove(readName); + } else { + if ( nameMap == null ) nameMap = new HashMap(pileup.size()); // lazy init + nameMap.put(readName, p); } } } + // add all of the reads that are potentially overlapping but whose mate never showed + // up to the oneReadPile if ( nameMap != null && ! nameMap.isEmpty() ) { if ( oneReadPile == null ) oneReadPile = nameMap.values(); @@ -136,90 +107,6 @@ public class FragmentPileup { } } - /** - * @param pileup - */ - private final void skipNonOverlappingNotLazy(final ReadBackedPileup pileup) { - oneReadPile = new ArrayList(pileup.size()); - twoReadPile = new ArrayList(); - final Map nameMap = new HashMap(pileup.size()); - - // build an initial map, grabbing all of the multi-read fragments - for ( final PileupElement p : pileup ) { - // if we know that this read won't overlap its mate, or doesn't have one, jump out early - final SAMRecord read = p.getRead(); - final int mateStart = read.getMateAlignmentStart(); - if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) { - oneReadPile.add(p); - } else { - final String readName = p.getRead().getReadName(); - final PileupElement pe1 = nameMap.get(readName); - if ( pe1 != null ) { - // assumes we have at most 2 reads per fragment - twoReadPile.add(new TwoReadPileupElement(pe1, p)); - nameMap.remove(readName); - } else { - nameMap.put(readName, p); - } - } - } - - oneReadPile.addAll(nameMap.values()); - } - - private final void skipNonOverlapping(final ReadBackedPileup pileup) { - Map nameMap = null; - - // build an initial map, grabbing all of the multi-read fragments - for ( final PileupElement p : pileup ) { - // if we know that this read won't overlap its mate, or doesn't have one, jump out early - final SAMRecord read = p.getRead(); - final int mateStart = read.getMateAlignmentStart(); - if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) { - if ( oneReadPile == null ) oneReadPile = new ArrayList(pileup.size()); - oneReadPile.add(p); - } else { - final String readName = p.getRead().getReadName(); - final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName); - if ( pe1 != null ) { - // assumes we have at most 2 reads per fragment - if ( twoReadPile == null ) twoReadPile = new ArrayList(); - twoReadPile.add(new TwoReadPileupElement(pe1, p)); - nameMap.remove(readName); - } else { - nameMap = addToNameMap(nameMap, p); - } - } - } - - if ( oneReadPile == null ) - oneReadPile = nameMap == null ? Collections.emptyList() : nameMap.values(); - else if ( nameMap != null ) - oneReadPile.addAll(nameMap.values()); - } - - private final Map addToNameMap(Map map, final PileupElement p) { - if ( map == null ) map = new HashMap(); - map.put(p.getRead().getReadName(), p); - return map; - } - - private final PileupElement getFromNameMap(Map map, final PileupElement p) { - return map == null ? null : map.get(p.getRead().getReadName()); - } - - - private final void addToOnePile(final PileupElement p) { - if ( oneReadPile == null ) oneReadPile = new ArrayList(); - oneReadPile.add(p); - } - - private final void addToTwoPile(final PileupElement p1, final PileupElement p2) { - // assumes we have at most 2 reads per fragment - if ( twoReadPile == null ) twoReadPile = new ArrayList(); - twoReadPile.add(new TwoReadPileupElement(p1, p2)); - } - /** * Gets the pileup elements containing two reads, in no particular order * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 4eba32383..1b3641128 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -2,11 +2,15 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import java.io.File; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * @author aaron @@ -29,7 +33,7 @@ public class ArtificialSAMUtils { File outFile = new File(filename); SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); - + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); @@ -145,7 +149,7 @@ public class ArtificialSAMUtils { */ public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(name); @@ -197,6 +201,37 @@ public class ArtificialSAMUtils { return rec; } + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); + SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(! leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + /** * create an iterator containing the specified read piles * @@ -258,4 +293,52 @@ public class ArtificialSAMUtils { return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } + + private final static int ranIntInclusive(Random ran, int start, int stop) { + final int range = stop - start; + return ran.nextInt(range) + start; + } + + /** + * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with + * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert + * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second + * may be, depending on where this sampled insertSize puts it. + * @param header + * @param loc + * @param readLen + * @param insertSize + * @param pileupSize + * @return + */ + public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { + final Random ran = new Random(); + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + final int pos = loc.getStart(); + + final List pileupElements = new ArrayList(); + for ( int i = 0; i < pileupSize / 2; i++ ) { + final String readName = "read" + i; + final int leftStart = ranIntInclusive(ran, 1, pos); + final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if ( rightStart <= 0 ) continue; + + List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + final SAMRecord left = pair.get(0); + final SAMRecord right = pair.get(1); + + pileupElements.add(new PileupElement(left, pos - leftStart)); + + if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { + pileupElements.add(new PileupElement(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + return new ReadBackedPileupImpl(loc, pileupElements); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f093e6539..f8e4927ed 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -214,54 +214,6 @@ public class ReadUtils { return state; } - /** - * s1 e1 - * |-----------------------> [record in hand] - * s2 - * <-----------------------| - * - * s1, e1, and s2 are all in the record. Assuming that s1 < s2 (we are the left most read), - * we can compute whether we overlap with our mate by seeing if s2 <= e1 or no. If e1 < - * s2 then we known that we cannot over. - * - * If we are looking at the right read - * - * s1 - * |-----------------------> - * s2 e2 - * <-----------------------| [record in hand] - * - * we know the position of s1 and s2, but we don't know e1, so we cannot tell if we - * overlap with our mate or not, so in this case we return MAYBE. - * - * Note that if rec has an unmapped mate or is unpaired we certainly know the answer - * - * @param rec - * @return - */ - public static ReadOverlapsMateType readMightOverlapMate(final SAMRecord rec) { - if ( ! rec.getReadPairedFlag() || rec.getMateUnmappedFlag() ) { - return ReadOverlapsMateType.NO; - } else { // read is actually paired - final int recStart = rec.getAlignmentStart(); - final int recEnd = rec.getAlignmentEnd(); - final int mateStart = rec.getMateAlignmentStart(); - - if ( recStart < mateStart ) { - // we are the left most read - return mateStart <= recEnd ? ReadOverlapsMateType.LEFT_YES: ReadOverlapsMateType.NO; - } else if ( recStart == mateStart ) { - // we are the left most read - return ReadOverlapsMateType.SAME_START; - } else { - // we are the right most read, so we cannot tell - return ReadOverlapsMateType.RIGHT_MAYBE; - } - } - } - - public enum ReadOverlapsMateType { LEFT_YES, NO, SAME_START, RIGHT_MAYBE } - private static Pair getAdaptorBoundaries(SAMRecord rec, int adaptorLength) { int isize = rec.getInferredInsertSize(); if ( isize == 0 ) diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java index 6a5378d46..8b797def4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupBenchmark.java @@ -53,38 +53,12 @@ public class FragmentPileupBenchmark extends SimpleBenchmark { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); GenomeLocParser genomeLocParser; genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - final int pos = 50; - GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", pos); - - final Random ran = new Random(); + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 50); final int readLen = 100; - final boolean leftIsFirst = true; - final boolean leftIsNegative = false; - final int insertSizeVariation = insertSize / 10; for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) { - List pileupElements = new ArrayList(); - for ( int i = 0; i < pileupSize / 2; i++ ) { - final String readName = "read" + i; - final int leftStart = new Random().nextInt(49) + 1; - final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); - final int rightStart = leftStart + fragmentSize - readLen; - - if ( rightStart <= 0 ) continue; - - List pair = FragmentPileupUnitTest.createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); - SAMRecord left = pair.get(0); - SAMRecord right = pair.get(1); - - pileupElements.add(new PileupElement(left, pos - leftStart)); - - if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { - pileupElements.add(new PileupElement(right, pos - rightStart)); - } - } - - Collections.sort(pileupElements); - pileups.add(new ReadBackedPileupImpl(loc, pileupElements)); + ReadBackedPileup rbp = ArtificialSAMUtils.createReadBackedPileup(header, loc, readLen, insertSize, pileupSize); + pileups.add(rbp); } } @@ -100,18 +74,10 @@ public class FragmentPileupBenchmark extends SimpleBenchmark { run(rep, FragmentPileup.FragmentMatchingAlgorithm.ORIGINAL); } - public void timeFullOverlapPotential(int rep) { - run(rep, FragmentPileup.FragmentMatchingAlgorithm.FAST_V1); - } - public void timeSkipNonOverlapping(int rep) { run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlapping); } - public void timeSkipNonOverlappingNotLazy(int rep) { - run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlappingNotLazy); - } - public static void main(String[] args) { CaliperMain.main(FragmentPileupBenchmark.class, args); } diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java index be4ecc0c6..c42c01c65 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/FragmentPileupUnitTest.java @@ -43,44 +43,13 @@ import java.util.*; public class FragmentPileupUnitTest extends BaseTest { private static SAMFileHeader header; - public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { - SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); - SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); - - left.setReadPairedFlag(true); - right.setReadPairedFlag(true); - - left.setProperPairFlag(true); - right.setProperPairFlag(true); - - left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(! leftIsFirst); - - left.setReadNegativeStrandFlag(leftIsNegative); - left.setMateNegativeStrandFlag(!leftIsNegative); - right.setReadNegativeStrandFlag(!leftIsNegative); - right.setMateNegativeStrandFlag(leftIsNegative); - - left.setMateAlignmentStart(right.getAlignmentStart()); - right.setMateAlignmentStart(left.getAlignmentStart()); - - left.setMateReferenceIndex(0); - right.setMateReferenceIndex(0); - - int isize = rightStart + readLen - leftStart; - left.setInferredInsertSize(isize); - right.setInferredInsertSize(-isize); - - return Arrays.asList(left, right); - } - private class FragmentPileupTest extends TestDataProvider { List states = new ArrayList(); private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative)); - List pair = createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + List pair = ArtificialSAMUtils.createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); SAMRecord left = pair.get(0); SAMRecord right = pair.get(1);