Cleanup FragmentPileup before main repo commit

-- removed intermiate functions.  Now only original version and best optimized new version remain
-- Moved general artificial read backed pileup creation code into ArtificialSamUtils
This commit is contained in:
Mark DePristo 2011-10-24 14:40:05 -04:00
parent 166174a551
commit 502592671d
5 changed files with 116 additions and 259 deletions

View File

@ -14,9 +14,6 @@ import java.util.*;
*
* Based on the original code by E. Banks
*
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
* TODO -- algorithm that could collect all duplicates into a single super pileup element
*
* Oct 21: note that the order of the oneReadPileup and twoReadPileups are not
* defined. The algorithms that produce these lists are in fact producing
* lists of Pileup elements *NOT* sorted by alignment start position of the underlying
@ -30,11 +27,9 @@ public class FragmentPileup {
Collection<PileupElement> oneReadPile = null;
Collection<TwoReadPileupElement> twoReadPile = null;
public enum FragmentMatchingAlgorithm {
protected enum FragmentMatchingAlgorithm {
ORIGINAL,
FAST_V1,
skipNonOverlapping,
skipNonOverlappingNotLazy
}
/**
@ -42,16 +37,14 @@ public class FragmentPileup {
* @param pileup
*/
public FragmentPileup(ReadBackedPileup pileup) {
//oldSlowCalculation(pileup);
fastNewCalculation(pileup);
skipNonOverlapping(pileup);
}
/** For performance testing only */
protected FragmentPileup(ReadBackedPileup pileup, FragmentMatchingAlgorithm algorithm) {
switch ( algorithm ) {
case ORIGINAL: oldSlowCalculation(pileup); break;
case FAST_V1: fastNewCalculation(pileup); break;
case skipNonOverlapping: skipNonOverlapping(pileup); break;
case skipNonOverlappingNotLazy: skipNonOverlappingNotLazy(pileup); break;
}
}
@ -76,58 +69,36 @@ public class FragmentPileup {
oneReadPile = nameMap.values();
}
/**
* @param pileup
*/
private final void fastNewCalculation(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null; // lazy initialization
private final void skipNonOverlapping(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null;
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) {
final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
switch (ReadUtils.readMightOverlapMate(read) ) {
// we know for certain this read doesn't have an overlapping mate
case NO: {
addToOnePile(p);
break;
}
// we know that we overlap our mate, so put the read in the nameMap in
// case our mate shows up
case LEFT_YES: {
nameMap = addToNameMap(nameMap, p);
break;
}
// read starts at the same position, so we are looking at either the first or
// the second read. In the first, add it to the map, in the second grab it
// from the map and create a fragment
case SAME_START: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
nameMap = addToNameMap(nameMap, p);
}
break;
}
// in this case we need to see if our mate is already present, and if so
// grab the read from the list
case RIGHT_MAYBE: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
addToOnePile(p);
}
break;
if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
// if we know that this read won't overlap its mate, or doesn't have one, jump out early
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>(pileup.size()); // lazy init
oneReadPile.add(p);
} else {
// the read might overlap it's mate, or is the rightmost read of a pair
final String readName = p.getRead().getReadName();
final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>(); // lazy init
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
if ( nameMap == null ) nameMap = new HashMap<String, PileupElement>(pileup.size()); // lazy init
nameMap.put(readName, p);
}
}
}
// add all of the reads that are potentially overlapping but whose mate never showed
// up to the oneReadPile
if ( nameMap != null && ! nameMap.isEmpty() ) {
if ( oneReadPile == null )
oneReadPile = nameMap.values();
@ -136,90 +107,6 @@ public class FragmentPileup {
}
}
/**
* @param pileup
*/
private final void skipNonOverlappingNotLazy(final ReadBackedPileup pileup) {
oneReadPile = new ArrayList<PileupElement>(pileup.size());
twoReadPile = new ArrayList<TwoReadPileupElement>();
final Map<String, PileupElement> nameMap = new HashMap<String, PileupElement>(pileup.size());
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) {
// if we know that this read won't overlap its mate, or doesn't have one, jump out early
final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
oneReadPile.add(p);
} else {
final String readName = p.getRead().getReadName();
final PileupElement pe1 = nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
nameMap.put(readName, p);
}
}
}
oneReadPile.addAll(nameMap.values());
}
private final void skipNonOverlapping(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null;
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) {
// if we know that this read won't overlap its mate, or doesn't have one, jump out early
final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>(pileup.size());
oneReadPile.add(p);
} else {
final String readName = p.getRead().getReadName();
final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
nameMap = addToNameMap(nameMap, p);
}
}
}
if ( oneReadPile == null )
oneReadPile = nameMap == null ? Collections.<PileupElement>emptyList() : nameMap.values();
else if ( nameMap != null )
oneReadPile.addAll(nameMap.values());
}
private final Map<String, PileupElement> addToNameMap(Map<String, PileupElement> map, final PileupElement p) {
if ( map == null ) map = new HashMap<String, PileupElement>();
map.put(p.getRead().getReadName(), p);
return map;
}
private final PileupElement getFromNameMap(Map<String, PileupElement> map, final PileupElement p) {
return map == null ? null : map.get(p.getRead().getReadName());
}
private final void addToOnePile(final PileupElement p) {
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>();
oneReadPile.add(p);
}
private final void addToTwoPile(final PileupElement p1, final PileupElement p2) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(p1, p2));
}
/**
* Gets the pileup elements containing two reads, in no particular order
*

View File

@ -2,11 +2,15 @@ package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
/**
* @author aaron
@ -29,7 +33,7 @@ public class ArtificialSAMUtils {
File outFile = new File(filename);
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile);
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) {
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH));
@ -145,7 +149,7 @@ public class ArtificialSAMUtils {
*/
public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) ||
(refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) )
(refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) )
throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart);
GATKSAMRecord record = new GATKSAMRecord(header);
record.setReadName(name);
@ -197,6 +201,37 @@ public class ArtificialSAMUtils {
return rec;
}
public final static List<SAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
left.setReadPairedFlag(true);
right.setReadPairedFlag(true);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
return Arrays.asList(left, right);
}
/**
* create an iterator containing the specified read piles
*
@ -258,4 +293,52 @@ public class ArtificialSAMUtils {
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
}
private final static int ranIntInclusive(Random ran, int start, int stop) {
final int range = stop - start;
return ran.nextInt(range) + start;
}
/**
* Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with
* reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert
* size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second
* may be, depending on where this sampled insertSize puts it.
* @param header
* @param loc
* @param readLen
* @param insertSize
* @param pileupSize
* @return
*/
public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) {
final Random ran = new Random();
final boolean leftIsFirst = true;
final boolean leftIsNegative = false;
final int insertSizeVariation = insertSize / 10;
final int pos = loc.getStart();
final List<PileupElement> pileupElements = new ArrayList<PileupElement>();
for ( int i = 0; i < pileupSize / 2; i++ ) {
final String readName = "read" + i;
final int leftStart = ranIntInclusive(ran, 1, pos);
final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize);
final int rightStart = leftStart + fragmentSize - readLen;
if ( rightStart <= 0 ) continue;
List<SAMRecord> pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
final SAMRecord left = pair.get(0);
final SAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart));
if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) {
pileupElements.add(new PileupElement(right, pos - rightStart));
}
}
Collections.sort(pileupElements);
return new ReadBackedPileupImpl(loc, pileupElements);
}
}

View File

@ -214,54 +214,6 @@ public class ReadUtils {
return state;
}
/**
* s1 e1
* |-----------------------> [record in hand]
* s2
* <-----------------------|
*
* s1, e1, and s2 are all in the record. Assuming that s1 < s2 (we are the left most read),
* we can compute whether we overlap with our mate by seeing if s2 <= e1 or no. If e1 <
* s2 then we known that we cannot over.
*
* If we are looking at the right read
*
* s1
* |----------------------->
* s2 e2
* <-----------------------| [record in hand]
*
* we know the position of s1 and s2, but we don't know e1, so we cannot tell if we
* overlap with our mate or not, so in this case we return MAYBE.
*
* Note that if rec has an unmapped mate or is unpaired we certainly know the answer
*
* @param rec
* @return
*/
public static ReadOverlapsMateType readMightOverlapMate(final SAMRecord rec) {
if ( ! rec.getReadPairedFlag() || rec.getMateUnmappedFlag() ) {
return ReadOverlapsMateType.NO;
} else { // read is actually paired
final int recStart = rec.getAlignmentStart();
final int recEnd = rec.getAlignmentEnd();
final int mateStart = rec.getMateAlignmentStart();
if ( recStart < mateStart ) {
// we are the left most read
return mateStart <= recEnd ? ReadOverlapsMateType.LEFT_YES: ReadOverlapsMateType.NO;
} else if ( recStart == mateStart ) {
// we are the left most read
return ReadOverlapsMateType.SAME_START;
} else {
// we are the right most read, so we cannot tell
return ReadOverlapsMateType.RIGHT_MAYBE;
}
}
}
public enum ReadOverlapsMateType { LEFT_YES, NO, SAME_START, RIGHT_MAYBE }
private static Pair<Integer, Integer> getAdaptorBoundaries(SAMRecord rec, int adaptorLength) {
int isize = rec.getInferredInsertSize();
if ( isize == 0 )

View File

@ -53,38 +53,12 @@ public class FragmentPileupBenchmark extends SimpleBenchmark {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser genomeLocParser;
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
final int pos = 50;
GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", pos);
final Random ran = new Random();
GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 50);
final int readLen = 100;
final boolean leftIsFirst = true;
final boolean leftIsNegative = false;
final int insertSizeVariation = insertSize / 10;
for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) {
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
for ( int i = 0; i < pileupSize / 2; i++ ) {
final String readName = "read" + i;
final int leftStart = new Random().nextInt(49) + 1;
final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize);
final int rightStart = leftStart + fragmentSize - readLen;
if ( rightStart <= 0 ) continue;
List<SAMRecord> pair = FragmentPileupUnitTest.createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart));
if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) {
pileupElements.add(new PileupElement(right, pos - rightStart));
}
}
Collections.sort(pileupElements);
pileups.add(new ReadBackedPileupImpl(loc, pileupElements));
ReadBackedPileup rbp = ArtificialSAMUtils.createReadBackedPileup(header, loc, readLen, insertSize, pileupSize);
pileups.add(rbp);
}
}
@ -100,18 +74,10 @@ public class FragmentPileupBenchmark extends SimpleBenchmark {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.ORIGINAL);
}
public void timeFullOverlapPotential(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.FAST_V1);
}
public void timeSkipNonOverlapping(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlapping);
}
public void timeSkipNonOverlappingNotLazy(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlappingNotLazy);
}
public static void main(String[] args) {
CaliperMain.main(FragmentPileupBenchmark.class, args);
}

View File

@ -43,44 +43,13 @@ import java.util.*;
public class FragmentPileupUnitTest extends BaseTest {
private static SAMFileHeader header;
public final static List<SAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
left.setReadPairedFlag(true);
right.setReadPairedFlag(true);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
return Arrays.asList(left, right);
}
private class FragmentPileupTest extends TestDataProvider {
List<TestState> states = new ArrayList<TestState>();
private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative));
List<SAMRecord> pair = createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
List<SAMRecord> pair = ArtificialSAMUtils.createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1);