Cleanup FragmentPileup before main repo commit

-- removed intermiate functions.  Now only original version and best optimized new version remain
-- Moved general artificial read backed pileup creation code into ArtificialSamUtils
This commit is contained in:
Mark DePristo 2011-10-24 14:40:05 -04:00
parent 166174a551
commit 502592671d
5 changed files with 116 additions and 259 deletions

View File

@ -14,9 +14,6 @@ import java.util.*;
* *
* Based on the original code by E. Banks * Based on the original code by E. Banks
* *
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
* TODO -- algorithm that could collect all duplicates into a single super pileup element
*
* Oct 21: note that the order of the oneReadPileup and twoReadPileups are not * Oct 21: note that the order of the oneReadPileup and twoReadPileups are not
* defined. The algorithms that produce these lists are in fact producing * defined. The algorithms that produce these lists are in fact producing
* lists of Pileup elements *NOT* sorted by alignment start position of the underlying * lists of Pileup elements *NOT* sorted by alignment start position of the underlying
@ -30,11 +27,9 @@ public class FragmentPileup {
Collection<PileupElement> oneReadPile = null; Collection<PileupElement> oneReadPile = null;
Collection<TwoReadPileupElement> twoReadPile = null; Collection<TwoReadPileupElement> twoReadPile = null;
public enum FragmentMatchingAlgorithm { protected enum FragmentMatchingAlgorithm {
ORIGINAL, ORIGINAL,
FAST_V1,
skipNonOverlapping, skipNonOverlapping,
skipNonOverlappingNotLazy
} }
/** /**
@ -42,16 +37,14 @@ public class FragmentPileup {
* @param pileup * @param pileup
*/ */
public FragmentPileup(ReadBackedPileup pileup) { public FragmentPileup(ReadBackedPileup pileup) {
//oldSlowCalculation(pileup); skipNonOverlapping(pileup);
fastNewCalculation(pileup);
} }
/** For performance testing only */
protected FragmentPileup(ReadBackedPileup pileup, FragmentMatchingAlgorithm algorithm) { protected FragmentPileup(ReadBackedPileup pileup, FragmentMatchingAlgorithm algorithm) {
switch ( algorithm ) { switch ( algorithm ) {
case ORIGINAL: oldSlowCalculation(pileup); break; case ORIGINAL: oldSlowCalculation(pileup); break;
case FAST_V1: fastNewCalculation(pileup); break;
case skipNonOverlapping: skipNonOverlapping(pileup); break; case skipNonOverlapping: skipNonOverlapping(pileup); break;
case skipNonOverlappingNotLazy: skipNonOverlappingNotLazy(pileup); break;
} }
} }
@ -76,58 +69,36 @@ public class FragmentPileup {
oneReadPile = nameMap.values(); oneReadPile = nameMap.values();
} }
/** private final void skipNonOverlapping(final ReadBackedPileup pileup) {
* @param pileup Map<String, PileupElement> nameMap = null;
*/
private final void fastNewCalculation(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null; // lazy initialization
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) { for ( final PileupElement p : pileup ) {
final SAMRecord read = p.getRead(); final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
switch (ReadUtils.readMightOverlapMate(read) ) { if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
// we know for certain this read doesn't have an overlapping mate // if we know that this read won't overlap its mate, or doesn't have one, jump out early
case NO: { if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>(pileup.size()); // lazy init
addToOnePile(p); oneReadPile.add(p);
break; } else {
} // the read might overlap it's mate, or is the rightmost read of a pair
final String readName = p.getRead().getReadName();
// we know that we overlap our mate, so put the read in the nameMap in final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName);
// case our mate shows up if ( pe1 != null ) {
case LEFT_YES: { // assumes we have at most 2 reads per fragment
nameMap = addToNameMap(nameMap, p); if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>(); // lazy init
break; twoReadPile.add(new TwoReadPileupElement(pe1, p));
} nameMap.remove(readName);
} else {
// read starts at the same position, so we are looking at either the first or if ( nameMap == null ) nameMap = new HashMap<String, PileupElement>(pileup.size()); // lazy init
// the second read. In the first, add it to the map, in the second grab it nameMap.put(readName, p);
// from the map and create a fragment
case SAME_START: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
nameMap = addToNameMap(nameMap, p);
}
break;
}
// in this case we need to see if our mate is already present, and if so
// grab the read from the list
case RIGHT_MAYBE: {
final PileupElement pe1 = getFromNameMap(nameMap, p);
if ( pe1 != null ) {
addToTwoPile(pe1, p);
nameMap.remove(p.getRead().getReadName());
} else {
addToOnePile(p);
}
break;
} }
} }
} }
// add all of the reads that are potentially overlapping but whose mate never showed
// up to the oneReadPile
if ( nameMap != null && ! nameMap.isEmpty() ) { if ( nameMap != null && ! nameMap.isEmpty() ) {
if ( oneReadPile == null ) if ( oneReadPile == null )
oneReadPile = nameMap.values(); oneReadPile = nameMap.values();
@ -136,90 +107,6 @@ public class FragmentPileup {
} }
} }
/**
* @param pileup
*/
private final void skipNonOverlappingNotLazy(final ReadBackedPileup pileup) {
oneReadPile = new ArrayList<PileupElement>(pileup.size());
twoReadPile = new ArrayList<TwoReadPileupElement>();
final Map<String, PileupElement> nameMap = new HashMap<String, PileupElement>(pileup.size());
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) {
// if we know that this read won't overlap its mate, or doesn't have one, jump out early
final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
oneReadPile.add(p);
} else {
final String readName = p.getRead().getReadName();
final PileupElement pe1 = nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
nameMap.put(readName, p);
}
}
}
oneReadPile.addAll(nameMap.values());
}
private final void skipNonOverlapping(final ReadBackedPileup pileup) {
Map<String, PileupElement> nameMap = null;
// build an initial map, grabbing all of the multi-read fragments
for ( final PileupElement p : pileup ) {
// if we know that this read won't overlap its mate, or doesn't have one, jump out early
final SAMRecord read = p.getRead();
final int mateStart = read.getMateAlignmentStart();
if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>(pileup.size());
oneReadPile.add(p);
} else {
final String readName = p.getRead().getReadName();
final PileupElement pe1 = nameMap == null ? null : nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
nameMap = addToNameMap(nameMap, p);
}
}
}
if ( oneReadPile == null )
oneReadPile = nameMap == null ? Collections.<PileupElement>emptyList() : nameMap.values();
else if ( nameMap != null )
oneReadPile.addAll(nameMap.values());
}
private final Map<String, PileupElement> addToNameMap(Map<String, PileupElement> map, final PileupElement p) {
if ( map == null ) map = new HashMap<String, PileupElement>();
map.put(p.getRead().getReadName(), p);
return map;
}
private final PileupElement getFromNameMap(Map<String, PileupElement> map, final PileupElement p) {
return map == null ? null : map.get(p.getRead().getReadName());
}
private final void addToOnePile(final PileupElement p) {
if ( oneReadPile == null ) oneReadPile = new ArrayList<PileupElement>();
oneReadPile.add(p);
}
private final void addToTwoPile(final PileupElement p1, final PileupElement p2) {
// assumes we have at most 2 reads per fragment
if ( twoReadPile == null ) twoReadPile = new ArrayList<TwoReadPileupElement>();
twoReadPile.add(new TwoReadPileupElement(p1, p2));
}
/** /**
* Gets the pileup elements containing two reads, in no particular order * Gets the pileup elements containing two reads, in no particular order
* *

View File

@ -2,11 +2,15 @@ package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.*;
import java.util.List;
/** /**
* @author aaron * @author aaron
@ -29,7 +33,7 @@ public class ArtificialSAMUtils {
File outFile = new File(filename); File outFile = new File(filename);
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile);
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) {
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH));
@ -145,7 +149,7 @@ public class ArtificialSAMUtils {
*/ */
public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || if( (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) ||
(refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) ) (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START) )
throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart); throw new ReviewedStingException("Invalid alignment start for artificial read, start = " + alignmentStart);
GATKSAMRecord record = new GATKSAMRecord(header); GATKSAMRecord record = new GATKSAMRecord(header);
record.setReadName(name); record.setReadName(name);
@ -197,6 +201,37 @@ public class ArtificialSAMUtils {
return rec; return rec;
} }
public final static List<SAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
left.setReadPairedFlag(true);
right.setReadPairedFlag(true);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
return Arrays.asList(left, right);
}
/** /**
* create an iterator containing the specified read piles * create an iterator containing the specified read piles
* *
@ -258,4 +293,52 @@ public class ArtificialSAMUtils {
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
} }
private final static int ranIntInclusive(Random ran, int start, int stop) {
final int range = stop - start;
return ran.nextInt(range) + start;
}
/**
* Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with
* reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert
* size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second
* may be, depending on where this sampled insertSize puts it.
* @param header
* @param loc
* @param readLen
* @param insertSize
* @param pileupSize
* @return
*/
public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) {
final Random ran = new Random();
final boolean leftIsFirst = true;
final boolean leftIsNegative = false;
final int insertSizeVariation = insertSize / 10;
final int pos = loc.getStart();
final List<PileupElement> pileupElements = new ArrayList<PileupElement>();
for ( int i = 0; i < pileupSize / 2; i++ ) {
final String readName = "read" + i;
final int leftStart = ranIntInclusive(ran, 1, pos);
final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize);
final int rightStart = leftStart + fragmentSize - readLen;
if ( rightStart <= 0 ) continue;
List<SAMRecord> pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
final SAMRecord left = pair.get(0);
final SAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart));
if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) {
pileupElements.add(new PileupElement(right, pos - rightStart));
}
}
Collections.sort(pileupElements);
return new ReadBackedPileupImpl(loc, pileupElements);
}
} }

View File

@ -214,54 +214,6 @@ public class ReadUtils {
return state; return state;
} }
/**
* s1 e1
* |-----------------------> [record in hand]
* s2
* <-----------------------|
*
* s1, e1, and s2 are all in the record. Assuming that s1 < s2 (we are the left most read),
* we can compute whether we overlap with our mate by seeing if s2 <= e1 or no. If e1 <
* s2 then we known that we cannot over.
*
* If we are looking at the right read
*
* s1
* |----------------------->
* s2 e2
* <-----------------------| [record in hand]
*
* we know the position of s1 and s2, but we don't know e1, so we cannot tell if we
* overlap with our mate or not, so in this case we return MAYBE.
*
* Note that if rec has an unmapped mate or is unpaired we certainly know the answer
*
* @param rec
* @return
*/
public static ReadOverlapsMateType readMightOverlapMate(final SAMRecord rec) {
if ( ! rec.getReadPairedFlag() || rec.getMateUnmappedFlag() ) {
return ReadOverlapsMateType.NO;
} else { // read is actually paired
final int recStart = rec.getAlignmentStart();
final int recEnd = rec.getAlignmentEnd();
final int mateStart = rec.getMateAlignmentStart();
if ( recStart < mateStart ) {
// we are the left most read
return mateStart <= recEnd ? ReadOverlapsMateType.LEFT_YES: ReadOverlapsMateType.NO;
} else if ( recStart == mateStart ) {
// we are the left most read
return ReadOverlapsMateType.SAME_START;
} else {
// we are the right most read, so we cannot tell
return ReadOverlapsMateType.RIGHT_MAYBE;
}
}
}
public enum ReadOverlapsMateType { LEFT_YES, NO, SAME_START, RIGHT_MAYBE }
private static Pair<Integer, Integer> getAdaptorBoundaries(SAMRecord rec, int adaptorLength) { private static Pair<Integer, Integer> getAdaptorBoundaries(SAMRecord rec, int adaptorLength) {
int isize = rec.getInferredInsertSize(); int isize = rec.getInferredInsertSize();
if ( isize == 0 ) if ( isize == 0 )

View File

@ -53,38 +53,12 @@ public class FragmentPileupBenchmark extends SimpleBenchmark {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
GenomeLocParser genomeLocParser; GenomeLocParser genomeLocParser;
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
final int pos = 50; GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 50);
GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", pos);
final Random ran = new Random();
final int readLen = 100; final int readLen = 100;
final boolean leftIsFirst = true;
final boolean leftIsNegative = false;
final int insertSizeVariation = insertSize / 10;
for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) { for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) {
List<PileupElement> pileupElements = new ArrayList<PileupElement>(); ReadBackedPileup rbp = ArtificialSAMUtils.createReadBackedPileup(header, loc, readLen, insertSize, pileupSize);
for ( int i = 0; i < pileupSize / 2; i++ ) { pileups.add(rbp);
final String readName = "read" + i;
final int leftStart = new Random().nextInt(49) + 1;
final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize);
final int rightStart = leftStart + fragmentSize - readLen;
if ( rightStart <= 0 ) continue;
List<SAMRecord> pair = FragmentPileupUnitTest.createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1);
pileupElements.add(new PileupElement(left, pos - leftStart));
if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) {
pileupElements.add(new PileupElement(right, pos - rightStart));
}
}
Collections.sort(pileupElements);
pileups.add(new ReadBackedPileupImpl(loc, pileupElements));
} }
} }
@ -100,18 +74,10 @@ public class FragmentPileupBenchmark extends SimpleBenchmark {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.ORIGINAL); run(rep, FragmentPileup.FragmentMatchingAlgorithm.ORIGINAL);
} }
public void timeFullOverlapPotential(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.FAST_V1);
}
public void timeSkipNonOverlapping(int rep) { public void timeSkipNonOverlapping(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlapping); run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlapping);
} }
public void timeSkipNonOverlappingNotLazy(int rep) {
run(rep, FragmentPileup.FragmentMatchingAlgorithm.skipNonOverlappingNotLazy);
}
public static void main(String[] args) { public static void main(String[] args) {
CaliperMain.main(FragmentPileupBenchmark.class, args); CaliperMain.main(FragmentPileupBenchmark.class, args);
} }

View File

@ -43,44 +43,13 @@ import java.util.*;
public class FragmentPileupUnitTest extends BaseTest { public class FragmentPileupUnitTest extends BaseTest {
private static SAMFileHeader header; private static SAMFileHeader header;
public final static List<SAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
SAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
SAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
left.setReadPairedFlag(true);
right.setReadPairedFlag(true);
left.setProperPairFlag(true);
right.setProperPairFlag(true);
left.setFirstOfPairFlag(leftIsFirst);
right.setFirstOfPairFlag(! leftIsFirst);
left.setReadNegativeStrandFlag(leftIsNegative);
left.setMateNegativeStrandFlag(!leftIsNegative);
right.setReadNegativeStrandFlag(!leftIsNegative);
right.setMateNegativeStrandFlag(leftIsNegative);
left.setMateAlignmentStart(right.getAlignmentStart());
right.setMateAlignmentStart(left.getAlignmentStart());
left.setMateReferenceIndex(0);
right.setMateReferenceIndex(0);
int isize = rightStart + readLen - leftStart;
left.setInferredInsertSize(isize);
right.setInferredInsertSize(-isize);
return Arrays.asList(left, right);
}
private class FragmentPileupTest extends TestDataProvider { private class FragmentPileupTest extends TestDataProvider {
List<TestState> states = new ArrayList<TestState>(); List<TestState> states = new ArrayList<TestState>();
private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { private FragmentPileupTest(String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative)); super(FragmentPileupTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative));
List<SAMRecord> pair = createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); List<SAMRecord> pair = ArtificialSAMUtils.createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative);
SAMRecord left = pair.get(0); SAMRecord left = pair.get(0);
SAMRecord right = pair.get(1); SAMRecord right = pair.get(1);