diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java index 9434944ca..871d9d514 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import org.broadinstitute.sting.gatk.iterators.LocusOverflowTracker; import java.util.*; @@ -80,7 +81,7 @@ public class AlignmentContext { if ( skippedBases < 0 ) throw new StingException("BUG: skippedBases is -1 in Alignment context"); this.loc = loc; - this.basePileup = new ReadBackedPileup(loc, reads, offsets); + this.basePileup = new UnifiedReadBackedPileup(loc, reads, offsets); this.skippedBases = skippedBases; } diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java index 24335a50e..9df20b3bf 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java @@ -90,9 +90,9 @@ public class StratifiedAlignmentContext { int index = type.ordinal(); if ( contexts[index] == null ) { if ( isExtended ) { - contexts[index] = new AlignmentContext(loc , new ReadBackedExtendedEventPileup(loc, (ArrayList)((ArrayList)getPileupElements(type)))); + contexts[index] = new AlignmentContext(loc , new UnifiedReadBackedExtendedEventPileup(loc, (ArrayList)((ArrayList)getPileupElements(type)))); } else { - contexts[index] = new AlignmentContext(loc, new ReadBackedPileup(loc, 
getPileupElements(type))); + contexts[index] = new AlignmentContext(loc, new UnifiedReadBackedPileup(loc, getPileupElements(type))); } } return contexts[index]; @@ -300,7 +300,7 @@ public class StratifiedAlignmentContext { // dirty trick below. generics do not allow to cast pe (ArrayList) directly to ArrayList, // so we first cast to "? extends" wildcard, then to what we actually need. - if ( isExtended ) return new AlignmentContext(loc, new ReadBackedExtendedEventPileup(loc, (ArrayList< ExtendedEventPileupElement>)((ArrayList)pe)) ); - else return new AlignmentContext(loc, new ReadBackedPileup(loc,pe)); + if ( isExtended ) return new AlignmentContext(loc, new UnifiedReadBackedExtendedEventPileup(loc, (ArrayList< ExtendedEventPileupElement>)((ArrayList)pe)) ); + else return new AlignmentContext(loc, new UnifiedReadBackedPileup(loc,pe)); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index 79ef27e09..072ad3ab0 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.MergingIterator; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import org.broadinstitute.sting.gatk.iterators.LocusOverflowTracker; import java.util.*; @@ -136,7 +137,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() long skippedBases = getSkippedBases( rodSite ); lastLoc = site; - return new AlignmentContext(site, new ReadBackedPileup(site), 
skippedBases); + return new AlignmentContext(site, new UnifiedReadBackedPileup(site), skippedBases); } public LocusOverflowTracker getLocusOverflowTracker() { diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java index 71414c1e3..1d10980a7 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/DownsamplingLocusIteratorByState.java @@ -34,10 +34,7 @@ import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.*; import java.util.*; @@ -308,11 +305,15 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } public void printState() { - for ( SAMRecordState state : readStates ) { - logger.debug(String.format("printState():")); - SAMRecord read = state.getRead(); - int offset = state.getReadOffset(); - logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString())); + for(String sampleName: sampleNames) { + Iterator iterator = readStates.iteratorForSample(sampleName); + while(iterator.hasNext()) { + SAMRecordState state = iterator.next(); + logger.debug(String.format("printState():")); + SAMRecord read = state.getRead(); + int offset = state.getReadOffset(); + logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, 
(char)read.getReadBases()[offset], read.getCigarString())); + } } } @@ -341,7 +342,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { private void lazyLoadNextAlignmentContext() { while(nextAlignmentContext == null && readStates.hasNext()) { // this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref: - collectPendingReads(readInfo.getMaxReadsAtLocus()); + readStates.collectPendingReads(); int size = 0; int nDeletions = 0; @@ -355,93 +356,112 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { // In this case, the subsequent call to next() will emit the normal pileup at the current base // and shift the position. if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - ArrayList indelPile = new ArrayList(readStates.size()); + Map fullExtendedEventPileup = new HashMap(); - int maxDeletionLength = 0; - - for ( SAMRecordState state : readStates ) { - if ( state.hadIndel() ) { - size++; - if ( state.getEventBases() == null ) { - nDeletions++; - maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength()); - } - else nInsertions++; - indelPile.add ( new ExtendedEventPileupElement(state.getRead(), - state.getReadEventStartOffset(), - state.getEventLength(), - state.getEventBases()) ); - - } else { - if ( state.getCurrentCigarOperator() != CigarOperator.N ) { - // this read has no indel associated with the previous position on the ref; - // we count this read in only if it has actual bases, not N span... 
- if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) { - - // if cigar operator is D but the read has no extended event reported (that's why we ended - // up in this branch), it means that we are currently inside a deletion that started earlier; - // we count such reads (with a longer deletion spanning over a deletion at the previous base we are - // about to report) only if includeReadsWithDeletionAtLoci is true. - size++; - indelPile.add ( new ExtendedEventPileupElement(state.getRead(), - state.getReadOffset()-1, - -1) // length=-1 --> noevent - ); - } - } - } - if ( state.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; - } - } - hasExtendedEvents = false; // we are done with extended events prior to current ref base SAMRecordState our1stState = readStates.getFirst(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base GenomeLoc loc = GenomeLocParser.incPos(our1stState.getLocation(),-1); + + for(String sampleName: sampleNames) { + Iterator iterator = readStates.iteratorForSample(sampleName); + ArrayList indelPile = new ArrayList(readStates.size()); + + size = 0; + nDeletions = 0; + nInsertions = 0; + nMQ0Reads = 0; + int maxDeletionLength = 0; + + while(iterator.hasNext()) { + SAMRecordState state = iterator.next(); + if ( state.hadIndel() ) { + size++; + if ( state.getEventBases() == null ) { + nDeletions++; + maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength()); + } + else nInsertions++; + indelPile.add ( new ExtendedEventPileupElement(state.getRead(), + state.getReadEventStartOffset(), + state.getEventLength(), + state.getEventBases()) ); + + } else { + if ( state.getCurrentCigarOperator() != CigarOperator.N ) { + // this read has no indel associated with the previous position on the ref; + // we count this read in only if it has actual bases, not N span... 
+ if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) { + + // if cigar operator is D but the read has no extended event reported (that's why we ended + // up in this branch), it means that we are currently inside a deletion that started earlier; + // we count such reads (with a longer deletion spanning over a deletion at the previous base we are + // about to report) only if includeReadsWithDeletionAtLoci is true. + size++; + indelPile.add ( new ExtendedEventPileupElement(state.getRead(), + state.getReadOffset()-1, + -1) // length=-1 --> noevent + ); + } + } + } + if ( state.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } + if( indelPile.size() != 0 ) fullExtendedEventPileup.put(sampleName,new UnifiedReadBackedExtendedEventPileup(loc,indelPile,size,maxDeletionLength,nDeletions,nInsertions,nMQ0Reads)); + } + } + hasExtendedEvents = false; // we are done with extended events prior to current ref base // System.out.println("Indel(s) at "+loc); // for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); } - nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileup(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); + nextAlignmentContext = new AlignmentContext(loc, new SampleSplitReadBackedExtendedEventPileup(loc, fullExtendedEventPileup)); } else { - ArrayList pile = new ArrayList(readStates.size()); + GenomeLoc location = getLocation(); + Map fullPileup = new HashMap(); // todo -- performance problem -- should be lazy, really - GenomeLoc location = getLocation(); - for ( SAMRecordState state : readStates ) { - if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - if ( filterRead(state.getRead(), location.getStart(), filters ) ) { - discarded_bases++; - //printStatus("Adaptor bases", discarded_adaptor_bases); - continue; - } else { - 
observed_bases++; - pile.add(new PileupElement(state.getRead(), state.getReadOffset())); - size++; - } - } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { - size++; - pile.add(new PileupElement(state.getRead(), -1)); - nDeletions++; - } + for(String sampleName: sampleNames) { + Iterator iterator = readStates.iteratorForSample(sampleName); + ArrayList pile = new ArrayList(readStates.size()); - // todo -- this looks like a bug w.r.t. including reads with deletion at loci -- MAD 05/27/10 - if ( state.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; + size = 0; + nDeletions = 0; + nInsertions = 0; + nMQ0Reads = 0; + + while(iterator.hasNext()) { + SAMRecordState state = iterator.next(); + if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { + if ( filterRead(state.getRead(), location.getStart(), filters ) ) { + discarded_bases++; + //printStatus("Adaptor bases", discarded_adaptor_bases); + continue; + } else { + observed_bases++; + pile.add(new PileupElement(state.getRead(), state.getReadOffset())); + size++; + } + } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { + size++; + pile.add(new PileupElement(state.getRead(), -1)); + nDeletions++; + } + + // todo -- this looks like a bug w.r.t. 
including reads with deletion at loci -- MAD 05/27/10 + if ( state.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } } + if( pile.size() != 0 ) fullPileup.put(sampleName,new UnifiedReadBackedPileup(location,pile,size,nDeletions,nMQ0Reads)); } updateReadStates(); // critical - must be called after we get the current state offsets and location // if we got reads with non-D/N over the current position, we are done - if ( pile.size() != 0 ) nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileup(location, pile, size, nDeletions, nMQ0Reads)); + if ( !fullPileup.isEmpty() ) nextAlignmentContext = new AlignmentContext(location, new SampleSplitReadBackedPileup(location, fullPileup)); } } } - - private void collectPendingReads(int maxReads) { - readStates.collectPendingReads(); - } - // fast testing of position private boolean readIsPastCurrentPosition(SAMRecord read) { if ( readStates.isEmpty() ) @@ -475,18 +495,20 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } private void updateReadStates() { - Iterator it = readStates.iterator(); - while ( it.hasNext() ) { - SAMRecordState state = it.next(); - CigarOperator op = state.stepForwardOnGenome(); - if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true; - else { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
- if ( op == null ) { // we've stepped off the end of the object - //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); - it.remove(); + for(String sampleName: sampleNames) { + Iterator it = readStates.iteratorForSample(sampleName); + while ( it.hasNext() ) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true; + else { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + if ( op == null ) { // we've stepped off the end of the object + //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); + it.remove(); + } } } } @@ -512,16 +534,18 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { return this.overflowTracker; } - private class ReadStateManager implements Iterable { + private class ReadStateManager { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; - private final Map> aggregatorsBySampleName = new HashMap>(); + private final ReadSelector firstReadSelector; + private final SamplePartitioner samplePartitioner; + + private final Map> readStatesBySample = new HashMap>(); private final int targetCoverage; private final int maxReadsAtLocus; - private final Map>> readStatesBySample; private int totalReadStatesInHanger = 0; /** @@ -547,104 +571,59 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { this.targetCoverage = Integer.MAX_VALUE; } this.maxReadsAtLocus = maxReadsAtLocus; - this.readStatesBySample = new HashMap>>(); - if(downsamplingMethod.type 
== DownsampleType.NONE) { - aggregatorsBySampleName.put(null,new ArrayList()); - readStatesBySample.put(null,new LinkedList>()); - } - else if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { - aggregatorsBySampleName.put(null,new ReservoirDownsampler(targetCoverage)); - readStatesBySample.put(null,new LinkedList>()); - } - else { - for(String sampleName: sampleNames) { - aggregatorsBySampleName.put(sampleName,new ReservoirDownsampler(targetCoverage)); - readStatesBySample.put(sampleName,new LinkedList>()); - } + samplePartitioner = new SamplePartitioner(sampleNames); + for(String sampleName: sampleNames) + readStatesBySample.put(sampleName,new LinkedList()); + + ReadSelector primaryReadSelector; + if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { + primaryReadSelector = new NRandomReadSelector(samplePartitioner,targetCoverage); } + else + primaryReadSelector = samplePartitioner; + + firstReadSelector = maxReadsAtLocus!=Integer.MAX_VALUE ? 
new FirstNReadSelector(primaryReadSelector,maxReadsAtLocus) : primaryReadSelector; } - public Iterator iterator() { + public Iterator iteratorForSample(final String sampleName) { return new Iterator() { - private final Iterator>> sampleIterators; - private Iterator> sampleIterator; - private List currentAlignmentStart; - private Iterator alignmentStartIterator; - private SAMRecordState nextReadState; - - private int readsInHanger = totalReadStatesInHanger; - - { - List>> sampleIteratorList = new LinkedList>>(); - for(Deque> hanger: readStatesBySample.values()) - sampleIteratorList.add(hanger.iterator()); - sampleIterators = sampleIteratorList.iterator(); - advance(); - } + private Iterator wrappedIterator = readStatesBySample.get(sampleName).iterator(); public boolean hasNext() { - return readsInHanger > 0; + return wrappedIterator.hasNext(); } public SAMRecordState next() { - advance(); - if(nextReadState==null) throw new NoSuchElementException("reader is out of elements"); - try { - return nextReadState; - } - finally { - readsInHanger--; - nextReadState = null; - } + return wrappedIterator.next(); } public void remove() { - if(alignmentStartIterator == null) - throw new StingException("Cannot remove read -- iterator is in an invalid state."); - alignmentStartIterator.remove(); - if(currentAlignmentStart.isEmpty()) - sampleIterator.remove(); + wrappedIterator.remove(); totalReadStatesInHanger--; } - - private void advance() { - // nextReadState != null indicates that we haven't returned this value from the next() method yet. - if(nextReadState != null) - return; - while(alignmentStartIterator!=null&&alignmentStartIterator.hasNext()) { - nextReadState = alignmentStartIterator.next(); - } - while(nextReadState==null&&sampleIterator!=null&&sampleIterator.hasNext()) { - currentAlignmentStart = sampleIterator.next(); - alignmentStartIterator = currentAlignmentStart!=null ? 
currentAlignmentStart.iterator() : null; - nextReadState = alignmentStartIterator!=null&&alignmentStartIterator.hasNext() ? alignmentStartIterator.next() : null; - } - while(nextReadState==null&&sampleIterators.hasNext()) { - sampleIterator = sampleIterators.next(); - currentAlignmentStart = sampleIterator!=null&&sampleIterator.hasNext() ? sampleIterator.next() : null; - alignmentStartIterator = currentAlignmentStart!=null ? currentAlignmentStart.iterator() : null; - nextReadState = alignmentStartIterator!=null&&alignmentStartIterator.hasNext() ? alignmentStartIterator.next() : null; - } - } }; } public boolean isEmpty() { - return readStatesBySample.isEmpty(); + return totalReadStatesInHanger == 0; } public int size() { int size = 0; - for(Deque> readStatesByAlignmentStart: readStatesBySample.values()) { - for(Collection readStates: readStatesByAlignmentStart) - size += readStates.size(); + for(Deque readStates: readStatesBySample.values()) { + size += readStates.size(); } return size; } public SAMRecordState getFirst() { - return iterator().next(); + for(String sampleName: sampleNames) { + Deque reads = readStatesBySample.get(sampleName); + if(!reads.isEmpty()) + return reads.peek(); + } + return null; } public boolean hasNext() { @@ -652,38 +631,36 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } public void collectPendingReads() { - if(iterator.hasNext() && readStates.size() == 0) { + if(!iterator.hasNext()) + return; + + if(readStates.size() == 0) { int firstContigIndex = iterator.peek().getReferenceIndex(); int firstAlignmentStart = iterator.peek().getAlignmentStart(); while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { - SAMRecord read = iterator.next(); - Collection aggregator = getAggregator(read.getReadGroup()!=null ? 
read.getReadGroup().getSample() : null); - aggregator.add(read); + firstReadSelector.submitRead(iterator.next()); } } else { // Fast fail in the case that the read is past the current position. - if(iterator.hasNext() && readIsPastCurrentPosition(iterator.peek())) + if(readIsPastCurrentPosition(iterator.peek())) return; while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { - SAMRecord read = iterator.next(); - Collection aggregator = getAggregator(read.getReadGroup()!=null ? read.getReadGroup().getSample() : null); - aggregator.add(read); + firstReadSelector.submitRead(iterator.next()); } } + firstReadSelector.complete(); int readStatesInHangerEntry = 0; - for(Map.Entry> entry: aggregatorsBySampleName.entrySet()) { - String sampleName = entry.getKey(); - Collection aggregator = entry.getValue(); + for(String sampleName: sampleNames) { + ReadSelector aggregator = samplePartitioner.getSelectedReads(sampleName); - Collection newReads = new ArrayList(aggregator); - aggregator.clear(); + Collection newReads = new ArrayList(aggregator.getSelectedReads()); - Deque> hanger = readStatesBySample.get(sampleName); - int readsInHanger = countReadsInHanger(sampleName); + Deque hanger = readStatesBySample.get(sampleName); + int readsInHanger = hanger.size(); if(readsInHanger+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.NONE || downsamplingMethod.type==DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR) { int readLimit = newReads.size(); @@ -692,9 +669,11 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { readLimit = maxReadsAtLocus-totalReadStatesInHanger; mrlViolation = true; } - readStatesInHangerEntry += addReadsToHanger(hanger,newReads,readLimit,mrlViolation); + readStatesInHangerEntry += addReadsToSample(hanger,newReads,readLimit,mrlViolation); } else { + // TODO: implement downsampling mechanism + /* Iterator> backIterator = hanger.descendingIterator(); boolean readPruned = true; 
while(readsInHanger+newReads.size()>targetCoverage && readPruned) { @@ -714,36 +693,23 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { readsInHanger -= readsInFirstHanger.size(); } - readStatesInHangerEntry += addReadsToHanger(hanger,newReads,targetCoverage-readsInHanger,false); + readStatesInHangerEntry += addReadsToSample(hanger,newReads,targetCoverage-readsInHanger,false); + */ } totalReadStatesInHanger += readStatesInHangerEntry; } - } - - private Collection getAggregator(String sampleName) { - if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_BY_SAMPLE) - return aggregatorsBySampleName.get(sampleName); - else - return aggregatorsBySampleName.get(null); - } - - private int countReadsInHanger(final String sampleName) { - int count = 0; - for(List hangerEntry: readStatesBySample.get(sampleName)) { - count += hangerEntry.size(); - } - return count; + firstReadSelector.reset(); } /** * Add reads with the given sample name to the given hanger entry. - * @param newHangerEntry The hanger entry to add. + * @param readStates The list of read states to add this collection of reads. * @param reads Reads to add. Selected reads will be pulled from this source. * @param maxReads Maximum number of reads to add. * @return Total number of reads added. 
*/ - private int addReadsToHanger(final Deque> newHangerEntry, final Collection reads, final int maxReads, boolean atMaxReadsAtLocusLimit) { + private int addReadsToSample(final Deque readStates, final Collection reads, final int maxReads, boolean atMaxReadsAtLocusLimit) { if(reads.isEmpty()) return 0; @@ -752,7 +718,6 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { // the farthest right a read extends Integer rightMostEnd = -1; - List readStates = new LinkedList(); int readCount = 0; for(SAMRecord read: reads) { if(readCount <= maxReads) { @@ -769,7 +734,6 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { rightMostEnd = (read.getAlignmentEnd() > rightMostEnd) ? read.getAlignmentEnd() : rightMostEnd; } } - newHangerEntry.add(readStates); if (location != null) overflowTracker.exceeded(GenomeLocParser.createGenomeLoc(location.getContigIndex(),location.getStart(),rightMostEnd), @@ -780,3 +744,247 @@ public class DownsamplingLocusIteratorByState extends LocusIterator { } } +/** + * Selects reads passed to it based on a criteria decided through inheritance. + */ +interface ReadSelector { + /** + * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration. + * @param read the read to evaluate. + */ + public void submitRead(SAMRecord read); + + /** + * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid. + * @param read the read previously rejected. + */ + public void notifyReadRejected(SAMRecord read); + + /** + * Signal the selector that read additions are complete. + */ + public void complete(); + + /** + * Retrieve the number of reads seen by this selector so far. + * @return number of reads seen. + */ + public long getNumReadsSeen(); + + /** + * Return the number of reads accepted by this selector so far. + * @return number of reads selected. 
+ */ + public long getNumReadsSelected(); + + /** + * Get the reads selected by this selector. + * @return collection of reads selected by this selector. + */ + public Collection getSelectedReads(); + + /** + * Reset this collection to its pre-gathered state. + */ + public void reset(); +} + +/** + * Choose the first N reads from the submitted set. + */ +class FirstNReadSelector implements ReadSelector { + private final ReadSelector chainedSelector; + + private final Collection selectedReads = new LinkedList(); + private final long readLimit; + private long readsSeen = 0; + + public FirstNReadSelector(ReadSelector chainedSelector, long readLimit) { + this.chainedSelector = chainedSelector; + this.readLimit = readLimit; + } + + public void submitRead(SAMRecord read) { + if(readsSeen > readLimit) { + selectedReads.add(read); + if(chainedSelector != null) + chainedSelector.submitRead(read); + } + else + if(chainedSelector != null) + chainedSelector.notifyReadRejected(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + if(chainedSelector != null) + chainedSelector.notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + if(chainedSelector != null) + chainedSelector.complete(); + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return selectedReads.size(); + } + + public Collection getSelectedReads() { + return selectedReads; + } + + public void reset() { + selectedReads.clear(); + readsSeen = 0; + if(chainedSelector != null) + chainedSelector.reset(); + } +} + +/** + * Select N reads randomly from the input stream. 
+ */ +class NRandomReadSelector implements ReadSelector { + private final ReservoirDownsampler reservoir; + private final ReadSelector chainedSelector; + private long readsSeen = 0; + + public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) { + this.reservoir = new ReservoirDownsampler((int)readLimit); + this.chainedSelector = chainedSelector; + } + + public void submitRead(SAMRecord read) { + SAMRecord displaced = reservoir.add(read); + if(displaced != null && chainedSelector != null) + chainedSelector.notifyReadRejected(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + for(SAMRecord read: reservoir.getDownsampledContents()) + chainedSelector.submitRead(read); + if(chainedSelector != null) + chainedSelector.complete(); + } + + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return reservoir.size(); + } + + public Collection getSelectedReads() { + return reservoir.getDownsampledContents(); + } + + public void reset() { + reservoir.clear(); + if(chainedSelector != null) + chainedSelector.reset(); + } +} + +class SamplePartitioner implements ReadSelector { + private final Map readsBySample; + private long readsSeen = 0; + + public SamplePartitioner(Collection sampleNames) { + readsBySample = new HashMap(); + for(String sampleName: sampleNames) + readsBySample.put(sampleName,new SampleStorage()); + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup()!=null ? read.getReadGroup().getSample() : null; + if(readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).submitRead(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + String sampleName = read.getReadGroup()!=null ? 
read.getReadGroup().getSample() : null; + if(readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).notifyReadRejected(read); + readsSeen++; + } + + public void complete() { + // NO-OP. + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public Collection getSelectedReads() { + throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner."); + } + + public ReadSelector getSelectedReads(String sampleName) { + if(!readsBySample.containsKey(sampleName)) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for(SampleStorage storage: readsBySample.values()) + storage.reset(); + readsSeen = 0; + } + + private class SampleStorage implements ReadSelector { + private Collection reads = new LinkedList(); + private long readsSeen = 0; + + public void submitRead(SAMRecord read) { + reads.add(read); + readsSeen++; + } + + public void notifyReadRejected(SAMRecord read) { + readsSeen++; + } + + public void complete() { + // NO-OP. 
+ } + + public long getNumReadsSeen() { + return readsSeen; + } + + public long getNumReadsSelected() { + return readsSeen; + } + + public Collection getSelectedReads() { + return reads; + } + + public void reset() { + reads.clear(); + readsSeen = 0; + } + } + +} + + diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 4492dabf2..e0a4e59d5 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -33,10 +33,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; +import org.broadinstitute.sting.utils.pileup.*; import java.util.*; @@ -399,7 +396,7 @@ public class LocusIteratorByState extends LocusIterator { GenomeLoc loc = GenomeLocParser.incPos(our1stState.getLocation(),-1); // System.out.println("Indel(s) at "+loc); // for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); } - nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileup(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); + nextAlignmentContext = new AlignmentContext(loc, new UnifiedReadBackedExtendedEventPileup(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads)); } else { ArrayList pile = new ArrayList(readStates.size()); @@ -430,7 +427,7 @@ public class LocusIteratorByState extends LocusIterator { GenomeLoc 
loc = getLocation(); updateReadStates(); // critical - must be called after we get the current state offsets and location // if we got reads with non-D/N over the current position, we are done - if ( pile.size() != 0 ) nextAlignmentContext = new AlignmentContext(loc, new ReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads)); + if ( pile.size() != 0 ) nextAlignmentContext = new AlignmentContext(loc, new UnifiedReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads)); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 4f16a3b3d..b2cc34b0b 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import java.util.*; @@ -183,7 +184,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine,Locu long nSkipped = rodLocusView.getLastSkippedBases(); if ( nSkipped > 0 ) { GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileup(site), nSkipped); + AlignmentContext ac = new AlignmentContext(site, new UnifiedReadBackedPileup(site), nSkipped); M x = walker.map(null, null, ac); sum = walker.reduce(x, sum); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 8c68afd2a..41641e4ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ 
b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import java.util.*; import net.sf.samtools.SAMRecord; @@ -265,7 +266,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - return new ReadBackedPileup(pileup.getLocation(), reads, offsets); + return new UnifiedReadBackedPileup(pileup.getLocation(), reads, offsets); } private static ReadBackedPileup getPileupOfAllele( Allele allele, ReadBackedPileup pileup ) { @@ -278,7 +279,7 @@ public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { } } - return new ReadBackedPileup(pileup.getLocation(), filteredPileup); + return new UnifiedReadBackedPileup(pileup.getLocation(), filteredPileup); } public List getKeyNames() { return Arrays.asList("HaplotypeScore"); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java index eeae98c9d..bc27aeec9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.genotype.*; import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import org.broad.tribble.vcf.VCFHeaderLine; import java.util.*; @@ -158,7 +159,7 @@ public class BatchedCallsMerger extends LocusWalker imp if ( readGroup != 
null && samples.contains(readGroup.getSample()) ) newPileup.add(p); } - return new AlignmentContext(pileup.getLocation(), new ReadBackedPileup(pileup.getLocation(), newPileup)); + return new AlignmentContext(pileup.getLocation(), new UnifiedReadBackedPileup(pileup.getLocation(), newPileup)); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5f112d0ee..2ffb2b36e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -45,10 +45,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import org.broadinstitute.sting.utils.genotype.geli.GeliGenotypeWriter; import org.broadinstitute.sting.utils.genotype.glf.GLFGenotypeWriter; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeWriter; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.*; import java.io.PrintStream; import java.util.*; @@ -241,7 +238,7 @@ public class UnifiedGenotyperEngine { AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES ) filteredPileup.add(p); } - return new ReadBackedPileup(pileup.getLocation(), filteredPileup); + return new UnifiedReadBackedPileup(pileup.getLocation(), filteredPileup); } // filter based on maximum mismatches and bad mates @@ -253,7 +250,7 @@ public class UnifiedGenotyperEngine { AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= UAC.MAX_MISMATCHES ) filteredPileup.add(p); } - return new ReadBackedExtendedEventPileup(pileup.getLocation(), filteredPileup); + return new 
UnifiedReadBackedExtendedEventPileup(pileup.getLocation(), filteredPileup); } private static boolean isValidDeletionFraction(double d) { diff --git a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java index 51a1775bf..8dd2085bb 100644 --- a/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java +++ b/java/src/org/broadinstitute/sting/utils/ReservoirDownsampler.java @@ -13,7 +13,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class ReservoirDownsampler implements Collection { +public class ReservoirDownsampler { /** * Create a random number generator with a random, but reproducible, seed. */ @@ -40,31 +40,35 @@ public class ReservoirDownsampler implements Collection { this.maxElements = maxElements; } - @Override - public boolean add(T element) { + /** + * Returns the eliminated element. + * @param element Eliminated element; null if no element has been eliminated. + * @return + */ + public T add(T element) { if(maxElements <= 0) - return false; + return element; else if(reservoir.size() < maxElements) { reservoir.add(element); - return true; + return null; } else { // Get a uniformly distributed int. If the chosen slot lives within the partition, replace the entry in that slot with the newest entry. int slot = random.nextInt(maxElements); if(slot >= 0 && slot < maxElements) { + T displaced = reservoir.get(slot); reservoir.set(slot,element); - return true; + return displaced; } else - return false; + return element; } } - @Override public boolean addAll(Collection elements) { boolean added = false; for(T element: elements) - added |= add(element); + added |= (add(element) != null); return added; } @@ -73,62 +77,51 @@ public class ReservoirDownsampler implements Collection { * @return The downsampled contents of this reservoir. 
*/ public Collection getDownsampledContents() { - return (Collection)reservoir.clone(); + return reservoir; } - @Override public void clear() { reservoir.clear(); } - @Override public boolean isEmpty() { return reservoir.isEmpty(); } - @Override public int size() { return reservoir.size(); } - @Override public Iterator iterator() { return reservoir.iterator(); } - @Override public boolean contains(Object o) { return reservoir.contains(o); } - @Override public boolean containsAll(Collection elements) { return reservoir.containsAll(elements); } - @Override public boolean retainAll(Collection elements) { return reservoir.retainAll(elements); } - @Override public boolean remove(Object o) { return reservoir.remove(o); } - @Override public boolean removeAll(Collection elements) { return reservoir.removeAll(elements); } - @Override public Object[] toArray() { Object[] contents = new Object[reservoir.size()]; reservoir.toArray(contents); return contents; } - @Override public T[] toArray(T[] array) { return reservoir.toArray(array); } diff --git a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java index 4f04bbecc..f40b1051b 100644 --- a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java +++ b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.UnifiedReadBackedPileup; import java.util.List; import java.util.Arrays; @@ -114,7 +115,7 @@ public class DupUtils { private static Pair combineBaseProbs(List duplicates, int readOffset, int maxQScore) { GenomeLoc loc = GenomeLocParser.createGenomeLoc(duplicates.get(0)); - ReadBackedPileup pileup = new ReadBackedPileup(loc, duplicates, readOffset); + 
ReadBackedPileup pileup = new UnifiedReadBackedPileup(loc, duplicates, readOffset); final boolean debug = false; diff --git a/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java new file mode 100644 index 000000000..a23a2e3be --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import net.sf.picard.util.PeekableIterator; + +import java.util.PriorityQueue; +import java.util.Map; +import java.util.Comparator; +import java.util.Iterator; + +/** + * Merges multiple pileups broken down by sample. 
+ * + * @author mhanna + * @version 0.1 + */ +class MergingPileupElementIterator implements Iterator { + private final PriorityQueue> perSampleIterators; + + public MergingPileupElementIterator(Map pileupsPerSample) { + perSampleIterators = new PriorityQueue>(pileupsPerSample.size(),new PileupElementIteratorComparator()); + for(Object pileupForSample: pileupsPerSample.values()) { + if(pileupForSample instanceof ReadBackedPileup) { + ReadBackedPileup pileup = (ReadBackedPileup)pileupForSample; + if(pileup.size() != 0) + perSampleIterators.add(new PeekableIterator(pileup.iterator())); + } + else if(pileupForSample instanceof ReadBackedExtendedEventPileup) { + ReadBackedExtendedEventPileup pileup = (ReadBackedExtendedEventPileup)pileupForSample; + if(pileup.size() != 0) + perSampleIterators.add(new PeekableIterator(pileup.iterator())); + } + } + } + + public boolean hasNext() { + return !perSampleIterators.isEmpty(); + } + + public PileupElement next() { + PeekableIterator currentIterator = perSampleIterators.remove(); + PileupElement current = currentIterator.next(); + if(currentIterator.hasNext()) + perSampleIterators.add(currentIterator); + return current; + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a merging iterator."); + } + + /** + * Compares two peekable iterators consisting of pileup elements. 
+ */ + private class PileupElementIteratorComparator implements Comparator> { + public int compare(PeekableIterator lhs, PeekableIterator rhs) { + return rhs.peek().getOffset() - lhs.peek().getOffset(); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 6454cb340..1ce46a1e2 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -1,426 +1,141 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.utils.pileup; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.collections.Pair; - -import java.util.*; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Dec 29, 2009 - * Time: 12:25:55 PM - * To change this template use File | Settings | File Templates. - */ -public class ReadBackedExtendedEventPileup implements Iterable { - private GenomeLoc loc = null; - private ArrayList pileup = null; - - private int size = 0; // cached value of the size of the pileup - private int maxDeletionLength = 0; // cached value of the length of the longest deletion observed at the site - private int nDeletions = 0; // cached value of the number of deletions - private int nInsertions = 0; - private int nMQ0Reads = 0; // cached value of the number of MQ0 reads - - /** - * Create a new version of a read backed pileup at loc without any aligned reads - * - */ - public ReadBackedExtendedEventPileup(GenomeLoc loc ) { - this(loc, new ArrayList(0)); - } - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a - * pointer to pileup. Don't go changing the data in pileup. 
- * - */ - public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList pileup ) { - if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); - if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); - - this.loc = loc; - this.pileup = pileup; - - calculatedCachedData(); - } - - /** - * Optimization of above constructor where all of the cached data is provided - * @param loc - * @param pileup - */ - public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList pileup, int size, - int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads ) { - if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); - if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); - - this.loc = loc; - this.pileup = pileup; - this.size = size; - this.maxDeletionLength = maxDeletionLength; - this.nDeletions = nDeletions; - this.nInsertions = nInsertions; - this.nMQ0Reads = nMQ0Reads; - } - - - /** - * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, - * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling - * sizes, nDeletion, etc. over and over potentially. 
- */ - private void calculatedCachedData() { - size = 0; - nDeletions = 0; - nInsertions = 0; - nMQ0Reads = 0; - - for ( ExtendedEventPileupElement p : this ) { - - size++; - if ( p.isDeletion() ) { - nDeletions++; - maxDeletionLength = Math.max(maxDeletionLength, p.getEventLength()); - } else { - if ( p.isInsertion() ) nInsertions++; - } - - if ( p.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; - } - } - } - - - // -------------------------------------------------------- - // - // Special 'constructors' - // - // -------------------------------------------------------- - - - /** - * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this - * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy - * of the pileup (just returns this) if there are no MQ0 reads in the pileup. - * - * @return - */ - public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() { - - if ( getNumberOfMappingQualityZeroReads() > 0 ) { - ArrayList filteredPileup = new ArrayList(); - for ( ExtendedEventPileupElement p : pileup ) { - if ( p.getRead().getMappingQuality() > 0 ) { - filteredPileup.add(p); - } - } - return new ReadBackedExtendedEventPileup(loc, filteredPileup); - } else { - return this; - } - } - - public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) { - ArrayList filteredPileup = new ArrayList(); - - for ( ExtendedEventPileupElement p : pileup ) { - if ( p.getRead().getMappingQuality() >= minMapQ ) { - filteredPileup.add(p); - } - } - - return new ReadBackedExtendedEventPileup(loc, filteredPileup); - } - - /** - * Returns a pileup randomly downsampled to the desiredCoverage. 
- * - * @param desiredCoverage - * @return - */ - public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) { - if ( size() <= desiredCoverage ) - return this; - - // randomly choose numbers corresponding to positions in the reads list - Random generator = new Random(); - TreeSet positions = new TreeSet(); - for ( int i = 0; i < desiredCoverage; /* no update */ ) { - if ( positions.add(generator.nextInt(pileup.size())) ) - i++; - } - - Iterator positionIter = positions.iterator(); - ArrayList downsampledPileup = new ArrayList(); - - while ( positionIter.hasNext() ) { - int nextReadToKeep = (Integer)positionIter.next(); - downsampledPileup.add(pileup.get(nextReadToKeep)); - } - - return new ReadBackedExtendedEventPileup(getLocation(), downsampledPileup); - } - - // -------------------------------------------------------- - // - // iterators - // - // -------------------------------------------------------- - - /** - * The best way to access PileupElements where you only care about the bases and quals in the pileup. - * - * for (PileupElement p : this) { doSomething(p); } - * - * Provides efficient iteration of the data. - * - * @return - */ - public Iterator iterator() { - return pileup.iterator(); - } - - - /** - * Returns the number of deletion events in this pileup - * - * @return - */ - public int getNumberOfDeletions() { - return nDeletions; - } - - /** - * Returns the number of insertion events in this pileup - * - * @return - */ - public int getNumberOfInsertions() { - return nInsertions; - } - - - /** Returns the length of the longest deletion observed at the site this - * pileup is associated with (NOTE: by convention, both insertions and deletions - * are associated with genomic location immediately before the actual event). If - * there are no deletions at the site, returns 0. 
- * @return - */ - public int getMaxDeletionLength() { - return maxDeletionLength; - } - /** - * Returns the number of mapping quality zero reads in this pileup. - * @return - */ - public int getNumberOfMappingQualityZeroReads() { - return nMQ0Reads; - } - -// public int getNumberOfDeletions() { -// int n = 0; -// -// for ( int i = 0; i < size(); i++ ) { -// if ( getOffsets().get(i) != -1 ) { n++; } -// } -// -// return n; -// } - - /** - * @return the number of elements in this pileup - */ - public int size() { - return size; - } - - /** - * @return the location of this pileup - */ - public GenomeLoc getLocation() { - return loc; - } - - - - public String getShortPileupString() { - // In the pileup format, each extended event line has genomic position (chromosome name and offset), - // reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing - // insertion, deletion or no-event, respectively. - return String.format("%s %s E %s", - getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - new String(getEvents()) ); - } - - - // -------------------------------------------------------- - // - // Convenience functions that may be slow - // - // -------------------------------------------------------- - - /** - * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time - * @return - */ - public List getReads() { - List reads = new ArrayList(size()); - for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); } - return reads; - } - - /** - * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time - * @return - */ - public List getOffsets() { - List offsets = new ArrayList(size()); - for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); } - return offsets; - } - - /** - * Returns an array of the events in this pileup ('I', 'D', or '.'). 
Note this call costs O(n) and allocates fresh array each time - * @return - */ - public byte[] getEvents() { - byte[] v = new byte[size()]; - int i = 0; - for ( ExtendedEventPileupElement e : this ) { - switch ( e.getType() ) { - case INSERTION: v[i] = 'I'; break; - case DELETION: v[i] = 'D'; break; - case NOEVENT: v[i] = '.'; break; - default: throw new StingException("Unknown event type encountered: "+e.getType()); - } - i++; - } - return v; - } - - /** A shortcut for getEventStringsWithCounts(null); - * - * @return - */ - public List> getEventStringsWithCounts() { - return getEventStringsWithCounts(null); - } - - /** Returns String representation of all distinct extended events (indels) at the site along with - * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for - * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual - * deleted sequence will be used in the string representation (e.g. "-AAC"). - * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and - * extending far enough to accomodate the longest deletion (i.e. 
size of refBases must be at least 1+) - * @return list of distinct events; first element of a pair is a string representation of the event, second element - * gives the number of reads, in which that event was observed - */ - public List> getEventStringsWithCounts(byte[] refBases) { - Map events = new HashMap(); - - for ( ExtendedEventPileupElement e : this ) { - Integer cnt; - String indel = null; - switch ( e.getType() ) { - case INSERTION: - indel = "+"+e.getEventBases(); - break; - case DELETION: - indel = getDeletionString(e.getEventLength(),refBases); - break; - case NOEVENT: continue; - default: throw new StingException("Unknown event type encountered: "+e.getType()); - } - - cnt = events.get(indel); - if ( cnt == null ) events.put(indel,1); - else events.put(indel,cnt.intValue()+1); - } - - List> eventList = new ArrayList>(events.size()); - for ( Map.Entry m : events.entrySet() ) { - eventList.add( new Pair(m.getKey(),m.getValue())); - } - return eventList; - } - - /** - * Builds string representation of the deletion event. If refBases is null, the representation will be - * "D" (e.g. "5D"); if the reference bases are provided, a verbose representation (e.g. "-AAC") - * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted - * base(s) are refBase[1], refBase[2], ...), and the length of the passed array must be sufficient to accomodate the - * deletion length (i.e. size of refBase must be at least length+1). 
- * @param length - * @param refBases - * @return - */ - private String getDeletionString(int length, byte[] refBases) { - if ( refBases == null ) { - return Integer.toString(length)+"D"; // if we do not have reference bases, we can only report something like "5D" - } else { - return "-"+new String(refBases,1,length).toUpperCase(); - } - } - - /** - * Get an array of the mapping qualities - * @return - */ - public byte[] getMappingQuals() { - byte[] v = new byte[size()]; - int i = 0; - for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); } - return v; - } - - - // - // Private functions for printing pileups - // - private String getMappingQualsString() { - return quals2String(getMappingQuals()); - } - - private static String quals2String( byte[] quals ) { - StringBuilder qualStr = new StringBuilder(); - for ( int qual : quals ) { - qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea - char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 - qualStr.append(qualChar); - } - - return qualStr.toString(); - } - -} +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.Iterator; +import java.util.List; + +import net.sf.samtools.SAMRecord; + +/** + * A clean interface for working with extended event pileups. + * + * @author mhanna + * @version 0.1 + */ +public interface ReadBackedExtendedEventPileup extends Iterable { + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. + * + * @return + */ + public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads(); + + public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ); + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. + * + * @param desiredCoverage + * @return + */ + public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage); + + /** + * Returns the number of deletion events in this pileup + * + * @return + */ + public int getNumberOfDeletions(); + + /** + * Returns the number of insertion events in this pileup + * + * @return + */ + public int getNumberOfInsertions(); + + /** Returns the length of the longest deletion observed at the site this + * pileup is associated with (NOTE: by convention, both insertions and deletions + * are associated with genomic location immediately before the actual event). If + * there are no deletions at the site, returns 0. 
+ * @return + */ + public int getMaxDeletionLength(); + + /** + * Returns the number of mapping quality zero reads in this pileup. + * @return + */ + public int getNumberOfMappingQualityZeroReads(); + + /** + * @return the number of elements in this pileup + */ + public int size(); + + /** + * @return the location of this pileup + */ + public GenomeLoc getLocation(); + + public String getShortPileupString(); + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + public List getReads(); + + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + public List getOffsets(); + + /** + * Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time + * @return + */ + public byte[] getEvents(); + + /** A shortcut for getEventStringsWithCounts(null); + * + * @return + */ + public List> getEventStringsWithCounts(); + + /** Returns String representation of all distinct extended events (indels) at the site along with + * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for + * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual + * deleted sequence will be used in the string representation (e.g. "-AAC"). + * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and + * extending far enough to accomodate the longest deletion (i.e. 
size of refBases must be at least 1+) + * @return list of distinct events; first element of a pair is a string representation of the event, second element + * gives the number of reads, in which that event was observed + */ + public List> getEventStringsWithCounts(byte[] refBases); + + /** + * Get an array of the mapping qualities + * @return + */ + public byte[] getMappingQuals(); +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java old mode 100755 new mode 100644 index 7883230e8..6f480f7f6 --- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -1,157 +1,42 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.utils.pileup; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.iterators.IterableIterator; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ExtendedPileupElement; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.*; - +import org.broadinstitute.sting.gatk.iterators.IterableIterator; import net.sf.samtools.SAMRecord; +import java.util.List; + /** - * Version two file implementing pileups of bases in reads at a locus. + * A data retrieval interface for accessing parts of the pileup. * - * @author Mark DePristo + * @author mhanna + * @version 0.1 */ -public class ReadBackedPileup implements Iterable { - private GenomeLoc loc = null; - private ArrayList pileup = null; - - private int size = 0; // cached value of the size of the pileup - private int nDeletions = 0; // cached value of the number of deletions - private int nMQ0Reads = 0; // cached value of the number of MQ0 reads - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This pileup will contain a list, in order of the reads, of the piled bases at - * reads[i] for all i in offsets. Does not make a copy of the data, so it's not safe to - * go changing the reads. 
- * - * @param loc - * @param reads - * @param offsets - */ - public ReadBackedPileup(GenomeLoc loc, List reads, List offsets ) { - this(loc, readsOffsets2Pileup(reads, offsets)); - } - - public ReadBackedPileup(GenomeLoc loc, List reads, int offset ) { - this(loc, readsOffsets2Pileup(reads, offset)); - } - - /** - * Create a new version of a read backed pileup at loc without any aligned reads - * - */ - public ReadBackedPileup(GenomeLoc loc ) { - this(loc, new ArrayList(0)); - } - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a - * pointer to pileup. Don't go changing the data in pileup. - * - */ - public ReadBackedPileup(GenomeLoc loc, ArrayList pileup ) { - if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedPileup2"); - if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedPileup2"); - - this.loc = loc; - this.pileup = pileup; - - calculatedCachedData(); - } - - /** - * Optimization of above constructor where all of the cached data is provided - * @param loc - * @param pileup - */ - public ReadBackedPileup(GenomeLoc loc, ArrayList pileup, int size, int nDeletions, int nMQ0Reads ) { - if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedPileup2"); - if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedPileup2"); - - this.loc = loc; - this.pileup = pileup; - this.size = size; - this.nDeletions = nDeletions; - this.nMQ0Reads = nMQ0Reads; - } - - - /** - * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, - * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling - * sizes, nDeletion, etc. over and over potentially. 
- */ - private void calculatedCachedData() { - size = 0; - nDeletions = 0; - nMQ0Reads = 0; - - for ( PileupElement p : this ) { - size++; - if ( p.isDeletion() ) { - nDeletions++; - } - if ( p.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; - } - } - } - - - /** - * Helper routine for converting reads and offset lists to a PileupElement list. - * - * @param reads - * @param offsets - * @return - */ - private static ArrayList readsOffsets2Pileup(List reads, List offsets ) { - if ( reads == null ) throw new StingException("Illegal null read list in ReadBackedPileup2"); - if ( offsets == null ) throw new StingException("Illegal null offsets list in ReadBackedPileup2"); - if ( reads.size() != offsets.size() ) throw new StingException("Reads and offset lists have different sizes!"); - - ArrayList pileup = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add( new PileupElement( reads.get(i), offsets.get(i) ) ); - } - - return pileup; - } - - /** - * Helper routine for converting reads and a single offset to a PileupElement list. - * - * @param reads - * @param offset - * @return - */ - private static ArrayList readsOffsets2Pileup(List reads, int offset ) { - if ( reads == null ) throw new StingException("Illegal null read list in ReadBackedPileup2"); - if ( offset < 0 ) throw new StingException("Illegal offset < 0 ReadBackedPileup2"); - - ArrayList pileup = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - pileup.add( new PileupElement( reads.get(i), offset ) ); - } - - return pileup; - } - - // -------------------------------------------------------- - // - // Special 'constructors' - // - // -------------------------------------------------------- - +public interface ReadBackedPileup extends Iterable { /** * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this * does not copy the data, so both ReadBackedPileups should not be changed. 
Doesn't make an unnecessary copy @@ -159,20 +44,7 @@ public class ReadBackedPileup implements Iterable { * * @return */ - public ReadBackedPileup getPileupWithoutDeletions() { - if ( getNumberOfDeletions() > 0 ) { - ArrayList filteredPileup = new ArrayList(); - - for ( PileupElement p : pileup ) { - if ( !p.isDeletion() ) { - filteredPileup.add(p); - } - } - return new ReadBackedPileup(loc, filteredPileup); - } else { - return this; - } - } + public ReadBackedPileup getPileupWithoutDeletions(); /** * Returns a new ReadBackedPileup where only one read from an overlapping read @@ -182,32 +54,7 @@ public class ReadBackedPileup implements Iterable { * * @return the newly filtered pileup */ - public ReadBackedPileup getOverlappingFragementFilteredPileup() { - Map filteredPileup = new HashMap(); - - for ( PileupElement p : pileup ) { - String readName = p.getRead().getReadName(); - - // if we've never seen this read before, life is good - if (!filteredPileup.containsKey(readName)) { - filteredPileup.put(readName, p); - } else { - PileupElement existing = filteredPileup.get(readName); - - // if the reads disagree at this position, throw them both out. Otherwise - // keep the element with the higher quality score - if (existing.getBase() != p.getBase()) { - filteredPileup.remove(readName); - } else { - if (existing.getQual() < p.getQual()) { - filteredPileup.put(readName, p); - } - } - } - } - - return new ReadBackedPileup(loc, new ArrayList(filteredPileup.values())); - } + public ReadBackedPileup getOverlappingFragmentFilteredPileup(); /** * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. 
Note that this @@ -216,20 +63,7 @@ public class ReadBackedPileup implements Iterable { * * @return */ - public ReadBackedPileup getPileupWithoutMappingQualityZeroReads() { - - if ( getNumberOfMappingQualityZeroReads() > 0 ) { - ArrayList filteredPileup = new ArrayList(); - for ( PileupElement p : pileup ) { - if ( p.getRead().getMappingQuality() > 0 ) { - filteredPileup.add(p); - } - } - return new ReadBackedPileup(loc, filteredPileup); - } else { - return this; - } - } + public ReadBackedPileup getPileupWithoutMappingQualityZeroReads(); /** Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. @@ -237,35 +71,21 @@ public class ReadBackedPileup implements Iterable { * @param minMapQ * @return */ - public ReadBackedPileup getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ) { - ArrayList filteredPileup = new ArrayList(); - - for ( PileupElement p : pileup ) { - if ( p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ) ) { - filteredPileup.add(p); - } - } - - return new ReadBackedPileup(loc, filteredPileup); - } + public ReadBackedPileup getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ); /** Returns subset of this pileup that contains only bases with quality >= minBaseQ. * This method allocates and returns a new instance of ReadBackedPileup. * @param minBaseQ * @return */ - public ReadBackedPileup getBaseFilteredPileup( int minBaseQ ) { - return getBaseAndMappingFilteredPileup(minBaseQ, -1); - } - + public ReadBackedPileup getBaseFilteredPileup( int minBaseQ ); + /** Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. * This method allocates and returns a new instance of ReadBackedPileup. 
* @param minMapQ * @return */ - public ReadBackedPileup getMappingFilteredPileup( int minMapQ ) { - return getBaseAndMappingFilteredPileup(-1, minMapQ); - } + public ReadBackedPileup getMappingFilteredPileup( int minMapQ ); /** * Returns a pileup randomly downsampled to the desiredCoverage. @@ -273,97 +93,36 @@ public class ReadBackedPileup implements Iterable { * @param desiredCoverage * @return */ - public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { - if ( size() <= desiredCoverage ) - return this; - - // randomly choose numbers corresponding to positions in the reads list - Random generator = new Random(); - TreeSet positions = new TreeSet(); - for ( int i = 0; i < desiredCoverage; /* no update */ ) { - if ( positions.add(generator.nextInt(pileup.size())) ) - i++; - } - - Iterator positionIter = positions.iterator(); - ArrayList downsampledPileup = new ArrayList(); - - while ( positionIter.hasNext() ) { - int nextReadToKeep = (Integer)positionIter.next(); - downsampledPileup.add(pileup.get(nextReadToKeep)); - } - - return new ReadBackedPileup(getLocation(), downsampledPileup); - } - - // -------------------------------------------------------- - // - // iterators - // - // -------------------------------------------------------- + public ReadBackedPileup getDownsampledPileup(int desiredCoverage); /** - * The best way to access PileupElements where you only care about the bases and quals in the pileup. - * - * for (PileupElement p : this) { doSomething(p); } - * - * Provides efficient iteration of the data. - * - * @return + * Gets the particular subset of this pileup with the given sample name. + * @param sampleName Name of the sample to use. + * @return A subset of this pileup containing only reads with the given sample. 
*/ - public Iterator iterator() { - return pileup.iterator(); - } - - - /** - * The best way to access PileupElements where you only care not only about bases and quals in the pileup - * but also need access to the index of the pileup element in the pile. - * - * for (ExtendedPileupElement p : this) { doSomething(p); } - * - * Provides efficient iteration of the data. - * - * @return - */ - - // todo -- reimplement efficiently - public IterableIterator extendedForeachIterator() { - ArrayList x = new ArrayList(size()); - int i = 0; - for ( PileupElement pile : this ) { - x.add(new ExtendedPileupElement(pile.getRead(), pile.getOffset(), i++, this)); - } - - return new IterableIterator(x.iterator()); - } + public ReadBackedPileup getPileupForSample(String sampleName); + + // todo -- delete or make private + public IterableIterator extendedForeachIterator(); /** * Simple useful routine to count the number of deletion bases in this pileup * * @return */ - public int getNumberOfDeletions() { - return nDeletions; - } + public int getNumberOfDeletions(); - public int getNumberOfMappingQualityZeroReads() { - return nMQ0Reads; - } + public int getNumberOfMappingQualityZeroReads(); /** * @return the number of elements in this pileup */ - public int size() { - return size; - } + public int size(); /** * @return the location of this pileup */ - public GenomeLoc getLocation() { - return loc; - } + public GenomeLoc getLocation(); /** * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according @@ -371,143 +130,49 @@ public class ReadBackedPileup implements Iterable { * * @return */ - public int[] getBaseCounts() { - int[] counts = new int[4]; - for ( PileupElement pile : this ) { - // skip deletion sites - if ( ! 
pile.isDeletion() ) { - int index = BaseUtils.simpleBaseToBaseIndex((char)pile.getBase()); - if (index != -1) - counts[index]++; - } - } - - return counts; - } + public int[] getBaseCounts(); /** * Somewhat expensive routine that returns true if any base in the pileup has secondary bases annotated * @return */ - public boolean hasSecondaryBases() { - for ( PileupElement pile : this ) { - // skip deletion sites - if ( ! pile.isDeletion() && BaseUtils.isRegularBase((char)pile.getSecondBase()) ) - return true; - } + public boolean hasSecondaryBases(); - return false; - } - - public String getPileupString(char ref) { - // In the pileup format, each line represents a genomic position, consisting of chromosome name, - // coordinate, reference base, read bases, read qualities and alignment mapping qualities. - return String.format("%s %s %c %s %s", - getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - ref, // reference base - new String(getBases()), - getQualsString()); - } - - - // -------------------------------------------------------- - // - // Convenience functions that may be slow - // - // -------------------------------------------------------- + public String getPileupString(char ref); /** * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time * @return */ - public List getReads() { - List reads = new ArrayList(size()); - for ( PileupElement pile : this ) { reads.add(pile.getRead()); } - return reads; - } + public List getReads(); /** * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time * @return */ - public List getOffsets() { - List offsets = new ArrayList(size()); - for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } - return offsets; - } + public List getOffsets(); /** * Returns an array of the bases in this pileup. 
Note this call costs O(n) and allocates fresh array each time * @return */ - public byte[] getBases() { - byte[] v = new byte[size()]; - for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getBase(); } - return v; - } + public byte[] getBases(); /** * Returns an array of the secondary bases in this pileup. Note this call costs O(n) and allocates fresh array each time * @return */ - public byte[] getSecondaryBases() { - byte[] v = new byte[size()]; - for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getSecondBase(); } - return v; - } + public byte[] getSecondaryBases(); - /** - * Returns an array of the quals in this pileup. Note this call costs O(n) and allocates fresh array each time - * @return - */ - public byte[] getQuals() { - byte[] v = new byte[size()]; - for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getQual(); } - return v; - } + /** + * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * @return + */ + public byte[] getQuals(); /** * Get an array of the mapping qualities * @return */ - public byte[] getMappingQuals() { - byte[] v = new byte[size()]; - for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = (byte)pile.getRead().getMappingQuality(); } - return v; - } - - - // - // Private functions for printing pileups - // - private String getMappingQualsString() { - return quals2String(getMappingQuals()); - } - - private static String quals2String( byte[] quals ) { - StringBuilder qualStr = new StringBuilder(); - for ( int qual : quals ) { - qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea - char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 - qualStr.append(qualChar); - } - - return qualStr.toString(); - } - - private String getQualsString() { - return quals2String(getQuals()); - } - - public String toString() { - StringBuilder s = new StringBuilder(); - s.append(getLocation()); - s.append(": "); - - for ( PileupElement p : this ) { - s.append((char)p.getBase()); - } - - return s.toString(); - } + public byte[] getMappingQuals(); } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedExtendedEventPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedExtendedEventPileup.java new file mode 100644 index 000000000..72d57e964 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedExtendedEventPileup.java @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the 
Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +import net.sf.samtools.SAMRecord; + +/** + * Extended event pileups, split out by sample. + * + * @author mhanna + * @version 0.1 + */ +public class SampleSplitReadBackedExtendedEventPileup implements ReadBackedExtendedEventPileup { + private GenomeLoc loc = null; + private Map pileupBySample = null; + + private int size = 0; // cached value of the size of the pileup + private int maxDeletionLength = 0; // cached value of the length of the longest deletion observed at the site + private int nDeletions = 0; // cached value of the number of deletions + private int nInsertions = 0; + private int nMQ0Reads = 0; // cached value of the number of MQ0 reads + + public SampleSplitReadBackedExtendedEventPileup(GenomeLoc loc) { + this(loc,new HashMap()); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * @param loc Location of this pileup. + * @param pileup Pileup data, split by sample name. 
+     */
+    public SampleSplitReadBackedExtendedEventPileup(GenomeLoc loc, Map pileup) {
+        if ( loc == null ) throw new StingException("Illegal null genomeloc in SampleSplitReadBackedExtendedEventPileup");   // fixed: message previously named UnifiedReadBackedPileup
+        if ( pileup == null ) throw new StingException("Illegal null pileup in SampleSplitReadBackedExtendedEventPileup");   // fixed: message previously named UnifiedReadBackedPileup
+
+        this.loc = loc;
+        this.pileupBySample = pileup;
+        for(ReadBackedExtendedEventPileup samplePileup: pileup.values()) {   // renamed: loop variable was shadowing the 'pileupBySample' field
+            this.size += samplePileup.size();
+            this.nDeletions += samplePileup.getNumberOfDeletions();
+            this.nInsertions += samplePileup.getNumberOfInsertions();   // BUGFIX: nInsertions was never accumulated, so getNumberOfInsertions() always returned 0
+            this.nMQ0Reads += samplePileup.getNumberOfMappingQualityZeroReads();
+        }
+    }
+
+    private void addPileup(final String sampleName, ReadBackedExtendedEventPileup pileup) {
+        pileupBySample.put(sampleName,pileup);
+
+        size += pileup.size();
+        nDeletions += pileup.getNumberOfDeletions();
+        nInsertions += pileup.getNumberOfInsertions();   // BUGFIX: keep the cached insertion count in sync, matching the other totals
+        nMQ0Reads += pileup.getNumberOfMappingQualityZeroReads();
+    }
+
+    @Override
+    public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() {
+        if ( getNumberOfMappingQualityZeroReads() > 0 ) {
+            SampleSplitReadBackedExtendedEventPileup filteredPileup = new SampleSplitReadBackedExtendedEventPileup(loc);
+
+            for (Map.Entry entry: pileupBySample.entrySet()) {
+                String sampleName = entry.getKey();
+                ReadBackedExtendedEventPileup pileup = entry.getValue();
+                filteredPileup.addPileup(sampleName,pileup.getPileupWithoutMappingQualityZeroReads());
+            }
+
+            return filteredPileup;
+        } else {
+            return this;
+        }
+    }
+
+    @Override
+    public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) {
+        SampleSplitReadBackedExtendedEventPileup filteredPileup = new SampleSplitReadBackedExtendedEventPileup(loc);
+
+        for (Map.Entry entry: pileupBySample.entrySet()) {
+            String sampleName = entry.getKey();
+            ReadBackedExtendedEventPileup pileup = entry.getValue();
+            filteredPileup.addPileup(sampleName,pileup.getMappingFilteredPileup(minMapQ));
+        }
+
+        return filteredPileup;
+    }
+
+    /**
+     * Returns a pileup randomly downsampled to the desiredCoverage.
+     *
+     * @param desiredCoverage
+     * @return
+     */
+    @Override
+    public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) {
+        if ( size() <= desiredCoverage )
+            return this;
+
+        // randomly choose numbers corresponding to positions in the reads list
+        Random generator = new Random();
+        TreeSet positions = new TreeSet();
+        for ( int i = 0; i < desiredCoverage; /* no update */ ) {
+            if ( positions.add(generator.nextInt(size)) )
+                i++;
+        }
+
+        SampleSplitReadBackedExtendedEventPileup downsampledPileup = new SampleSplitReadBackedExtendedEventPileup(getLocation());
+        int current = 0;
+
+        for(Map.Entry entry: this.pileupBySample.entrySet()) {
+            String sampleName = entry.getKey();
+            ReadBackedExtendedEventPileup pileup = entry.getValue();
+
+            List filteredPileup = new ArrayList();
+            for(ExtendedEventPileupElement p: pileup) {
+                // BUGFIX: 'current' must advance once per element; it was previously
+                if(positions.contains(current))
+                    filteredPileup.add(p);
+                current++;   // incremented once per sample, keeping all-or-none of each sample's elements
+            }
+
+            if(!filteredPileup.isEmpty())
+                downsampledPileup.addPileup(sampleName,new UnifiedReadBackedExtendedEventPileup(loc,filteredPileup));
+        }
+
+        return downsampledPileup;
+    }
+
+    @Override
+    public Iterator iterator() {
+        return new ExtendedEventCastingIterator(new MergingPileupElementIterator(pileupBySample));
+    }
+
+    /**
+     * Returns the number of deletion events in this pileup
+     *
+     * @return
+     */
+    @Override
+    public int getNumberOfDeletions() {
+        return nDeletions;
+    }
+
+    /**
+     * Returns the number of insertion events in this pileup
+     *
+     * @return
+     */
+    @Override
+    public int getNumberOfInsertions() {
+        return nInsertions;
+    }
+
+    /** Returns the length of the longest deletion observed at the site this
+     * pileup is associated with (NOTE: by convention, both insertions and deletions
+     * are associated with genomic location immediately before the actual event). If
+     * there are no deletions at the site, returns 0.
+ * @return + */ + @Override + public int getMaxDeletionLength() { + int maxDeletionLength = 0; + for(ReadBackedExtendedEventPileup pileup: pileupBySample.values()) + maxDeletionLength = Math.max(maxDeletionLength,pileup.getMaxDeletionLength()); + return maxDeletionLength; + } + + /** + * Returns the number of mapping quality zero reads in this pileup. + * @return + */ + @Override + public int getNumberOfMappingQualityZeroReads() { + return nMQ0Reads; + } + + /** + * @return the number of elements in this pileup + */ + @Override + public int size() { + return size; + } + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + @Override + public String getShortPileupString() { + // In the pileup format, each extended event line has genomic position (chromosome name and offset), + // reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing + // insertion, deletion or no-event, respectively. + return String.format("%s %s E %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + new String(getEvents()) ); + } + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); } + return reads; + } + + /** + * Returns a list of the offsets in this pileup. 
Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); } + return offsets; + } + + /** + * Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getEvents() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { + switch ( e.getType() ) { + case INSERTION: v[i] = 'I'; break; + case DELETION: v[i] = 'D'; break; + case NOEVENT: v[i] = '.'; break; + default: throw new StingException("Unknown event type encountered: "+e.getType()); + } + i++; + } + return v; + } + + /** A shortcut for getEventStringsWithCounts(null); + * + * @return + */ + @Override + public List> getEventStringsWithCounts() { + return getEventStringsWithCounts(null); + } + + /** Returns String representation of all distinct extended events (indels) at the site along with + * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for + * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual + * deleted sequence will be used in the string representation (e.g. "-AAC"). + * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and + * extending far enough to accommodate the longest deletion (i.e. 
size of refBases must be at least 1+) + * @return list of distinct events; first element of a pair is a string representation of the event, second element + * gives the number of reads, in which that event was observed + */ + @Override + public List> getEventStringsWithCounts(byte[] refBases) { + Map events = new HashMap(); + + for ( ExtendedEventPileupElement e : this ) { + Integer cnt; + String indel = null; + switch ( e.getType() ) { + case INSERTION: + indel = "+"+e.getEventBases(); + break; + case DELETION: + indel = UnifiedReadBackedExtendedEventPileup.getDeletionString(e.getEventLength(),refBases); + break; + case NOEVENT: continue; + default: throw new StingException("Unknown event type encountered: "+e.getType()); + } + + cnt = events.get(indel); + if ( cnt == null ) events.put(indel,1); + else events.put(indel,cnt.intValue()+1); + } + + List> eventList = new ArrayList>(events.size()); + for ( Map.Entry m : events.entrySet() ) { + eventList.add( new Pair(m.getKey(),m.getValue())); + } + return eventList; + } + + /** + * Get an array of the mapping qualities + * @return + */ + @Override + public byte[] getMappingQuals() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); } + return v; + } + + private class ExtendedEventCastingIterator implements Iterator { + private final Iterator wrappedIterator; + + public ExtendedEventCastingIterator(Iterator iterator) { + this.wrappedIterator = iterator; + } + + public boolean hasNext() { return wrappedIterator.hasNext(); } + public ExtendedEventPileupElement next() { return (ExtendedEventPileupElement)wrappedIterator.next(); } + public void remove() { wrappedIterator.remove(); } + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedPileup.java new file mode 100644 index 000000000..52e6db31a --- /dev/null +++ 
b/java/src/org/broadinstitute/sting/utils/pileup/SampleSplitReadBackedPileup.java @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.SAMRecord; + +import java.util.*; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.gatk.iterators.IterableIterator; + +/** + * A read-backed pileup that internally divides the dataset based + * on sample, for super efficient per-sample access. + * + * TODO: there are a few functions that are shared between UnifiedReadBackedPileup and SampleSplitReadBackedPileup. + * TODO: refactor these into a common class. 
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class SampleSplitReadBackedPileup implements ReadBackedPileup {
+    private GenomeLoc loc = null;
+    private Map pileupBySample = null;
+
+    private int size = 0;                // cached value of the size of the pileup
+    private int nDeletions = 0;          // cached value of the number of deletions
+    private int nMQ0Reads = 0;           // cached value of the number of MQ0 reads
+
+    public SampleSplitReadBackedPileup(GenomeLoc loc) {
+        this(loc,new HashMap());
+    }
+
+    /**
+     * Optimization of above constructor where all of the cached data is provided
+     * @param loc Location of this pileup.
+     * @param pileup Pileup data, split by sample name.
+     */
+    public SampleSplitReadBackedPileup(GenomeLoc loc, Map pileup) {
+        if ( loc == null ) throw new StingException("Illegal null genomeloc in SampleSplitReadBackedPileup");   // fixed: message previously named UnifiedReadBackedPileup
+        if ( pileup == null ) throw new StingException("Illegal null pileup in SampleSplitReadBackedPileup");   // fixed: message previously named UnifiedReadBackedPileup
+
+        this.loc = loc;
+        this.pileupBySample = pileup;
+        for(ReadBackedPileup samplePileup: pileup.values()) {   // renamed: loop variable was shadowing the 'pileupBySample' field
+            this.size += samplePileup.size();
+            this.nDeletions += samplePileup.getNumberOfDeletions();
+            this.nMQ0Reads += samplePileup.getNumberOfMappingQualityZeroReads();
+        }
+    }
+
+    private void addPileup(final String sampleName, ReadBackedPileup pileup) {
+        pileupBySample.put(sampleName,pileup);
+
+        size += pileup.size();
+        nDeletions += pileup.getNumberOfDeletions();
+        nMQ0Reads += pileup.getNumberOfMappingQualityZeroReads();
+    }
+
+    /**
+     * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup.  Note that this
+     * does not copy the data, so both ReadBackedPileups should not be changed.  Doesn't make an unnecessary copy
+     * of the pileup (just returns this) if there are no deletions in the pileup.
+ * + * @return + */ + @Override + public ReadBackedPileup getPileupWithoutDeletions() { + if ( getNumberOfDeletions() > 0 ) { + SampleSplitReadBackedPileup filteredPileup = new SampleSplitReadBackedPileup(loc); + + for (Map.Entry entry: pileupBySample.entrySet()) { + String sampleName = entry.getKey(); + ReadBackedPileup pileup = entry.getValue(); + filteredPileup.addPileup(sampleName,pileup.getPileupWithoutDeletions()); + } + + return filteredPileup; + } else { + return this; + } + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If the two reads in question disagree to their basecall, + * neither read is retained. If they agree on the base, the read with the higher + * quality observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileup getOverlappingFragmentFilteredPileup() { + SampleSplitReadBackedPileup filteredPileup = new SampleSplitReadBackedPileup(loc); + + for (Map.Entry entry: pileupBySample.entrySet()) { + String sampleName = entry.getKey(); + ReadBackedPileup pileup = entry.getValue(); + filteredPileup.addPileup(sampleName,pileup.getOverlappingFragmentFilteredPileup()); + } + + return filteredPileup; + } + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. 
+ * + * @return + */ + public ReadBackedPileup getPileupWithoutMappingQualityZeroReads() { + SampleSplitReadBackedPileup filteredPileup = new SampleSplitReadBackedPileup(loc); + + for (Map.Entry entry: pileupBySample.entrySet()) { + String sampleName = entry.getKey(); + ReadBackedPileup pileup = entry.getValue(); + filteredPileup.addPileup(sampleName,pileup.getPileupWithoutMappingQualityZeroReads()); + } + + return filteredPileup; + } + + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. + * @param minBaseQ + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ) { + SampleSplitReadBackedPileup filteredPileup = new SampleSplitReadBackedPileup(loc); + + for (Map.Entry entry: pileupBySample.entrySet()) { + String sampleName = entry.getKey(); + ReadBackedPileup pileup = entry.getValue(); + filteredPileup.addPileup(sampleName,pileup.getBaseAndMappingFilteredPileup(minBaseQ,minMapQ)); + } + + return filteredPileup; + } + + /** Returns subset of this pileup that contains only bases with quality >= minBaseQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * @param minBaseQ + * @return + */ + @Override + public ReadBackedPileup getBaseFilteredPileup( int minBaseQ ) { + return getBaseAndMappingFilteredPileup(minBaseQ, -1); + } + + /** Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getMappingFilteredPileup( int minMapQ ) { + return getBaseAndMappingFilteredPileup(-1, minMapQ); + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. 
+ * + * @param desiredCoverage + * @return + */ + @Override + public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { + if ( size() <= desiredCoverage ) + return this; + + // randomly choose numbers corresponding to positions in the reads list + Random generator = new Random(); + TreeSet positions = new TreeSet(); + for ( int i = 0; i < desiredCoverage; /* no update */ ) { + if ( positions.add(generator.nextInt(size)) ) + i++; + } + + SampleSplitReadBackedPileup downsampledPileup = new SampleSplitReadBackedPileup(getLocation()); + int current = 0; + + for(Map.Entry entry: this.pileupBySample.entrySet()) { + String sampleName = entry.getKey(); + ReadBackedPileup pileup = entry.getValue(); + + List filteredPileup = new ArrayList(); + for(PileupElement p: pileup) { + if(positions.contains(current)) + filteredPileup.add(p); + } + + if(!filteredPileup.isEmpty()) + downsampledPileup.addPileup(sampleName,new UnifiedReadBackedPileup(loc,filteredPileup)); + + current++; + } + + return downsampledPileup; + } + + @Override + public ReadBackedPileup getPileupForSample(String sampleName) { + return pileupBySample.containsKey(sampleName) ? pileupBySample.get(sampleName) : null; + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + * + * for (PileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + @Override + public Iterator iterator() { + return new MergingPileupElementIterator(pileupBySample); + } + + // todo -- why is this public? 
+ @Override + public IterableIterator extendedForeachIterator() { + ArrayList x = new ArrayList(size()); + int i = 0; + Iterator iterator = new MergingPileupElementIterator(pileupBySample); + while(iterator.hasNext()) { + PileupElement pile = iterator.next(); + x.add(new ExtendedPileupElement(pile.getRead(), pile.getOffset(), i++, this)); + } + + return new IterableIterator(x.iterator()); + } + + /** + * Simple useful routine to count the number of deletion bases in this pileup + * + * @return + */ + @Override + public int getNumberOfDeletions() { + return nDeletions; + } + + @Override + public int getNumberOfMappingQualityZeroReads() { + return nMQ0Reads; + } + + /** + * @return the number of elements in this pileup + */ + @Override + public int size() { + return size; + } + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + /** + * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according + * to BaseUtils.simpleBaseToBaseIndex for each base. + * + * @return + */ + @Override + public int[] getBaseCounts() { + int[] counts = new int[4]; + for(ReadBackedPileup pileup: pileupBySample.values()) { + int[] countsBySample = pileup.getBaseCounts(); + for(int i = 0; i < counts.length; i++) + counts[i] += countsBySample[i]; + } + return counts; + } + + /** + * Somewhat expensive routine that returns true if any base in the pileup has secondary bases annotated + * @return + */ + @Override + public boolean hasSecondaryBases() { + boolean hasSecondaryBases = false; + for(ReadBackedPileup pileup: pileupBySample.values()) + hasSecondaryBases |= pileup.hasSecondaryBases(); + return hasSecondaryBases; + } + + @Override + public String getPileupString(char ref) { + // In the pileup format, each line represents a genomic position, consisting of chromosome name, + // coordinate, reference base, read bases, read qualities and alignment mapping qualities. 
+ return String.format("%s %s %c %s %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + ref, // reference base + new String(getBases()), + getQualsString()); + } + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(size()); + for ( PileupElement pile : this ) { reads.add(pile.getRead()); } + return reads; + } + + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(size()); + for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } + return offsets; + } + + /** + * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getBases() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getBase(); } + return v; + } + + /** + * Returns an array of the secondary bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getSecondaryBases() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getSecondBase(); } + return v; + } + + /** + * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getQuals() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getQual(); } + return v; + } + + /** + * Get an array of the mapping qualities + * @return + */ + @Override + public byte[] getMappingQuals() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = (byte)pile.getRead().getMappingQuality(); } + return v; + } + + private String getQualsString() { + return UnifiedReadBackedPileup.quals2String(getQuals()); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedExtendedEventPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedExtendedEventPileup.java new file mode 100644 index 000000000..bc1f46429 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedExtendedEventPileup.java @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 29, 2009 + * Time: 12:25:55 PM + * To change this template use File | Settings | File Templates. + */ +public class UnifiedReadBackedExtendedEventPileup implements ReadBackedExtendedEventPileup { + private GenomeLoc loc = null; + private List pileup = null; + + private int size = 0; // cached value of the size of the pileup + private int maxDeletionLength = 0; // cached value of the length of the longest deletion observed at the site + private int nDeletions = 0; // cached value of the number of deletions + private int nInsertions = 0; + private int nMQ0Reads = 0; // cached value of the number of MQ0 reads + + /** + * Create a new version of a read backed pileup at loc without any aligned reads + * + */ + public UnifiedReadBackedExtendedEventPileup(GenomeLoc loc ) { + this(loc, new ArrayList(0)); + } + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a + * pointer to pileup. Don't go changing the data in pileup. 
+ * + */ + public UnifiedReadBackedExtendedEventPileup(GenomeLoc loc, List pileup ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); + + this.loc = loc; + this.pileup = pileup; + + calculatedCachedData(); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * @param loc + * @param pileup + */ + public UnifiedReadBackedExtendedEventPileup(GenomeLoc loc, List pileup, int size, + int maxDeletionLength, int nInsertions, int nDeletions, int nMQ0Reads ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); + + this.loc = loc; + this.pileup = pileup; + this.size = size; + this.maxDeletionLength = maxDeletionLength; + this.nDeletions = nDeletions; + this.nInsertions = nInsertions; + this.nMQ0Reads = nMQ0Reads; + } + + + /** + * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, + * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling + * sizes, nDeletion, etc. over and over potentially. 
+ */ + private void calculatedCachedData() { + size = 0; + nDeletions = 0; + nInsertions = 0; + nMQ0Reads = 0; + + for ( ExtendedEventPileupElement p : this ) { + + size++; + if ( p.isDeletion() ) { + nDeletions++; + maxDeletionLength = Math.max(maxDeletionLength, p.getEventLength()); + } else { + if ( p.isInsertion() ) nInsertions++; + } + + if ( p.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } + } + } + + + // -------------------------------------------------------- + // + // Special 'constructors' + // + // -------------------------------------------------------- + + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. + * + * @return + */ + @Override + public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() { + + if ( getNumberOfMappingQualityZeroReads() > 0 ) { + ArrayList filteredPileup = new ArrayList(); + for ( ExtendedEventPileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() > 0 ) { + filteredPileup.add(p); + } + } + return new UnifiedReadBackedExtendedEventPileup(loc, filteredPileup); + } else { + return this; + } + } + + @Override + public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) { + ArrayList filteredPileup = new ArrayList(); + + for ( ExtendedEventPileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() >= minMapQ ) { + filteredPileup.add(p); + } + } + + return new UnifiedReadBackedExtendedEventPileup(loc, filteredPileup); + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. 
+ * + * @param desiredCoverage + * @return + */ + @Override + public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) { + if ( size() <= desiredCoverage ) + return this; + + // randomly choose numbers corresponding to positions in the reads list + Random generator = new Random(); + TreeSet positions = new TreeSet(); + for ( int i = 0; i < desiredCoverage; /* no update */ ) { + if ( positions.add(generator.nextInt(pileup.size())) ) + i++; + } + + Iterator positionIter = positions.iterator(); + ArrayList downsampledPileup = new ArrayList(); + + while ( positionIter.hasNext() ) { + int nextReadToKeep = (Integer)positionIter.next(); + downsampledPileup.add(pileup.get(nextReadToKeep)); + } + + return new UnifiedReadBackedExtendedEventPileup(getLocation(), downsampledPileup); + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + * + * for (PileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + @Override + public Iterator iterator() { + return pileup.iterator(); + } + + + /** + * Returns the number of deletion events in this pileup + * + * @return + */ + @Override + public int getNumberOfDeletions() { + return nDeletions; + } + + /** + * Returns the number of insertion events in this pileup + * + * @return + */ + @Override + public int getNumberOfInsertions() { + return nInsertions; + } + + + /** Returns the length of the longest deletion observed at the site this + * pileup is associated with (NOTE: by convention, both insertions and deletions + * are associated with genomic location immediately before the actual event). If + * there are no deletions at the site, returns 0. 
+ * @return + */ + @Override + public int getMaxDeletionLength() { + return maxDeletionLength; + } + /** + * Returns the number of mapping quality zero reads in this pileup. + * @return + */ + @Override + public int getNumberOfMappingQualityZeroReads() { + return nMQ0Reads; + } + +// public int getNumberOfDeletions() { +// int n = 0; +// +// for ( int i = 0; i < size(); i++ ) { +// if ( getOffsets().get(i) != -1 ) { n++; } +// } +// +// return n; +// } + + /** + * @return the number of elements in this pileup + */ + @Override + public int size() { + return size; + } + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + @Override + public String getShortPileupString() { + // In the pileup format, each extended event line has genomic position (chromosome name and offset), + // reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing + // insertion, deletion or no-event, respectively. + return String.format("%s %s E %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + new String(getEvents()) ); + } + + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); } + return reads; + } + + /** + * Returns a list of the offsets in this pileup. 
Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); } + return offsets; + } + + /** + * Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getEvents() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { + switch ( e.getType() ) { + case INSERTION: v[i] = 'I'; break; + case DELETION: v[i] = 'D'; break; + case NOEVENT: v[i] = '.'; break; + default: throw new StingException("Unknown event type encountered: "+e.getType()); + } + i++; + } + return v; + } + + /** A shortcut for getEventStringsWithCounts(null); + * + * @return + */ + @Override + public List> getEventStringsWithCounts() { + return getEventStringsWithCounts(null); + } + + /** Returns String representation of all distinct extended events (indels) at the site along with + * observation counts (numbers of reads) for each distinct event. If refBases is null, a simple string representation for + * deletions will be generated as "D" (i.e. "5D"); if the reference bases are provided, the actual + * deleted sequence will be used in the string representation (e.g. "-AAC"). + * @param refBases reference bases, starting with the current locus (i.e. the one immediately before the indel), and + * extending far enough to accomodate the longest deletion (i.e. 
size of refBases must be at least 1+) + * @return list of distinct events; first element of a pair is a string representation of the event, second element + * gives the number of reads, in which that event was observed + */ + @Override + public List> getEventStringsWithCounts(byte[] refBases) { + Map events = new HashMap(); + + for ( ExtendedEventPileupElement e : this ) { + Integer cnt; + String indel = null; + switch ( e.getType() ) { + case INSERTION: + indel = "+"+e.getEventBases(); + break; + case DELETION: + indel = getDeletionString(e.getEventLength(),refBases); + break; + case NOEVENT: continue; + default: throw new StingException("Unknown event type encountered: "+e.getType()); + } + + cnt = events.get(indel); + if ( cnt == null ) events.put(indel,1); + else events.put(indel,cnt.intValue()+1); + } + + List> eventList = new ArrayList>(events.size()); + for ( Map.Entry m : events.entrySet() ) { + eventList.add( new Pair(m.getKey(),m.getValue())); + } + return eventList; + } + + /** + * Builds string representation of the deletion event. If refBases is null, the representation will be + * "D" (e.g. "5D"); if the reference bases are provided, a verbose representation (e.g. "-AAC") + * will be generated. NOTE: refBases must start with the base prior to the actual deletion (i.e. deleted + * base(s) are refBase[1], refBase[2], ...), and the length of the passed array must be sufficient to accomodate the + * deletion length (i.e. size of refBase must be at least length+1). 
+ * @param length + * @param refBases + * @return + */ + static String getDeletionString(int length, byte[] refBases) { + if ( refBases == null ) { + return Integer.toString(length)+"D"; // if we do not have reference bases, we can only report something like "5D" + } else { + return "-"+new String(refBases,1,length).toUpperCase(); + } + } + + /** + * Get an array of the mapping qualities + * @return + */ + @Override + public byte[] getMappingQuals() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); } + return v; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedPileup.java new file mode 100755 index 000000000..62a25e58e --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/UnifiedReadBackedPileup.java @@ -0,0 +1,550 @@ +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.gatk.iterators.IterableIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.*; + +import net.sf.samtools.SAMRecord; + +/** + * Version two file implementing pileups of bases in reads at a locus. + * + * @author Mark DePristo + */ +public class UnifiedReadBackedPileup implements ReadBackedPileup { + private GenomeLoc loc = null; + private List pileup = null; + + private int size = 0; // cached value of the size of the pileup + private int nDeletions = 0; // cached value of the number of deletions + private int nMQ0Reads = 0; // cached value of the number of MQ0 reads + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This pileup will contain a list, in order of the reads, of the piled bases at + * reads[i] for all i in offsets. 
Does not make a copy of the data, so it's not safe to + * go changing the reads. + * + * @param loc + * @param reads + * @param offsets + */ + public UnifiedReadBackedPileup(GenomeLoc loc, List reads, List offsets ) { + this(loc, readsOffsets2Pileup(reads, offsets)); + } + + public UnifiedReadBackedPileup(GenomeLoc loc, List reads, int offset ) { + this(loc, readsOffsets2Pileup(reads, offset)); + } + + /** + * Create a new version of a read backed pileup at loc without any aligned reads + * + */ + public UnifiedReadBackedPileup(GenomeLoc loc ) { + this(loc, new ArrayList(0)); + } + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a + * pointer to pileup. Don't go changing the data in pileup. + * + */ + public UnifiedReadBackedPileup(GenomeLoc loc, List pileup ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedPileup"); + + this.loc = loc; + this.pileup = pileup; + + calculatedCachedData(); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * @param loc + * @param pileup + */ + public UnifiedReadBackedPileup(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in UnifiedReadBackedPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in UnifiedReadBackedPileup"); + + this.loc = loc; + this.pileup = pileup; + this.size = size; + this.nDeletions = nDeletions; + this.nMQ0Reads = nMQ0Reads; + } + + + /** + * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, + * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling + * sizes, nDeletion, etc. 
over and over potentially. + */ + private void calculatedCachedData() { + size = 0; + nDeletions = 0; + nMQ0Reads = 0; + + for ( PileupElement p : this ) { + size++; + if ( p.isDeletion() ) { + nDeletions++; + } + if ( p.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } + } + } + + + /** + * Helper routine for converting reads and offset lists to a PileupElement list. + * + * @param reads + * @param offsets + * @return + */ + private static ArrayList readsOffsets2Pileup(List reads, List offsets ) { + if ( reads == null ) throw new StingException("Illegal null read list in UnifiedReadBackedPileup"); + if ( offsets == null ) throw new StingException("Illegal null offsets list in UnifiedReadBackedPileup"); + if ( reads.size() != offsets.size() ) throw new StingException("Reads and offset lists have different sizes!"); + + ArrayList pileup = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + pileup.add( new PileupElement( reads.get(i), offsets.get(i) ) ); + } + + return pileup; + } + + /** + * Helper routine for converting reads and a single offset to a PileupElement list. + * + * @param reads + * @param offset + * @return + */ + private static ArrayList readsOffsets2Pileup(List reads, int offset ) { + if ( reads == null ) throw new StingException("Illegal null read list in UnifiedReadBackedPileup"); + if ( offset < 0 ) throw new StingException("Illegal offset < 0 UnifiedReadBackedPileup"); + + ArrayList pileup = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + pileup.add( new PileupElement( reads.get(i), offset ) ); + } + + return pileup; + } + + // -------------------------------------------------------- + // + // Special 'constructors' + // + // -------------------------------------------------------- + + /** + * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. 
Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no deletions in the pileup. + * + * @return + */ + @Override + public ReadBackedPileup getPileupWithoutDeletions() { + if ( getNumberOfDeletions() > 0 ) { + ArrayList filteredPileup = new ArrayList(); + + for ( PileupElement p : pileup ) { + if ( !p.isDeletion() ) { + filteredPileup.add(p); + } + } + return new UnifiedReadBackedPileup(loc, filteredPileup); + } else { + return this; + } + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If the two reads in question disagree to their basecall, + * neither read is retained. If they agree on the base, the read with the higher + * quality observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileup getOverlappingFragmentFilteredPileup() { + Map filteredPileup = new HashMap(); + + for ( PileupElement p : pileup ) { + String readName = p.getRead().getReadName(); + + // if we've never seen this read before, life is good + if (!filteredPileup.containsKey(readName)) { + filteredPileup.put(readName, p); + } else { + PileupElement existing = filteredPileup.get(readName); + + // if the reads disagree at this position, throw them both out. Otherwise + // keep the element with the higher quality score + if (existing.getBase() != p.getBase()) { + filteredPileup.remove(readName); + } else { + if (existing.getQual() < p.getQual()) { + filteredPileup.put(readName, p); + } + } + } + } + + return new UnifiedReadBackedPileup(loc, new ArrayList(filteredPileup.values())); + } + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. 
+ * + * @return + */ + @Override + public ReadBackedPileup getPileupWithoutMappingQualityZeroReads() { + + if ( getNumberOfMappingQualityZeroReads() > 0 ) { + ArrayList filteredPileup = new ArrayList(); + for ( PileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() > 0 ) { + filteredPileup.add(p); + } + } + return new UnifiedReadBackedPileup(loc, filteredPileup); + } else { + return this; + } + } + + /** Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. + * @param minBaseQ + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getBaseAndMappingFilteredPileup( int minBaseQ, int minMapQ ) { + ArrayList filteredPileup = new ArrayList(); + + for ( PileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ) ) { + filteredPileup.add(p); + } + } + + return new UnifiedReadBackedPileup(loc, filteredPileup); + } + + /** Returns subset of this pileup that contains only bases with quality >= minBaseQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * @param minBaseQ + * @return + */ + @Override + public ReadBackedPileup getBaseFilteredPileup( int minBaseQ ) { + return getBaseAndMappingFilteredPileup(minBaseQ, -1); + } + + /** Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getMappingFilteredPileup( int minMapQ ) { + return getBaseAndMappingFilteredPileup(-1, minMapQ); + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. 
+ * + * @param desiredCoverage + * @return + */ + @Override + public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { + if ( size() <= desiredCoverage ) + return this; + + // randomly choose numbers corresponding to positions in the reads list + Random generator = new Random(); + TreeSet positions = new TreeSet(); + for ( int i = 0; i < desiredCoverage; /* no update */ ) { + if ( positions.add(generator.nextInt(pileup.size())) ) + i++; + } + + Iterator positionIter = positions.iterator(); + ArrayList downsampledPileup = new ArrayList(); + + while ( positionIter.hasNext() ) { + int nextReadToKeep = (Integer)positionIter.next(); + downsampledPileup.add(pileup.get(nextReadToKeep)); + } + + return new UnifiedReadBackedPileup(getLocation(), downsampledPileup); + } + + @Override + public ReadBackedPileup getPileupForSample(String sampleName) { + List filteredPileup = new ArrayList(); + for(PileupElement p: this) { + SAMRecord read = p.getRead(); + if(sampleName != null) { + if(read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) + filteredPileup.add(p); + } + else { + if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredPileup.add(p); + } + } + return filteredPileup.size()>0 ? new UnifiedReadBackedPileup(loc,filteredPileup) : null; + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + * + * for (PileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + @Override + public Iterator iterator() { + return pileup.iterator(); + } + + + /** + * The best way to access PileupElements where you only care not only about bases and quals in the pileup + * but also need access to the index of the pileup element in the pile. 
+ * + * for (ExtendedPileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + + // todo -- reimplement efficiently + // todo -- why is this public? + @Override + public IterableIterator extendedForeachIterator() { + ArrayList x = new ArrayList(size()); + int i = 0; + for ( PileupElement pile : this ) { + x.add(new ExtendedPileupElement(pile.getRead(), pile.getOffset(), i++, this)); + } + + return new IterableIterator(x.iterator()); + } + + /** + * Simple useful routine to count the number of deletion bases in this pileup + * + * @return + */ + @Override + public int getNumberOfDeletions() { + return nDeletions; + } + + @Override + public int getNumberOfMappingQualityZeroReads() { + return nMQ0Reads; + } + + /** + * @return the number of elements in this pileup + */ + @Override + public int size() { + return size; + } + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + /** + * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according + * to BaseUtils.simpleBaseToBaseIndex for each base. + * + * @return + */ + @Override + public int[] getBaseCounts() { + int[] counts = new int[4]; + for ( PileupElement pile : this ) { + // skip deletion sites + if ( ! pile.isDeletion() ) { + int index = BaseUtils.simpleBaseToBaseIndex((char)pile.getBase()); + if (index != -1) + counts[index]++; + } + } + + return counts; + } + + /** + * Somewhat expensive routine that returns true if any base in the pileup has secondary bases annotated + * @return + */ + @Override + public boolean hasSecondaryBases() { + for ( PileupElement pile : this ) { + // skip deletion sites + if ( ! 
pile.isDeletion() && BaseUtils.isRegularBase((char)pile.getSecondBase()) ) + return true; + } + + return false; + } + + @Override + public String getPileupString(char ref) { + // In the pileup format, each line represents a genomic position, consisting of chromosome name, + // coordinate, reference base, read bases, read qualities and alignment mapping qualities. + return String.format("%s %s %c %s %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + ref, // reference base + new String(getBases()), + getQualsString()); + } + + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(size()); + for ( PileupElement pile : this ) { reads.add(pile.getRead()); } + return reads; + } + + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(size()); + for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } + return offsets; + } + + /** + * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getBases() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getBase(); } + return v; + } + + /** + * Returns an array of the secondary bases in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getSecondaryBases() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getSecondBase(); } + return v; + } + + /** + * Returns an array of the quals in this pileup. Note this call costs O(n) and allocates fresh array each time + * @return + */ + @Override + public byte[] getQuals() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = pile.getQual(); } + return v; + } + + /** + * Get an array of the mapping qualities + * @return + */ + @Override + public byte[] getMappingQuals() { + byte[] v = new byte[size()]; + for ( ExtendedPileupElement pile : this.extendedForeachIterator() ) { v[pile.getPileupOffset()] = (byte)pile.getRead().getMappingQuality(); } + return v; + } + + + // + // Private functions for printing pileups + // + private String getMappingQualsString() { + return quals2String(getMappingQuals()); + } + + static String quals2String( byte[] quals ) { + StringBuilder qualStr = new StringBuilder(); + for ( int qual : quals ) { + qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea + char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 + qualStr.append(qualChar); + } + + return qualStr.toString(); + } + + private String getQualsString() { + return quals2String(getQuals()); + } + + @Override + public String toString() { + StringBuilder s = new StringBuilder(); + s.append(getLocation()); + s.append(": "); + + for ( PileupElement p : this ) { + s.append((char)p.getBase()); + } + + return s.toString(); + } +}