Refactor LIBS into component parts, expand unit tests, some code cleanup

-- Split out all of the inner classes of LIBS into separate independent classes
-- Split / add unit tests for many of these components.
-- Radically expand unit tests for SAMRecordAlignmentState (the lowest level piece of code) making sure at least some of it works
-- No need to change unit tests or integration tests.  No change in functionality.
-- Added (currently disabled) code to track all submitted reads to LIBS, but this isn't accessible or tested
This commit is contained in:
Mark DePristo 2013-01-06 11:55:18 -05:00
parent 2e5d38fd0e
commit b3ecfbfce8
16 changed files with 1704 additions and 1350 deletions

View File

@ -29,7 +29,7 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.utils.locusiterator.LegacyLocusIteratorByState;
import org.broadinstitute.sting.utils.locusiterator.legacy.LegacyLocusIteratorByState;
import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 1/5/13
* Time: 1:26 PM
* To change this template use File | Settings | File Templates.
*/
class LIBSDownsamplingInfo {
public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1);
final private boolean performDownsampling;
final private int toCoverage;
LIBSDownsamplingInfo(boolean performDownsampling, int toCoverage) {
this.performDownsampling = performDownsampling;
this.toCoverage = toCoverage;
}
public boolean isPerformDownsampling() {
return performDownsampling;
}
public int getToCoverage() {
return toCoverage;
}
}

View File

@ -25,8 +25,6 @@
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
@ -36,7 +34,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.downsampling.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.locusiterator.legacy.LegacyLocusIteratorByState;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -65,152 +63,10 @@ public class LocusIteratorByState extends LocusIterator {
private final GenomeLocParser genomeLocParser;
private final ArrayList<String> samples;
private final ReadStateManager readStates;
private final boolean keepSubmittedReads;
private final boolean includeReadsWithDeletionAtLoci;
protected static class SAMRecordState {
SAMRecord read;
int readOffset = -1; // how far are we offset from the start of the read bases?
int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
Cigar cigar = null;
int cigarOffset = -1;
CigarElement curElement = null;
int nCigarElements = 0;
int cigarElementCounter = -1; // how far are we into a single cigarElement
// The logical model for generating extended events is as follows: the "record state" implements the traversal
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
// can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
// deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
// if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).
public SAMRecordState(SAMRecord read) {
this.read = read;
cigar = read.getCigar();
nCigarElements = cigar.numCigarElements();
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}
public SAMRecord getRead() {
return read;
}
/**
* What is our current offset in the read's bases that aligns us with the reference genome?
*
* @return
*/
public int getReadOffset() {
return readOffset;
}
/**
* What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
*
* @return
*/
public int getGenomeOffset() {
return genomeOffset;
}
public int getGenomePosition() {
return read.getAlignmentStart() + getGenomeOffset();
}
public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
}
public CigarOperator getCurrentCigarOperator() {
return curElement.getOperator();
}
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
public CigarElement peekForwardOnGenome() {
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
}
public CigarElement peekBackwardOnGenome() {
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
}
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
cigarOffset++;
if (cigarOffset < nCigarElements) {
curElement = cigar.getCigarElement(cigarOffset);
cigarElementCounter = 0;
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
// current offset of this read is the next ref base after the end of the indel. This position will
// model a point on the reference somewhere after the end of the read.
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
return null;
}
}
boolean done = false;
switch (curElement.getOperator()) {
case H: // ignore hard clips
case P: // ignore pads
cigarElementCounter = curElement.getLength();
break;
case I: // insertion w.r.t. the reference
case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// should be the same as N case
genomeOffset++;
done = true;
break;
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
case M:
case EQ:
case X:
readOffset++;
genomeOffset++;
done = true;
break;
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
return done ? curElement.getOperator() : stepForwardOnGenome();
}
}
//final boolean DEBUG = false;
//final boolean DEBUG2 = false && DEBUG;
private ReadProperties readInfo;
private AlignmentContext nextAlignmentContext;
private boolean performDownsampling;
// -----------------------------------------------------------------------------------------------------------------
//
@ -218,22 +74,27 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------
public LocusIteratorByState(final Iterator<SAMRecord> samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection<String> samples) {
this.readInfo = readInformation;
public LocusIteratorByState(final Iterator<SAMRecord> samIterator,
final ReadProperties readInformation,
final GenomeLocParser genomeLocParser,
final Collection<String> samples) {
this(samIterator,
toDownsamplingInfo(readInformation),
readInformation.includeReadsWithDeletionAtLoci(),
genomeLocParser,
samples);
}
protected LocusIteratorByState(final Iterator<SAMRecord> samIterator,
final LIBSDownsamplingInfo downsamplingInfo,
final boolean includeReadsWithDeletionAtLoci,
final GenomeLocParser genomeLocParser,
final Collection<String> samples) {
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
this.genomeLocParser = genomeLocParser;
this.samples = new ArrayList<String>(samples);
// LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
// downsampling to coverage by sample. SAMDataSource will have refrained from applying
// any downsamplers to the read stream in this case, in the expectation that LIBS will
// manage the downsampling. The reason for this is twofold: performance (don't have to
// split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
// of reads (eg., using half of a read, and throwing the rest away).
this.performDownsampling = readInfo.getDownsamplingMethod() != null &&
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
readInfo.getDownsamplingMethod().toCoverage != null;
this.readStates = new ReadStateManager(samIterator);
this.keepSubmittedReads = false; // TODO -- HOOK UP SYSTEM
this.readStates = new ReadStateManager(samIterator, this.samples, downsamplingInfo, keepSubmittedReads);
// currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
// there's no read data. So we need to throw this error only when samIterator.hasNext() is true
@ -242,28 +103,19 @@ public class LocusIteratorByState extends LocusIterator {
}
}
/**
* For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
* for the system.
*/
public final static Collection<String> sampleListForSAMWithoutReadGroups() {
List<String> samples = new ArrayList<String>();
samples.add(null);
return samples;
}
@Override
public Iterator<AlignmentContext> iterator() {
return this;
}
@Override
public void close() {
//this.it.close();
}
@Override
public boolean hasNext() {
lazyLoadNextAlignmentContext();
return (nextAlignmentContext != null);
//if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
return nextAlignmentContext != null;
}
private GenomeLoc getLocation() {
@ -275,6 +127,8 @@ public class LocusIteratorByState extends LocusIterator {
// next() routine and associated collection operations
//
// -----------------------------------------------------------------------------------------------------------------
@Override
public AlignmentContext next() {
lazyLoadNextAlignmentContext();
if (!hasNext())
@ -299,7 +153,7 @@ public class LocusIteratorByState extends LocusIterator {
boolean hasBeenSampled = false;
for (final String sample : samples) {
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
final Iterator<SAMRecordAlignmentState> iterator = readStates.iterator(sample);
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
int size = 0; // number of elements in this sample's pileup
@ -307,7 +161,7 @@ public class LocusIteratorByState extends LocusIterator {
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
while (iterator.hasNext()) {
final SAMRecordState state = iterator.next(); // state object with the read/offset information
final SAMRecordAlignmentState state = iterator.next(); // state object with the read/offset information
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
@ -330,7 +184,7 @@ public class LocusIteratorByState extends LocusIterator {
if (op == CigarOperator.D) {
// TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
if (includeReadsWithDeletionAtLoci) { // only add deletions to the pileup if we are authorized to do so
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
size++;
nDeletions++;
@ -367,33 +221,11 @@ public class LocusIteratorByState extends LocusIterator {
}
}
// fast testing of position
private boolean readIsPastCurrentPosition(SAMRecord read) {
if (readStates.isEmpty())
return false;
else {
SAMRecordState state = readStates.getFirst();
SAMRecord ourRead = state.getRead();
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
}
}
/**
* Generic place to put per-base filters appropriate to LocusIteratorByState
*
* @param rec
* @param pos
* @return
*/
private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
return ReadUtils.isBaseInsideAdaptor(rec, pos);
}
private void updateReadStates() {
for (final String sample : samples) {
Iterator<SAMRecordState> it = readStates.iterator(sample);
Iterator<SAMRecordAlignmentState> it = readStates.iterator(sample);
while (it.hasNext()) {
SAMRecordState state = it.next();
SAMRecordAlignmentState state = it.next();
CigarOperator op = state.stepForwardOnGenome();
if (op == null) {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
@ -405,257 +237,42 @@ public class LocusIteratorByState extends LocusIterator {
}
}
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
// -----------------------------------------------------------------------------------------------------------------
//
// utility functions
//
// -----------------------------------------------------------------------------------------------------------------
protected class ReadStateManager {
private final PeekableIterator<SAMRecord> iterator;
private final SamplePartitioner samplePartitioner;
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
private int totalReadStates = 0;
public ReadStateManager(Iterator<SAMRecord> source) {
this.iterator = new PeekableIterator<SAMRecord>(source);
for (final String sample : samples) {
readStatesBySample.put(sample, new PerSampleReadStateManager());
}
samplePartitioner = new SamplePartitioner(performDownsampling);
}
/**
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
* for this iterator; if present, total read states will be decremented.
*
* @param sample The sample.
* @return Iterator over the reads associated with that sample.
*/
public Iterator<SAMRecordState> iterator(final String sample) {
return new Iterator<SAMRecordState>() {
private Iterator<SAMRecordState> wrappedIterator = readStatesBySample.get(sample).iterator();
public boolean hasNext() {
return wrappedIterator.hasNext();
}
public SAMRecordState next() {
return wrappedIterator.next();
}
public void remove() {
wrappedIterator.remove();
}
};
}
public boolean isEmpty() {
return totalReadStates == 0;
}
/**
* Retrieves the total number of reads in the manager across all samples.
*
* @return Total number of reads over all samples.
*/
public int size() {
return totalReadStates;
}
/**
* Retrieves the total number of reads in the manager in the given sample.
*
* @param sample The sample.
* @return Total number of reads in the given sample.
*/
public int size(final String sample) {
return readStatesBySample.get(sample).size();
}
public SAMRecordState getFirst() {
for (final String sample : samples) {
PerSampleReadStateManager reads = readStatesBySample.get(sample);
if (!reads.isEmpty())
return reads.peek();
}
return null;
}
public boolean hasNext() {
return totalReadStates > 0 || iterator.hasNext();
}
public void collectPendingReads() {
if (!iterator.hasNext())
return;
if (readStates.size() == 0) {
int firstContigIndex = iterator.peek().getReferenceIndex();
int firstAlignmentStart = iterator.peek().getAlignmentStart();
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
samplePartitioner.submitRead(iterator.next());
}
} else {
// Fast fail in the case that the read is past the current position.
if (readIsPastCurrentPosition(iterator.peek()))
return;
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
samplePartitioner.submitRead(iterator.next());
}
}
samplePartitioner.doneSubmittingReads();
for (final String sample : samples) {
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
addReadsToSample(statesBySample, newReads);
}
samplePartitioner.reset();
}
/**
* Add reads with the given sample name to the given hanger entry.
*
* @param readStates The list of read states to add this collection of reads.
* @param reads Reads to add. Selected reads will be pulled from this source.
*/
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
if (reads.isEmpty())
return;
Collection<SAMRecordState> newReadStates = new LinkedList<SAMRecordState>();
for (SAMRecord read : reads) {
SAMRecordState state = new SAMRecordState(read);
state.stepForwardOnGenome();
newReadStates.add(state);
}
readStates.addStatesAtNextAlignmentStart(newReadStates);
}
protected class PerSampleReadStateManager implements Iterable<SAMRecordState> {
private List<LinkedList<SAMRecordState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordState>>();
private int thisSampleReadStates = 0;
private Downsampler<LinkedList<SAMRecordState>> levelingDownsampler =
performDownsampling ?
new LevelingDownsampler<LinkedList<SAMRecordState>, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
null;
public void addStatesAtNextAlignmentStart(Collection<SAMRecordState> states) {
if ( states.isEmpty() ) {
return;
}
readStatesByAlignmentStart.add(new LinkedList<SAMRecordState>(states));
thisSampleReadStates += states.size();
totalReadStates += states.size();
if ( levelingDownsampler != null ) {
levelingDownsampler.submit(readStatesByAlignmentStart);
levelingDownsampler.signalEndOfInput();
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
// use returned List directly rather than make a copy, for efficiency's sake
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
levelingDownsampler.reset();
}
}
public boolean isEmpty() {
return readStatesByAlignmentStart.isEmpty();
}
public SAMRecordState peek() {
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
}
public int size() {
return thisSampleReadStates;
}
public Iterator<SAMRecordState> iterator() {
return new Iterator<SAMRecordState>() {
private Iterator<LinkedList<SAMRecordState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
private LinkedList<SAMRecordState> currentPositionReadStates = null;
private Iterator<SAMRecordState> currentPositionReadStatesIterator = null;
public boolean hasNext() {
return alignmentStartIterator.hasNext() ||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
}
public SAMRecordState next() {
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
currentPositionReadStates = alignmentStartIterator.next();
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
}
return currentPositionReadStatesIterator.next();
}
public void remove() {
currentPositionReadStatesIterator.remove();
thisSampleReadStates--;
totalReadStates--;
if ( currentPositionReadStates.isEmpty() ) {
alignmentStartIterator.remove();
}
}
};
}
}
/**
* Generic place to put per-base filters appropriate to LocusIteratorByState
*
* @param rec
* @param pos
* @return
*/
private boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
return ReadUtils.isBaseInsideAdaptor(rec, pos);
}
/**
* Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler.
* Create a LIBSDownsamplingInfo object from the requested info in ReadProperties
*
* Note: stores reads by sample ID string, not by sample object
* LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
* downsampling to coverage by sample. SAMDataSource will have refrained from applying
* any downsamplers to the read stream in this case, in the expectation that LIBS will
* manage the downsampling. The reason for this is twofold: performance (don't have to
* split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
* of reads (eg., using half of a read, and throwing the rest away).
*
* @param readInfo GATK engine information about what should be done to the reads
* @return a LIBS specific info holder about downsampling only
*/
private class SamplePartitioner {
private Map<String, Downsampler<SAMRecord>> readsBySample;
private static LIBSDownsamplingInfo toDownsamplingInfo(final ReadProperties readInfo) {
final boolean performDownsampling = readInfo.getDownsamplingMethod() != null &&
readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
readInfo.getDownsamplingMethod().toCoverage != null;
final int coverage = performDownsampling ? readInfo.getDownsamplingMethod().toCoverage : 0;
public SamplePartitioner( boolean downsampleReads ) {
readsBySample = new HashMap<String, Downsampler<SAMRecord>>();
for ( String sample : samples ) {
readsBySample.put(sample,
downsampleReads ? new ReservoirDownsampler<SAMRecord>(readInfo.getDownsamplingMethod().toCoverage) :
new PassThroughDownsampler<SAMRecord>());
}
}
public void submitRead(SAMRecord read) {
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
if (readsBySample.containsKey(sampleName))
readsBySample.get(sampleName).submit(read);
}
public void doneSubmittingReads() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().signalEndOfInput();
}
}
public Collection<SAMRecord> getReadsForSample(String sampleName) {
if ( ! readsBySample.containsKey(sampleName) )
throw new NoSuchElementException("Sample name not found");
return readsBySample.get(sampleName).consumeFinalizedItems();
}
public void reset() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().clear();
perSampleReads.getValue().reset();
}
}
return new LIBSDownsamplingInfo(performDownsampling, coverage);
}
}

View File

@ -0,0 +1,343 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
/**
* Manages and updates mapping from sample -> List of SAMRecordAlignmentState
*
* Optionally can keep track of all of the reads pulled off the iterator and
* that appeared at any point in the list of SAMRecordAlignmentState for any reads.
* This functionaly is only possible at this stage, as this object does the popping of
* reads off the underlying source iterator, and presents only a pileup-like interface
* of samples -> SAMRecordAlignmentStates. Reconstructing the unique set of reads
* used across all pileups is extremely expensive from that data structure.
*
* User: depristo
* Date: 1/5/13
* Time: 2:02 PM
*/
class ReadStateManager {
private final List<String> samples;
private final PeekableIterator<SAMRecord> iterator;
private final SamplePartitioner samplePartitioner;
private final Map<String, PerSampleReadStateManager> readStatesBySample = new HashMap<String, PerSampleReadStateManager>();
private LinkedList<SAMRecord> submittedReads;
private final boolean keepSubmittedReads;
private int totalReadStates = 0;
public ReadStateManager(final Iterator<SAMRecord> source,
final List<String> samples,
final LIBSDownsamplingInfo LIBSDownsamplingInfo,
final boolean keepSubmittedReads) {
this.samples = samples;
this.iterator = new PeekableIterator<SAMRecord>(source);
this.keepSubmittedReads = keepSubmittedReads;
this.submittedReads = new LinkedList<SAMRecord>();
for (final String sample : samples) {
readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
}
samplePartitioner = new SamplePartitioner(LIBSDownsamplingInfo, samples);
}
/**
* Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
* for this iterator; if present, total read states will be decremented.
*
* @param sample The sample.
* @return Iterator over the reads associated with that sample.
*/
public Iterator<SAMRecordAlignmentState> iterator(final String sample) {
return new Iterator<SAMRecordAlignmentState>() {
private Iterator<SAMRecordAlignmentState> wrappedIterator = readStatesBySample.get(sample).iterator();
public boolean hasNext() {
return wrappedIterator.hasNext();
}
public SAMRecordAlignmentState next() {
return wrappedIterator.next();
}
public void remove() {
wrappedIterator.remove();
}
};
}
public boolean isEmpty() {
return totalReadStates == 0;
}
/**
* Retrieves the total number of reads in the manager across all samples.
*
* @return Total number of reads over all samples.
*/
public int size() {
return totalReadStates;
}
/**
* Retrieves the total number of reads in the manager in the given sample.
*
* @param sample The sample.
* @return Total number of reads in the given sample.
*/
public int size(final String sample) {
return readStatesBySample.get(sample).size();
}
public SAMRecordAlignmentState getFirst() {
for (final String sample : samples) {
PerSampleReadStateManager reads = readStatesBySample.get(sample);
if (!reads.isEmpty())
return reads.peek();
}
return null;
}
public boolean hasNext() {
return totalReadStates > 0 || iterator.hasNext();
}
// fast testing of position
private boolean readIsPastCurrentPosition(SAMRecord read) {
if (isEmpty())
return false;
else {
SAMRecordAlignmentState state = getFirst();
SAMRecord ourRead = state.getRead();
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
}
}
public void collectPendingReads() {
if (!iterator.hasNext())
return;
// the next record in the stream, peeked as to not remove it from the stream
if ( isEmpty() ) {
final int firstContigIndex = iterator.peek().getReferenceIndex();
final int firstAlignmentStart = iterator.peek().getAlignmentStart();
while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
submitRead(iterator.next());
}
} else {
// Fast fail in the case that the read is past the current position.
if (readIsPastCurrentPosition(iterator.peek()))
return;
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
submitRead(iterator.next());
}
}
samplePartitioner.doneSubmittingReads();
for (final String sample : samples) {
Collection<SAMRecord> newReads = samplePartitioner.getReadsForSample(sample);
PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
addReadsToSample(statesBySample, newReads);
}
samplePartitioner.reset();
}
/**
* Add a read to the sample partitioner, potentially adding it to all submitted reads, if appropriate
* @param read a non-null read
*/
@Requires("read != null")
protected void submitRead(final SAMRecord read) {
if ( keepSubmittedReads )
submittedReads.add(read);
samplePartitioner.submitRead(read);
}
/**
* Transfer current list of submitted reads, clearing old list
*
* Takes the maintained list of submitted reads, and transfers it to the caller of this
* function. The old list of set to a new, cleanly allocated list so the caller officially
* owns the list returned by this call. This is the only way to clear the tracking
* of submitted reads, if enabled.
*
* How to use this function:
*
* while ( doing some work unit, such as creating pileup at some locus ):
* interact with ReadStateManager in some way to make work unit
* readsUsedInPileup = transferSubmittedReads)
*
* @throws UnsupportedOperationException if called when keepingSubmittedReads is false
*
* @return the current list of submitted reads
*/
@Ensures({
"result != null",
"result != submittedReads" // result and previous submitted reads are not == objects
})
public List<SAMRecord> transferSubmittedReads() {
if ( ! keepSubmittedReads ) throw new UnsupportedOperationException("cannot transferSubmittedReads if you aren't keeping them");
final List<SAMRecord> prevSubmittedReads = submittedReads;
this.submittedReads = new LinkedList<SAMRecord>();
return prevSubmittedReads;
}
/**
* Obtain a pointer to the list of submitted reads.
*
* This is not a copy of the list; it is shared with this ReadStateManager. It should
* not be modified. Updates to this ReadStateManager may change the contains of the
* list entirely.
*
* For testing purposes only.
*
* Will always be empty if we are are not keepingSubmittedReads
*
* @return a non-null list of reads that have been submitted to this ReadStateManager
*/
@Ensures({"result != null","keepingSubmittedReads || result.isEmpty()"})
protected List<SAMRecord> getSubmittedReads() {
return submittedReads;
}
/**
* Add reads with the given sample name to the given hanger entry.
*
* @param readStates The list of read states to add this collection of reads.
* @param reads Reads to add. Selected reads will be pulled from this source.
*/
private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection<SAMRecord> reads) {
if (reads.isEmpty())
return;
Collection<SAMRecordAlignmentState> newReadStates = new LinkedList<SAMRecordAlignmentState>();
for (SAMRecord read : reads) {
SAMRecordAlignmentState state = new SAMRecordAlignmentState(read);
state.stepForwardOnGenome();
newReadStates.add(state);
}
readStates.addStatesAtNextAlignmentStart(newReadStates);
}
protected class PerSampleReadStateManager implements Iterable<SAMRecordAlignmentState> {
private List<LinkedList<SAMRecordAlignmentState>> readStatesByAlignmentStart = new LinkedList<LinkedList<SAMRecordAlignmentState>>();
private final Downsampler<LinkedList<SAMRecordAlignmentState>> levelingDownsampler;
private int thisSampleReadStates = 0;
public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling()
? new LevelingDownsampler<LinkedList<SAMRecordAlignmentState>, SAMRecordAlignmentState>(LIBSDownsamplingInfo.getToCoverage())
: null;
}
public void addStatesAtNextAlignmentStart(Collection<SAMRecordAlignmentState> states) {
if ( states.isEmpty() ) {
return;
}
readStatesByAlignmentStart.add(new LinkedList<SAMRecordAlignmentState>(states));
thisSampleReadStates += states.size();
totalReadStates += states.size();
if ( levelingDownsampler != null ) {
levelingDownsampler.submit(readStatesByAlignmentStart);
levelingDownsampler.signalEndOfInput();
thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
// use returned List directly rather than make a copy, for efficiency's sake
readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
levelingDownsampler.reset();
}
}
public boolean isEmpty() {
return readStatesByAlignmentStart.isEmpty();
}
public SAMRecordAlignmentState peek() {
return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
}
public int size() {
return thisSampleReadStates;
}
public Iterator<SAMRecordAlignmentState> iterator() {
return new Iterator<SAMRecordAlignmentState>() {
private Iterator<LinkedList<SAMRecordAlignmentState>> alignmentStartIterator = readStatesByAlignmentStart.iterator();
private LinkedList<SAMRecordAlignmentState> currentPositionReadStates = null;
private Iterator<SAMRecordAlignmentState> currentPositionReadStatesIterator = null;
public boolean hasNext() {
return alignmentStartIterator.hasNext() ||
(currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
}
public SAMRecordAlignmentState next() {
if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
currentPositionReadStates = alignmentStartIterator.next();
currentPositionReadStatesIterator = currentPositionReadStates.iterator();
}
return currentPositionReadStatesIterator.next();
}
public void remove() {
currentPositionReadStatesIterator.remove();
thisSampleReadStates--;
totalReadStates--;
if ( currentPositionReadStates.isEmpty() ) {
alignmentStartIterator.remove();
}
}
};
}
}
}

View File

@ -0,0 +1,205 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Steps a single read along its alignment to the genome
*
* The logical model for generating extended events is as follows: the "record state"
* implements the traversal along the reference; thus stepForwardOnGenome() returns
* on every and only on actual reference bases. This can be a (mis)match or a deletion
* (in the latter case, we still return on every individual reference base the deletion spans).
* In the extended events mode, the record state also remembers if there was an insertion, or
* if the deletion just started *right before* the current reference base the record state is
* pointing to upon the return from stepForwardOnGenome(). The next call to stepForwardOnGenome()
* will clear that memory (as we remember only extended events immediately preceding
* the current reference base).
*
* User: depristo
* Date: 1/5/13
* Time: 1:08 PM
*/
class SAMRecordAlignmentState {
// TODO -- one idea to clean up this functionality:
// TODO --
// TODO -- split functionality here into an alignment state machine and an
// TODO -- alignment state. The alignment state simply carries with it the
// TODO -- state of the alignment (the current cigar op, the genome offset,
// TODO -- the read offset, etc. The AlignmentStateMachine produces these
// TODO -- states, and has operations such stepForwardOnGenome, getLastState(),
// TODO -- getCurrentState(), getNextState();
/**
* Our read
*/
private final SAMRecord read;
private final Cigar cigar;
private final int nCigarElements;
/**
* how far are we offset from the start of the read bases?
*/
int readOffset = -1;
/**
* how far are we offset from the alignment start on the genome?
*/
int genomeOffset = -1;
int cigarOffset = -1;
CigarElement curElement = null;
/**
* how far are we into a single cigarElement?
*/
int cigarElementCounter = -1;
@Requires("read != null")
// TODO -- should enforce contracts like the read is aligned, etc
public SAMRecordAlignmentState(final SAMRecord read) {
this.read = read;
this.cigar = read.getCigar();
this.nCigarElements = cigar.numCigarElements();
}
public SAMRecord getRead() {
return read;
}
/**
* What is our current offset in the read's bases that aligns us with the reference genome?
*
* @return the current read offset position
*/
public int getReadOffset() {
return readOffset;
}
/**
* What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
*
* @return the current offset
*/
public int getGenomeOffset() {
return genomeOffset;
}
public int getGenomePosition() {
return read.getAlignmentStart() + getGenomeOffset();
}
public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
}
public CigarOperator getCurrentCigarOperator() {
return curElement.getOperator();
}
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
public CigarElement peekForwardOnGenome() {
return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
}
public CigarElement peekBackwardOnGenome() {
return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
}
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base,
// or last base of an insertion
if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
cigarOffset++;
if (cigarOffset < nCigarElements) {
curElement = cigar.getCigarElement(cigarOffset);
cigarElementCounter = 0;
// next line: guards against cigar elements of length 0; when new cigar element is retrieved,
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
if (curElement != null && curElement.getOperator() == CigarOperator.D)
throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// Reads that contain indels model the genomeOffset as the following base in the reference. Because
// we fall into this else block only when indels end the read, increment genomeOffset such that the
// current offset of this read is the next ref base after the end of the indel. This position will
// model a point on the reference somewhere after the end of the read.
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
return null;
}
}
boolean done = false;
switch (curElement.getOperator()) {
case H: // ignore hard clips
case P: // ignore pads
cigarElementCounter = curElement.getLength();
break;
case I: // insertion w.r.t. the reference
case S: // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
case D: // deletion w.r.t. the reference
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
// should be the same as N case
genomeOffset++;
done = true;
break;
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
case M:
case EQ:
case X:
readOffset++;
genomeOffset++;
done = true;
break;
default:
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
return done ? curElement.getOperator() : stepForwardOnGenome();
}
}

View File

@ -0,0 +1,81 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.downsampling.Downsampler;
import org.broadinstitute.sting.gatk.downsampling.PassThroughDownsampler;
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
import java.util.*;
/**
* Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler.
*
* Note: stores reads by sample ID string, not by sample object
*/
class SamplePartitioner {
private Map<String, Downsampler<SAMRecord>> readsBySample;
public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List<String> samples) {
readsBySample = new HashMap<String, Downsampler<SAMRecord>>(samples.size());
for ( String sample : samples ) {
readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo));
}
}
private Downsampler<SAMRecord> createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) {
return LIBSDownsamplingInfo.isPerformDownsampling()
? new ReservoirDownsampler<SAMRecord>(LIBSDownsamplingInfo.getToCoverage())
: new PassThroughDownsampler<SAMRecord>();
}
public void submitRead(SAMRecord read) {
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
if (readsBySample.containsKey(sampleName))
readsBySample.get(sampleName).submit(read);
}
public void doneSubmittingReads() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().signalEndOfInput();
}
}
public Collection<SAMRecord> getReadsForSample(String sampleName) {
if ( ! readsBySample.containsKey(sampleName) )
throw new NoSuchElementException("Sample name not found");
return readsBySample.get(sampleName).consumeFinalizedItems();
}
public void reset() {
for ( Map.Entry<String, Downsampler<SAMRecord>> perSampleReads : readsBySample.entrySet() ) {
perSampleReads.getValue().clear();
perSampleReads.getValue().reset();
}
}
}

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
package org.broadinstitute.sting.utils.locusiterator.legacy;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.Cigar;
@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.LegacyReservoirDownsampler;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;

View File

@ -326,6 +326,34 @@ public class ArtificialSAMUtils {
return stack;
}
/**
* Create a read stream based on the parameters. The cigar string for each
* read will be *M, where * is the length of the read.
*
* Useful for testing things like LocusIteratorBystate
*
* @return a collection of stackSize reads all sharing the above properties
*/
public static List<SAMRecord> createReadStream( final int nReadsPerLocus,
final int nLoci,
final SAMFileHeader header,
final int alignmentStart,
final int length ) {
final String name = "readName";
List<SAMRecord> reads = new ArrayList<SAMRecord>(nReadsPerLocus*nLoci);
for ( int locus = 0; locus < nLoci; locus++ ) {
for ( int readI = 0; readI < nReadsPerLocus; readI++ ) {
for ( final SAMReadGroupRecord rg : header.getReadGroups() ) {
final GATKSAMRecord read = createArtificialRead(header, name, 0, alignmentStart, length);
read.setReadGroup(new GATKSAMReadGroupRecord(rg));
reads.add(read);
}
}
}
return reads;
}
/**
* create an iterator containing the specified read piles
*

View File

@ -36,7 +36,7 @@ import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.utils.locusiterator.LegacyLocusIteratorByState;
import org.broadinstitute.sting.utils.locusiterator.legacy.LegacyLocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.qc.CountLoci;
import org.broadinstitute.sting.utils.GenomeLocParser;

View File

@ -0,0 +1,144 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 1/5/13
* Time: 8:42 PM
* To change this template use File | Settings | File Templates.
*/
public final class LIBS_position {
SAMRecord read;
final int numOperators;
int currentOperatorIndex = 0;
int currentPositionOnOperator = 0;
int currentReadOffset = 0;
boolean isBeforeDeletionStart = false;
boolean isBeforeDeletedBase = false;
boolean isAfterDeletionEnd = false;
boolean isAfterDeletedBase = false;
boolean isBeforeInsertion = false;
boolean isAfterInsertion = false;
boolean isNextToSoftClip = false;
boolean sawMop = false;
public LIBS_position(final SAMRecord read) {
this.read = read;
numOperators = read.getCigar().numCigarElements();
}
public int getCurrentReadOffset() {
return Math.max(0, currentReadOffset - 1);
}
/**
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
*/
public boolean stepForwardOnGenome() {
if ( currentOperatorIndex == numOperators )
return false;
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
if ( currentPositionOnOperator >= curElement.getLength() ) {
if ( ++currentOperatorIndex == numOperators )
return false;
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
currentPositionOnOperator = 0;
}
switch ( curElement.getOperator() ) {
case I: // insertion w.r.t. the reference
// if ( !sawMop )
// break;
case S: // soft clip
currentReadOffset += curElement.getLength();
case H: // hard clip
case P: // padding
currentOperatorIndex++;
return stepForwardOnGenome();
case D: // deletion w.r.t. the reference
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
currentPositionOnOperator++;
break;
case M:
case EQ:
case X:
sawMop = true;
currentReadOffset++;
currentPositionOnOperator++;
break;
default:
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
}
final boolean isFirstOp = currentOperatorIndex == 0;
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
return true;
}
private static boolean isBeforeOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isLastOp,
final boolean isLastBaseOfOp) {
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
}
private static boolean isAfterOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isFirstOp,
final boolean isFirstBaseOfOp) {
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
}
}

View File

@ -1,531 +0,0 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* testing of the LEGACY version of LocusIteratorByState
*/
public class LegacyLocusIteratorByStateUnitTest extends BaseTest {
private static SAMFileHeader header;
private LegacyLocusIteratorByState li;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
private LegacyLocusIteratorByState makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
return new LegacyLocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()), readAttributes, genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
}
@Test
public void testXandEQOperators() {
final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'};
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10);
r1.setReadBases(bases1);
r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r1.setCigarString("10M");
SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10);
r2.setReadBases(bases2);
r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r2.setCigarString("3=1X5=1X");
SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10);
r3.setReadBases(bases2);
r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
r3.setCigarString("3=1X5M1X");
SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10);
r4.setReadBases(bases2);
r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
r4.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(r1, r2, r3, r4);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
while (li.hasNext()) {
AlignmentContext context = li.next();
ReadBackedPileup pileup = context.getBasePileup();
Assert.assertEquals(pileup.depthOfCoverage(), 4);
}
}
@Test
public void testIndelsInRegularPileup() {
final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'};
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10);
before.setReadBases(bases);
before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
before.setCigarString("10M");
SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10);
during.setReadBases(indelBases);
during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20});
during.setCigarString("4M2I6M");
SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10);
after.setReadBases(bases);
after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20});
after.setCigarString("10M");
List<SAMRecord> reads = Arrays.asList(before, during, after);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads,readAttributes);
boolean foundIndel = false;
while (li.hasNext()) {
AlignmentContext context = li.next();
ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10);
for (PileupElement p : pileup) {
if (p.isBeforeInsertion()) {
foundIndel = true;
Assert.assertEquals(p.getEventLength(), 2, "Wrong event length");
Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect");
break;
}
}
}
Assert.assertTrue(foundIndel,"Indel in pileup not found");
}
@Test
public void testWholeIndelReadInIsolation() {
final int firstLocus = 44367789;
// create a test version of the Reads object
ReadProperties readAttributes = createTestReadProperties();
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76));
indelOnlyRead.setCigarString("76I");
List<SAMRecord> reads = Arrays.asList(indelOnlyRead);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, readAttributes);
// Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read
// and considers it to be an indel-containing read.
Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled");
AlignmentContext alignmentContext = li.next();
Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location.");
ReadBackedPileup basePileup = alignmentContext.getBasePileup();
Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size");
Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect");
}
/**
* Test to make sure that reads supporting only an indel (example cigar string: 76I) do
* not negatively influence the ordering of the pileup.
*/
@Test
public void testWholeIndelRead() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76);
leadingRead.setReadBases(Utils.dupBytes((byte)'A',76));
leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
leadingRead.setCigarString("1M75I");
SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76);
indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76));
indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76));
indelOnlyRead.setCigarString("76I");
SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76);
fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76));
fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76));
fullMatchAfterIndel.setCigarString("75I1M");
List<SAMRecord> reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
int currentLocus = firstLocus;
int numAlignmentContextsFound = 0;
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect");
if(currentLocus == firstLocus) {
List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus);
}
else if(currentLocus == secondLocus) {
List<GATKSAMRecord> readsAtLocus = alignmentContext.getBasePileup().getReads();
Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus);
Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus);
}
currentLocus++;
numAlignmentContextsFound++;
}
Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts");
}
/**
* Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly
*/
@Test
public void testWholeIndelReadRepresentedTest() {
final int firstLocus = 44367788, secondLocus = firstLocus + 1;
SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1);
read1.setReadBases(Utils.dupBytes((byte) 'A', 1));
read1.setBaseQualities(Utils.dupBytes((byte) '@', 1));
read1.setCigarString("1I");
List<SAMRecord> reads = Arrays.asList(read1);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
Assert.assertTrue(pe.isBeforeInsertion());
Assert.assertFalse(pe.isAfterInsertion());
Assert.assertEquals(pe.getEventBases(), "A");
}
SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10);
read2.setReadBases(Utils.dupBytes((byte) 'A', 10));
read2.setBaseQualities(Utils.dupBytes((byte) '@', 10));
read2.setCigarString("10I");
reads = Arrays.asList(read2);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(reads, createTestReadProperties());
while(li.hasNext()) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
Assert.assertTrue(pe.isBeforeInsertion());
Assert.assertFalse(pe.isAfterInsertion());
Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA");
}
}
////////////////////////////////////////////
// comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////
private static class LIBSTest {
final String cigar;
final int readLength;
private LIBSTest(final String cigar, final int readLength) {
this.cigar = cigar;
this.readLength = readLength;
}
}
@DataProvider(name = "LIBSTest")
public Object[][] createLIBSTestData() {
//TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings
return new Object[][]{
{new LIBSTest("1I", 1)},
{new LIBSTest("10I", 10)},
{new LIBSTest("2M2I2M", 6)},
{new LIBSTest("2M2I", 4)},
//TODO -- uncomment these when LIBS is fixed
//{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))},
//{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))},
//{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))},
//{new LIBSTest("1M2D2M", 3)},
{new LIBSTest("1S1M", 2)},
{new LIBSTest("1M1S", 2)},
{new LIBSTest("1S1M1I", 3)}
};
}
@Test(dataProvider = "LIBSTest")
public void testLIBS(LIBSTest params) {
final int locus = 44367788;
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
read.setCigarString(params.cigar);
// create the iterator by state with the fake reads and fake records
li = makeLTBS(Arrays.asList(read), createTestReadProperties());
final LIBS_position tester = new LIBS_position(read);
while ( li.hasNext() ) {
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
PileupElement pe = p.iterator().next();
tester.stepForwardOnGenome();
Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase);
Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart);
Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase);
Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd);
Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion);
Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion);
Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip);
Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset());
}
}
////////////////////////////////////////////////
// End comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////////
private static ReadProperties createTestReadProperties() {
return new ReadProperties(
Collections.<SAMReaderID>emptyList(),
new SAMFileHeader(),
SAMFileHeader.SortOrder.coordinate,
false,
SAMFileReader.ValidationStringency.STRICT,
null,
new ValidationExclusion(),
Collections.<ReadFilter>emptyList(),
Collections.<ReadTransformer>emptyList(),
false,
(byte) -1
);
}
}
class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {
iterator = it;
}
@Override
public void close() {}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public T next() {
return iterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Don't remove!");
}
}
final class LIBS_position {
SAMRecord read;
final int numOperators;
int currentOperatorIndex = 0;
int currentPositionOnOperator = 0;
int currentReadOffset = 0;
boolean isBeforeDeletionStart = false;
boolean isBeforeDeletedBase = false;
boolean isAfterDeletionEnd = false;
boolean isAfterDeletedBase = false;
boolean isBeforeInsertion = false;
boolean isAfterInsertion = false;
boolean isNextToSoftClip = false;
boolean sawMop = false;
public LIBS_position(final SAMRecord read) {
this.read = read;
numOperators = read.getCigar().numCigarElements();
}
public int getCurrentReadOffset() {
return Math.max(0, currentReadOffset - 1);
}
/**
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
*/
public boolean stepForwardOnGenome() {
if ( currentOperatorIndex == numOperators )
return false;
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
if ( currentPositionOnOperator >= curElement.getLength() ) {
if ( ++currentOperatorIndex == numOperators )
return false;
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
currentPositionOnOperator = 0;
}
switch ( curElement.getOperator() ) {
case I: // insertion w.r.t. the reference
if ( !sawMop )
break;
case S: // soft clip
currentReadOffset += curElement.getLength();
case H: // hard clip
case P: // padding
currentOperatorIndex++;
return stepForwardOnGenome();
case D: // deletion w.r.t. the reference
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
currentPositionOnOperator++;
break;
case M:
case EQ:
case X:
sawMop = true;
currentReadOffset++;
currentPositionOnOperator++;
break;
default:
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
}
final boolean isFirstOp = currentOperatorIndex == 0;
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
return true;
}
private static boolean isBeforeOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isLastOp,
final boolean isLastBaseOfOp) {
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
}
private static boolean isAfterOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isFirstOp,
final boolean isFirstBaseOfOp) {
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
}
}

View File

@ -0,0 +1,252 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* testing of the new (non-legacy) version of LocusIteratorByState
*/
public class LocusIteratorByStateBaseTest extends BaseTest {
protected static SAMFileHeader header;
protected GenomeLocParser genomeLocParser;
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
/**
* For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
* for the system.
*/
protected static List<String> sampleListForSAMWithoutReadGroups() {
List<String> samples = new ArrayList<String>();
samples.add(null);
return samples;
}
protected LocusIteratorByState makeLTBS(List<SAMRecord> reads,
ReadProperties readAttributes) {
return new LocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()),
readAttributes,
genomeLocParser,
sampleListForSAMWithoutReadGroups());
}
protected static ReadProperties createTestReadProperties() {
return createTestReadProperties(null);
}
protected static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
return new ReadProperties(
Collections.<SAMReaderID>emptyList(),
new SAMFileHeader(),
SAMFileHeader.SortOrder.coordinate,
false,
SAMFileReader.ValidationStringency.STRICT,
downsamplingMethod,
new ValidationExclusion(),
Collections.<ReadFilter>emptyList(),
Collections.<ReadTransformer>emptyList(),
false,
(byte) -1
);
}
protected static class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {
iterator = it;
}
@Override
public void close() {}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public T next() {
return iterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Don't remove!");
}
}
protected static class LIBSTest {
public static final int locus = 44367788;
final String cigar;
final int readLength;
final private List<CigarElement> elements;
public LIBSTest(final String cigar, final int readLength) {
this(null, cigar, readLength);
}
public LIBSTest(final List<CigarElement> elements, final String cigar, final int readLength) {
this.elements = elements;
this.cigar = cigar;
this.readLength = readLength;
}
@Override
public String toString() {
return "LIBSTest{" +
"cigar='" + cigar + '\'' +
", readLength=" + readLength +
'}';
}
public List<CigarElement> getElements() {
return elements;
}
public GATKSAMRecord makeRead() {
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
final byte[] quals = new byte[readLength];
for ( int i = 0; i < readLength; i++ )
quals[i] = (byte)(i % QualityUtils.MAX_QUAL_SCORE);
read.setBaseQualities(quals);
read.setCigarString(cigar);
return read;
}
}
private boolean isIndel(final CigarElement ce) {
return ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I;
}
private boolean startsWithDeletion(final List<CigarElement> elements) {
for ( final CigarElement element : elements ) {
switch ( element.getOperator() ) {
case M:
case I:
case EQ:
case X:
return false;
case D:
return true;
default:
// keep looking
}
}
return false;
}
private LIBSTest makePermutationTest(final List<CigarElement> elements) {
CigarElement last = null;
boolean hasMatch = false;
// starts with D => bad
if ( startsWithDeletion(elements) )
return null;
// ends with D => bad
if ( elements.get(elements.size()-1).getOperator() == CigarOperator.D )
return null;
// make sure it's valid
String cigar = "";
int len = 0;
for ( final CigarElement ce : elements ) {
if ( ce.getOperator() == CigarOperator.N )
return null; // TODO -- don't support N
// abort on a bad cigar
if ( last != null ) {
if ( ce.getOperator() == last.getOperator() )
return null;
if ( isIndel(ce) && isIndel(last) )
return null;
}
cigar += ce.getLength() + ce.getOperator().toString();
len += ce.getLength();
last = ce;
hasMatch = hasMatch || ce.getOperator() == CigarOperator.M;
}
if ( ! hasMatch )
return null;
return new LIBSTest(elements, cigar, len);
}
@DataProvider(name = "LIBSTest")
public Object[][] createLIBSTests(final List<Integer> cigarLengths, final List<Integer> combinations) {
final List<Object[]> tests = new LinkedList<Object[]>();
final List<CigarOperator> allOps = Arrays.asList(CigarOperator.values());
final List<CigarElement> singleCigars = new LinkedList<CigarElement>();
for ( final int len : cigarLengths )
for ( final CigarOperator op : allOps )
singleCigars.add(new CigarElement(len, op));
for ( final int complexity : combinations ) {
for ( final List<CigarElement> elements : Utils.makePermutations(singleCigars, complexity, true) ) {
final LIBSTest test = makePermutationTest(elements);
if ( test != null ) tests.add(new Object[]{test});
}
}
return tests.toArray(new Object[][]{});
}
}

View File

@ -26,25 +26,16 @@
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -53,20 +44,13 @@ import java.util.*;
/**
* testing of the new (non-legacy) version of LocusIteratorByState
*/
public class LocusIteratorByStateUnitTest extends BaseTest {
private static SAMFileHeader header;
private LocusIteratorByState li;
private GenomeLocParser genomeLocParser;
public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
}
// TODO -- REMOVE ME WHEN LIBS IS FIXED
// TODO -- CURRENT CODE DOESN'T CORRECTLY COMPUTE THINGS LIKE BEFORE DELETION, AFTER INSERTION, ETC
private final static boolean ALLOW_BROKEN_LIBS_STATE = true;
private LocusIteratorByState makeLTBS(List<SAMRecord> reads, ReadProperties readAttributes) {
return new LocusIteratorByState(new FakeCloseableIterator<SAMRecord>(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
}
protected LocusIteratorByState li;
@Test
public void testXandEQOperators() {
@ -286,53 +270,46 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
// comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////
private static class LIBSTest {
final String cigar;
final int readLength;
private LIBSTest(final String cigar, final int readLength) {
this.cigar = cigar;
this.readLength = readLength;
}
}
@DataProvider(name = "LIBSTest")
public Object[][] createLIBSTestData() {
public Object[][] makeLIBSTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
//TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings
tests.add(new Object[]{new LIBSTest("1I", 1)});
tests.add(new Object[]{new LIBSTest("10I", 10)});
tests.add(new Object[]{new LIBSTest("2M2I2M", 6)});
tests.add(new Object[]{new LIBSTest("2M2I", 4)});
//TODO -- uncomment these when LIBS is fixed
//{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))},
//{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))},
//{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))},
//{new LIBSTest("1M2D2M", 3)},
tests.add(new Object[]{new LIBSTest("1S1M", 2)});
tests.add(new Object[]{new LIBSTest("1M1S", 2)});
tests.add(new Object[]{new LIBSTest("1S1M1I", 3)});
return new Object[][]{
{new LIBSTest("1I", 1)},
{new LIBSTest("10I", 10)},
{new LIBSTest("2M2I2M", 6)},
{new LIBSTest("2M2I", 4)},
//TODO -- uncomment these when LIBS is fixed
//{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))},
//{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))},
//{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))},
//{new LIBSTest("1M2D2M", 3)},
{new LIBSTest("1S1M", 2)},
{new LIBSTest("1M1S", 2)},
{new LIBSTest("1S1M1I", 3)}
};
return tests.toArray(new Object[][]{});
// TODO -- enable combinatorial tests here when LIBS is fixed
// return createLIBSTests(
// Arrays.asList(1, 10),
// Arrays.asList(1, 2, 3));
}
@Test(dataProvider = "LIBSTest")
public void testLIBS(LIBSTest params) {
final int locus = 44367788;
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength);
read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength));
read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength));
read.setCigarString(params.cigar);
if ( params.getElements() == null || params.getElements().get(0).getOperator() == CigarOperator.I )
// TODO -- ENABLE ME WHEN LIBS IS FIXED
return;
// create the iterator by state with the fake reads and fake records
li = makeLTBS(Arrays.asList(read), createTestReadProperties());
final GATKSAMRecord read = params.makeRead();
li = makeLTBS(Arrays.asList((SAMRecord)read), createTestReadProperties());
final LIBS_position tester = new LIBS_position(read);
int bpVisited = 0;
while ( li.hasNext() ) {
bpVisited++;
AlignmentContext alignmentContext = li.next();
ReadBackedPileup p = alignmentContext.getBasePileup();
Assert.assertTrue(p.getNumberOfElements() == 1);
@ -340,336 +317,68 @@ public class LocusIteratorByStateUnitTest extends BaseTest {
tester.stepForwardOnGenome();
Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase);
Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart);
Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase);
Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd);
Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion);
Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion);
Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip);
if ( ! ALLOW_BROKEN_LIBS_STATE ) {
Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase);
Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart);
Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase);
Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd);
Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion);
Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion);
Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip);
}
Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset());
}
// min is one because always visit something, even for 10I reads
final int expectedBpToVisit = Math.max(read.getAlignmentEnd() - read.getAlignmentStart() + 1, 1);
Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp");
}
////////////////////////////////////////////////
// End comprehensive LIBS/PileupElement tests //
////////////////////////////////////////////////
// ------------------------------------------------------------
//
// Tests for keeping reads
//
// ------------------------------------------------------------
@DataProvider(name = "LIBSKeepSubmittedReads")
public Object[][] makeLIBSKeepSubmittedReads() {
final List<Object[]> tests = new LinkedList<Object[]>();
///////////////////////////////////////
// Read State Manager Tests //
///////////////////////////////////////
private class PerSampleReadStateManagerTest extends TestDataProvider {
private List<Integer> readCountsPerAlignmentStart;
private List<SAMRecord> reads;
private List<ArrayList<LocusIteratorByState.SAMRecordState>> recordStatesByAlignmentStart;
private int removalInterval;
public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
super(PerSampleReadStateManagerTest.class);
this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
this.removalInterval = removalInterval;
reads = new ArrayList<SAMRecord>();
recordStatesByAlignmentStart = new ArrayList<ArrayList<LocusIteratorByState.SAMRecordState>>();
setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
}
public void run() {
LocusIteratorByState libs = makeLTBS(new ArrayList<SAMRecord>(), createTestReadProperties());
LocusIteratorByState.ReadStateManager readStateManager =
libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
LocusIteratorByState.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
readStateManager.new PerSampleReadStateManager();
makeReads();
for ( ArrayList<LocusIteratorByState.SAMRecordState> stackRecordStates : recordStatesByAlignmentStart ) {
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
}
// read state manager should have the right number of reads
Assert.assertEquals(reads.size(), perSampleReadStateManager.size());
Iterator<SAMRecord> originalReadsIterator = reads.iterator();
Iterator<LocusIteratorByState.SAMRecordState> recordStateIterator = perSampleReadStateManager.iterator();
int recordStateCount = 0;
int numReadStatesRemoved = 0;
// Do a first-pass validation of the record state iteration by making sure we get back everything we
// put in, in the same order, doing any requested removals of read states along the way
while ( recordStateIterator.hasNext() ) {
LocusIteratorByState.SAMRecordState readState = recordStateIterator.next();
recordStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
// The read we get back should be literally the same read in memory as we put in
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
// If requested, remove a read state every removalInterval states
if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
recordStateIterator.remove();
numReadStatesRemoved++;
}
}
Assert.assertFalse(originalReadsIterator.hasNext());
// If we removed any read states, do a second pass through the read states to make sure the right
// states were removed
if ( numReadStatesRemoved > 0 ) {
Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);
originalReadsIterator = reads.iterator();
recordStateIterator = perSampleReadStateManager.iterator();
int readCount = 0;
int readStateCount = 0;
// Match record states with the reads that should remain after removal
while ( recordStateIterator.hasNext() ) {
LocusIteratorByState.SAMRecordState readState = recordStateIterator.next();
readStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
readCount++;
if ( readCount % removalInterval == 0 ) {
originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
readCount++;
for ( final int nReadsPerLocus : Arrays.asList(1, 10) ) {
for ( final int nLoci : Arrays.asList(1, 10, 100, 1000) ) {
for ( final int nSamples : Arrays.asList(1, 2, 100) ) {
for ( final boolean keepReads : Arrays.asList(true, false) ) {
tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, keepReads});
}
// The read we get back should be literally the same read in memory as we put in (after accounting for removals)
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
}
Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
}
// Allow memory used by this test to be reclaimed
readCountsPerAlignmentStart = null;
reads = null;
recordStatesByAlignmentStart = null;
}
private void makeReads() {
int alignmentStart = 1;
for ( int readsThisStack : readCountsPerAlignmentStart ) {
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
ArrayList<LocusIteratorByState.SAMRecordState> stackRecordStates = new ArrayList<LocusIteratorByState.SAMRecordState>();
for ( SAMRecord read : stackReads ) {
stackRecordStates.add(new LocusIteratorByState.SAMRecordState(read));
}
reads.addAll(stackReads);
recordStatesByAlignmentStart.add(stackRecordStates);
}
}
}
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
public Object[][] createPerSampleReadStateManagerTests() {
for ( List<Integer> thisTestReadStateCounts : Arrays.asList( Arrays.asList(1),
Arrays.asList(2),
Arrays.asList(10),
Arrays.asList(1, 1),
Arrays.asList(2, 2),
Arrays.asList(10, 10),
Arrays.asList(1, 10),
Arrays.asList(10, 1),
Arrays.asList(1, 1, 1),
Arrays.asList(2, 2, 2),
Arrays.asList(10, 10, 10),
Arrays.asList(1, 1, 1, 1, 1, 1),
Arrays.asList(10, 10, 10, 10, 10, 10),
Arrays.asList(1, 2, 10, 1, 2, 10)
) ) {
for ( int removalInterval : Arrays.asList(0, 2, 3) ) {
new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval);
}
}
return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
logger.warn("Running test: " + test);
@Test(enabled = false, dataProvider = "LIBSKeepSubmittedReads")
public void testLIBSKeepSubmittedReads(final int nReadsPerLocus, final int nLoci, final int nSamples, final boolean keepReads) {
final int readLength = 10;
test.run();
}
///////////////////////////////////////
// End Read State Manager Tests //
///////////////////////////////////////
///////////////////////////////////////
// Helper methods / classes //
///////////////////////////////////////
private static ReadProperties createTestReadProperties() {
return createTestReadProperties(null);
}
private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) {
return new ReadProperties(
Collections.<SAMReaderID>emptyList(),
new SAMFileHeader(),
SAMFileHeader.SortOrder.coordinate,
false,
SAMFileReader.ValidationStringency.STRICT,
downsamplingMethod,
new ValidationExclusion(),
Collections.<ReadFilter>emptyList(),
Collections.<ReadTransformer>emptyList(),
false,
(byte) -1
);
}
private static class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {
iterator = it;
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000);
for ( int i = 0; i < nSamples; i++ ) {
final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i);
rg.setSample("sample" + i);
rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform());
header.addReadGroup(rg);
}
@Override
public void close() {}
final List<SAMRecord> reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength);
li = makeLTBS(reads, createTestReadProperties());
@Override
public boolean hasNext() {
return iterator.hasNext();
int bpVisited = 0;
while ( li.hasNext() ) {
bpVisited++;
}
@Override
public T next() {
return iterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Don't remove!");
}
}
private static final class LIBS_position {
SAMRecord read;
final int numOperators;
int currentOperatorIndex = 0;
int currentPositionOnOperator = 0;
int currentReadOffset = 0;
boolean isBeforeDeletionStart = false;
boolean isBeforeDeletedBase = false;
boolean isAfterDeletionEnd = false;
boolean isAfterDeletedBase = false;
boolean isBeforeInsertion = false;
boolean isAfterInsertion = false;
boolean isNextToSoftClip = false;
boolean sawMop = false;
public LIBS_position(final SAMRecord read) {
this.read = read;
numOperators = read.getCigar().numCigarElements();
}
public int getCurrentReadOffset() {
return Math.max(0, currentReadOffset - 1);
}
/**
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
*/
public boolean stepForwardOnGenome() {
if ( currentOperatorIndex == numOperators )
return false;
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
if ( currentPositionOnOperator >= curElement.getLength() ) {
if ( ++currentOperatorIndex == numOperators )
return false;
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
currentPositionOnOperator = 0;
}
switch ( curElement.getOperator() ) {
case I: // insertion w.r.t. the reference
if ( !sawMop )
break;
case S: // soft clip
currentReadOffset += curElement.getLength();
case H: // hard clip
case P: // padding
currentOperatorIndex++;
return stepForwardOnGenome();
case D: // deletion w.r.t. the reference
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
currentPositionOnOperator++;
break;
case M:
case EQ:
case X:
sawMop = true;
currentReadOffset++;
currentPositionOnOperator++;
break;
default:
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
}
final boolean isFirstOp = currentOperatorIndex == 0;
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
return true;
}
private static boolean isBeforeOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isLastOp,
final boolean isLastBaseOfOp) {
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
}
private static boolean isAfterOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isFirstOp,
final boolean isFirstBaseOfOp) {
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
}
final int expectedBpToVisit = nLoci + readLength;
Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp");
}
}

View File

@ -0,0 +1,214 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* testing of the new (non-legacy) version of LocusIteratorByState
*/
public class ReadStateManagerUnitTest extends LocusIteratorByStateBaseTest {
///////////////////////////////////////
// Read State Manager Tests //
///////////////////////////////////////
private class PerSampleReadStateManagerTest extends TestDataProvider {
private List<Integer> readCountsPerAlignmentStart;
private List<SAMRecord> reads;
private List<ArrayList<SAMRecordAlignmentState>> recordStatesByAlignmentStart;
private int removalInterval;
public PerSampleReadStateManagerTest( List<Integer> readCountsPerAlignmentStart, int removalInterval ) {
super(PerSampleReadStateManagerTest.class);
this.readCountsPerAlignmentStart = readCountsPerAlignmentStart;
this.removalInterval = removalInterval;
reads = new ArrayList<SAMRecord>();
recordStatesByAlignmentStart = new ArrayList<ArrayList<SAMRecordAlignmentState>>();
setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d",
getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval));
}
public void run() {
final List<String> samples = sampleListForSAMWithoutReadGroups();
final Iterator<SAMRecord> iterator = new LinkedList<SAMRecord>().iterator();
ReadStateManager readStateManager = new ReadStateManager(iterator, samples, LIBSDownsamplingInfo.NO_DOWNSAMPLING);
ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = readStateManager.new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING);
// ReadStateManager readStateManager =
// libs.new ReadStateManager(new ArrayList<SAMRecord>().iterator());
// ReadStateManager.PerSampleReadStateManager perSampleReadStateManager =
// readStateManager.new PerSampleReadStateManager();
makeReads();
for ( ArrayList<SAMRecordAlignmentState> stackRecordStates : recordStatesByAlignmentStart ) {
perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates);
}
// read state manager should have the right number of reads
Assert.assertEquals(reads.size(), perSampleReadStateManager.size());
Iterator<SAMRecord> originalReadsIterator = reads.iterator();
Iterator<SAMRecordAlignmentState> recordStateIterator = perSampleReadStateManager.iterator();
int recordStateCount = 0;
int numReadStatesRemoved = 0;
// Do a first-pass validation of the record state iteration by making sure we get back everything we
// put in, in the same order, doing any requested removals of read states along the way
while ( recordStateIterator.hasNext() ) {
SAMRecordAlignmentState readState = recordStateIterator.next();
recordStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
// The read we get back should be literally the same read in memory as we put in
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
// If requested, remove a read state every removalInterval states
if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) {
recordStateIterator.remove();
numReadStatesRemoved++;
}
}
Assert.assertFalse(originalReadsIterator.hasNext());
// If we removed any read states, do a second pass through the read states to make sure the right
// states were removed
if ( numReadStatesRemoved > 0 ) {
Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved);
originalReadsIterator = reads.iterator();
recordStateIterator = perSampleReadStateManager.iterator();
int readCount = 0;
int readStateCount = 0;
// Match record states with the reads that should remain after removal
while ( recordStateIterator.hasNext() ) {
SAMRecordAlignmentState readState = recordStateIterator.next();
readStateCount++;
SAMRecord readFromPerSampleReadStateManager = readState.getRead();
Assert.assertTrue(originalReadsIterator.hasNext());
SAMRecord originalRead = originalReadsIterator.next();
readCount++;
if ( readCount % removalInterval == 0 ) {
originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded
readCount++;
}
// The read we get back should be literally the same read in memory as we put in (after accounting for removals)
Assert.assertTrue(originalRead == readFromPerSampleReadStateManager);
}
Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved);
}
// Allow memory used by this test to be reclaimed
readCountsPerAlignmentStart = null;
reads = null;
recordStatesByAlignmentStart = null;
}
private void makeReads() {
int alignmentStart = 1;
for ( int readsThisStack : readCountsPerAlignmentStart ) {
ArrayList<SAMRecord> stackReads = new ArrayList<SAMRecord>(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100)));
ArrayList<SAMRecordAlignmentState> stackRecordStates = new ArrayList<SAMRecordAlignmentState>();
for ( SAMRecord read : stackReads ) {
stackRecordStates.add(new SAMRecordAlignmentState(read));
}
reads.addAll(stackReads);
recordStatesByAlignmentStart.add(stackRecordStates);
}
}
}
@DataProvider(name = "PerSampleReadStateManagerTestDataProvider")
public Object[][] createPerSampleReadStateManagerTests() {
for ( List<Integer> thisTestReadStateCounts : Arrays.asList( Arrays.asList(1),
Arrays.asList(2),
Arrays.asList(10),
Arrays.asList(1, 1),
Arrays.asList(2, 2),
Arrays.asList(10, 10),
Arrays.asList(1, 10),
Arrays.asList(10, 1),
Arrays.asList(1, 1, 1),
Arrays.asList(2, 2, 2),
Arrays.asList(10, 10, 10),
Arrays.asList(1, 1, 1, 1, 1, 1),
Arrays.asList(10, 10, 10, 10, 10, 10),
Arrays.asList(1, 2, 10, 1, 2, 10)
) ) {
for ( int removalInterval : Arrays.asList(0, 2, 3) ) {
new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval);
}
}
return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class);
}
@Test(dataProvider = "PerSampleReadStateManagerTestDataProvider")
public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) {
logger.warn("Running test: " + test);
test.run();
}
}

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.locusiterator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* testing of the new (non-legacy) version of LocusIteratorByState
*/
public class SAMRecordAlignmentStateUnitTest extends LocusIteratorByStateBaseTest {
@DataProvider(name = "AlignmentStateTest")
public Object[][] makeAlignmentStateTest() {
// return new Object[][]{{new LIBSTest("1I", 1)}};
return createLIBSTests(
Arrays.asList(1, 2),
Arrays.asList(1, 2, 3, 4));
}
@Test(dataProvider = "AlignmentStateTest")
public void testAlignmentStateTest(LIBSTest params) {
final GATKSAMRecord read = params.makeRead();
final SAMRecordAlignmentState state = new SAMRecordAlignmentState(read);
final LIBS_position tester = new LIBS_position(read);
Assert.assertSame(state.getRead(), read);
Assert.assertNotNull(state.toString());
int bpVisited = 0;
int lastOffset = -1;
while ( state.stepForwardOnGenome() != null ) {
bpVisited++;
tester.stepForwardOnGenome();
Assert.assertTrue(state.getReadOffset() >= lastOffset, "Somehow read offsets are decreasing: lastOffset " + lastOffset + " current " + state.getReadOffset());
Assert.assertEquals(state.getReadOffset(), tester.getCurrentReadOffset(), "Read offsets are wrong at " + bpVisited);
// TODO -- state.peekBackwardOnGenome();
// TODO -- state.peekForwardOnGenome();
// TODO -- state.getCurrentCigarOperator()
// TODO -- state.getGenomeOffset();
// TODO -- state.getGenomePosition();
// TODO -- Assert.assertEquals(state.getLocation(genomeLocParser), EXPECTATION);
lastOffset = state.getReadOffset();
}
// min is one because always visit something, even for 10I reads
final int expectedBpToVisit = read.getAlignmentEnd() - read.getAlignmentStart() + 1;
Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp");
}
}

View File

@ -0,0 +1,160 @@
package org.broadinstitute.sting.utils.locusiterator.legacy;
import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.locusiterator.legacy.LegacyLocusIteratorByState;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
class FakeCloseableIterator<T> implements CloseableIterator<T> {
Iterator<T> iterator;
public FakeCloseableIterator(Iterator<T> it) {
iterator = it;
}
@Override
public void close() {}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public T next() {
return iterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException("Don't remove!");
}
}
final class LIBS_position {
SAMRecord read;
final int numOperators;
int currentOperatorIndex = 0;
int currentPositionOnOperator = 0;
int currentReadOffset = 0;
boolean isBeforeDeletionStart = false;
boolean isBeforeDeletedBase = false;
boolean isAfterDeletionEnd = false;
boolean isAfterDeletedBase = false;
boolean isBeforeInsertion = false;
boolean isAfterInsertion = false;
boolean isNextToSoftClip = false;
boolean sawMop = false;
public LIBS_position(final SAMRecord read) {
this.read = read;
numOperators = read.getCigar().numCigarElements();
}
public int getCurrentReadOffset() {
return Math.max(0, currentReadOffset - 1);
}
/**
* Steps forward on the genome. Returns false when done reading the read, true otherwise.
*/
public boolean stepForwardOnGenome() {
if ( currentOperatorIndex == numOperators )
return false;
CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex);
if ( currentPositionOnOperator >= curElement.getLength() ) {
if ( ++currentOperatorIndex == numOperators )
return false;
curElement = read.getCigar().getCigarElement(currentOperatorIndex);
currentPositionOnOperator = 0;
}
switch ( curElement.getOperator() ) {
case I: // insertion w.r.t. the reference
if ( !sawMop )
break;
case S: // soft clip
currentReadOffset += curElement.getLength();
case H: // hard clip
case P: // padding
currentOperatorIndex++;
return stepForwardOnGenome();
case D: // deletion w.r.t. the reference
case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
currentPositionOnOperator++;
break;
case M:
case EQ:
case X:
sawMop = true;
currentReadOffset++;
currentPositionOnOperator++;
break;
default:
throw new IllegalStateException("No support for cigar op: " + curElement.getOperator());
}
final boolean isFirstOp = currentOperatorIndex == 0;
final boolean isLastOp = currentOperatorIndex == numOperators - 1;
final boolean isFirstBaseOfOp = currentPositionOnOperator == 1;
final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength();
isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp);
isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D);
isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp);
isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D);
isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp)
|| (!sawMop && curElement.getOperator() == CigarOperator.I);
isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp);
isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp)
|| isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp);
return true;
}
private static boolean isBeforeOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isLastOp,
final boolean isLastBaseOfOp) {
return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op;
}
private static boolean isAfterOp(final Cigar cigar,
final int currentOperatorIndex,
final CigarOperator op,
final boolean isFirstOp,
final boolean isFirstBaseOfOp) {
return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op;
}
}