gatk-3.8/java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java

281 lines
11 KiB
Java

package edu.mit.broad.picard.sam;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.NotPrimarySkippingIterator;
import edu.mit.broad.picard.directed.GenomeMask;
import java.util.*;
/**
* Iterator that traverses a SAM File, accumulating information on a per-locus basis
*/
public class SamLocusIterator implements Iterable<SamLocusIterator.LocusInfo>, CloseableIterator<SamLocusIterator.LocusInfo> {
/**
 * One step of the iteration: a reference locus together with the base, quality and
 * strand flag contributed by each read that was accumulated at that locus.
 */
public static class LocusInfo {
    /** Index of the reference sequence this locus lies on. */
    protected final int sequenceIndex;
    /** Position of the locus on that reference sequence. */
    protected final int position;
    /** Read bases seen at this locus, one entry per accumulated read. */
    protected final List<Byte> bases = new ArrayList<Byte>(100);
    /** Base qualities, parallel to {@link #bases}. */
    protected final List<Byte> qualities = new ArrayList<Byte>(100);
    /** Strand flags, parallel to {@link #bases}. */
    protected final List<Boolean> negativeStrandFlags = new ArrayList<Boolean>(100);

    /**
     * Creates an empty locus (no reads accumulated yet).
     *
     * @param sequenceIndex index of the reference sequence
     * @param position      position on that sequence
     */
    LocusInfo(final int sequenceIndex, final int position) {
        this.sequenceIndex = sequenceIndex;
        this.position = position;
    }

    /**
     * Records what a single read contributes at this locus. The three values are
     * appended to the three parallel lists.
     *
     * @param readBase    the base the read shows at this locus
     * @param baseQuality the quality of that base
     * @param strand      the strand flag of the read (the iterator passes the read's
     *                    negative-strand flag here)
     */
    public void add(final Byte readBase, final Byte baseQuality, final boolean strand) {
        bases.add(readBase);
        qualities.add(baseQuality);
        negativeStrandFlags.add(strand);
    }

    /** @return index of the reference sequence this locus lies on */
    public int getSequenceIndex() {
        return sequenceIndex;
    }

    /** @return position of this locus on its reference sequence */
    public int getPosition() {
        return position;
    }

    /** @return the live internal list of accumulated bases (not a copy) */
    public List<Byte> getBases() {
        return bases;
    }

    /** @return the live internal list of accumulated base qualities (not a copy) */
    public List<Byte> getQualities() {
        return qualities;
    }

    /** @return the live internal list of accumulated strand flags (not a copy) */
    public List<Boolean> getNegativeStrandFlags() {
        return negativeStrandFlags;
    }

    /** @return the accumulated bases as text, one character per base; empty if there are none */
    public String getBasesAsString() {
        return bytesToString(bases);
    }

    /**
     * Turns a list of bytes into a string, mapping every byte to the character with
     * the same unsigned value (0-255).
     *
     * @param data bytes to convert; may be null or empty
     * @return the resulting text, or the empty string for null/empty input
     */
    private static String bytesToString(final List<Byte> data) {
        if (data == null || data.isEmpty()) {
            return "";
        }
        final StringBuilder text = new StringBuilder(data.size());
        for (final Byte value : data) {
            // mask to 0..255 so that negative bytes do not sign-extend into the char
            text.append((char) (value & 0xFF));
        }
        return text.toString();
    }
}
// The iterator handed to the constructor; kept so that close() can close it.
private final CloseableIterator<SAMRecord> underlyingIterator;
// Wrapper around the same iterator through which records are actually read
// (hasCurrent / getCurrent / advance). Presumably it hides non-primary alignments,
// judging by its name -- confirm against NotPrimarySkippingIterator.
private final NotPrimarySkippingIterator it;
// Loci that are finished and waiting to be handed out by next(), in order.
private final LinkedList<LocusInfo> complete = new LinkedList<LocusInfo>();
// Loci that may still receive coverage from upcoming reads; consecutive positions,
// lowest position first.
private final LinkedList<LocusInfo> accumulator = new LinkedList<LocusInfo>();
// When false, reads flagged as failing the vendor quality check are skipped.
private boolean includeNonPfReads = false;
// When false, reads flagged as duplicates are skipped.
private boolean includeDuplicates = false;
// Bases whose quality is below this value are not accumulated. The default is so low
// that no base is filtered out.
private int qualityScoreCutoff = -Integer.MAX_VALUE;
// Optional mask. When set, only loci inside the mask are emitted, and marked positions
// that received no coverage are emitted as empty loci.
private GenomeMask mask;
// Contig index and position of the last locus that was processed; together they act as
// the cursor into the mask when filling in zero-coverage positions.
private int lastContig = 0;
private int lastPosition = 0;
// Set once a record with CIGAR "*" is encountered; no further records are read after that.
private boolean finishedAlignedReads = false;
/**
 * Creates a locus iterator over the given SAM records.
 *
 * @param samIterator source of SAM records; it is closed by {@link #close()}
 */
// this should probably take a SAM
public SamLocusIterator(final CloseableIterator<SAMRecord> samIterator) {
this.underlyingIterator = samIterator;
this.it = new NotPrimarySkippingIterator(samIterator);
}
/**
 * Returns this object itself as the iterator, which allows it to be used in a for-each loop.
 * No new iterator is created: every call hands back the same instance in its current state.
 *
 * @return this iterator
 */
public Iterator<LocusInfo> iterator() {
return this;
}
/**
 * Closes the SAM record iterator that was passed to the constructor.
 */
public void close() {
this.underlyingIterator.close();
}
/**
 * Tells whether another record should still be read from the SAM iterator.
 *
 * @return false once the first unaligned read has been seen, otherwise whether the
 *         wrapped iterator currently holds a record
 */
private boolean samHasMore() {
    if (finishedAlignedReads) {
        return false;
    }
    return it.hasCurrent();
}
/**
 * Reports whether there may be more loci to return: finished loci are queued, loci are
 * still being accumulated, more SAM records can be read, or the genome mask still has
 * marked positions left.
 *
 * @return true if any of those four sources is non-empty
 */
public boolean hasNext() {
    // anything already buffered counts first, so the SAM iterator and mask are not consulted
    if (!complete.isEmpty() || !accumulator.isEmpty()) {
        return true;
    }
    return samHasMore() || hasRemainingMaskBases();
}
/**
 * Reports whether the genome mask still contains marked positions after the last locus
 * that was processed (tracked by lastContig / lastPosition).
 *
 * @return false when no mask is set or no marked position remains, true otherwise
 */
private boolean hasRemainingMaskBases() {
    if (mask == null) {
        return false;
    }
    final int maxContig = mask.getMaxContig();
    // if there are more contigs in the mask, by definition some of them must have marked bases
    if (lastContig < maxContig) {
        return true;
    }
    // already past the last contig of the mask: nothing can be left
    if (lastContig > maxContig) {
        return false;
    }
    // on the last contig there is more only if another position is marked after the
    // current one; a contig without a mask entry has no marked positions at all
    return mask.get(lastContig) != null
            && mask.get(lastContig).nextSetBit(lastPosition + 1) != -1;
}
/** The only CIGAR shape handled so far: a single ungapped, unclipped block of matches. */
private static final java.util.regex.Pattern UNGAPPED_CIGAR = java.util.regex.Pattern.compile("[0-9]+M");

/**
 * Returns the next locus.
 *
 * Records are read from the SAM iterator and their bases are accumulated per position until
 * at least one locus is finished. Once the records run out, everything accumulated is flushed.
 * After that, if a genome mask is set, the remaining marked positions are returned one at a
 * time as loci without any coverage.
 *
 * NOTE(review): the accumulation assumes that records arrive ordered by contig and alignment
 * start -- confirm with the callers.
 *
 * @return the next locus; {@code null} when a genome mask is set and neither covered loci nor
 *         further marked positions remain
 * @throws UnsupportedOperationException if a record that passes the filters has a CIGAR other
 *         than a single match block (clipped or gapped alignments are not handled yet)
 * @throws NoSuchElementException if nothing is left to return and no genome mask is set
 */
public LocusInfo next() {
    // if we don't have any completed entries to return, try and make some!
    while (complete.isEmpty() && samHasMore()) {
        final SAMRecord rec = it.getCurrent();
        final String cigar = rec.getCigarString();

        // as soon as we hit our first non-aligned read, we can stop!
        if (cigar.equals("*")) {
            this.finishedAlignedReads = true;
            continue;
        }

        // skip dupe reads, if so requested
        if (!isIncludeDuplicates() && rec.getDuplicateReadFlag()) {
            it.advance();
            continue;
        }

        // skip non-PF reads, if so requested
        if (!isIncludeNonPfReads() && rec.getReadFailsVendorQualityCheckFlag()) {
            it.advance();
            continue;
        }

        // check that it's a non-gapped alignment for now!
        // TODO: handle gapped and clipped alignments
        // Fail with an exception rather than terminating the JVM, and do so before any
        // state is touched for this record.
        if (!UNGAPPED_CIGAR.matcher(cigar).matches()) {
            throw new UnsupportedOperationException(
                    "Cannot deal with clipped or gapped alignments. CIGAR=" + cigar);
        }

        final int referenceIndex = rec.getReferenceIndex();
        final int alignmentStart = rec.getAlignmentStart();

        // when we switch contigs, emit everything in the accumulator
        if (!accumulator.isEmpty() && accumulator.getFirst().sequenceIndex != referenceIndex) {
            while (!accumulator.isEmpty()) {
                popLocus();
            }
        }

        // pop off the loci that cannot receive any more coverage
        while (!accumulator.isEmpty() && accumulator.getFirst().position < alignmentStart) {
            popLocus();
        }

        // at this point, either the list is empty or the head should
        // be the same position as the first base of the read
        final byte[] readBases = rec.getReadBases();
        final byte[] baseQualities = rec.getBaseQualities();
        final boolean negativeStrand = rec.getReadNegativeStrandFlag();
        final int cutoff = getQualityScoreCutoff();

        // Walk the accumulator in step with the read (a list iterator avoids the repeated
        // index lookups on the linked list) and extend it where the read reaches further.
        final ListIterator<LocusInfo> loci = accumulator.listIterator();
        for (int j = 0; j < readBases.length; j++) {
            final LocusInfo locus;
            if (loci.hasNext()) {
                locus = loci.next();
            } else {
                // if the position is empty, initialize it
                locus = new LocusInfo(referenceIndex, alignmentStart + j);
                loci.add(locus);
            }
            // if the quality score cutoff is met, accumulate the base info
            if (baseQualities[j] >= cutoff) {
                locus.add(readBases[j], baseQualities[j], negativeStrand);
            }
        }
        it.advance();
    }

    // if we have nothing to return to the user, and we're at the end of the SAM iterator,
    // push everything into the complete queue
    if (complete.isEmpty() && !samHasMore()) {
        while (!accumulator.isEmpty()) {
            popLocus();
        }
    }

    // if there are completed entries, return those
    if (!complete.isEmpty()) {
        return complete.removeFirst();
    }

    // Without a mask there is nothing that could still be produced.
    if (mask == null) {
        throw new NoSuchElementException("No more loci: the SAM records are exhausted and no genome mask is set");
    }

    // In this case... we're past the last read from SAM so see if we can
    // fill out any more (zero coverage) entries from the mask
    LocusInfo zeroResult = null;
    while (zeroResult == null && lastContig <= mask.getMaxContig()) {
        // a contig without a mask entry has no marked positions
        final int nextbit = mask.get(lastContig) == null
                ? -1
                : mask.get(lastContig).nextSetBit(lastPosition + 1);
        if (nextbit == -1) {
            // try the next contig
            lastContig++;
            lastPosition = 0;
        } else {
            lastPosition = nextbit;
            zeroResult = new LocusInfo(lastContig, lastPosition);
        }
    }
    return zeroResult;
}
/**
 * Moves the head of the accumulator to the complete queue.
 *
 * When a genome mask is set, the marked positions lying between the previously popped locus
 * and this one are queued first as loci without coverage, and the popped locus itself is
 * queued only if the mask contains it. Without a mask the locus is always queued.
 * Afterwards lastContig / lastPosition point at the popped locus.
 */
private void popLocus() {
    final LocusInfo head = accumulator.removeFirst();
    final int headContig = head.getSequenceIndex();
    final int headPosition = head.getPosition();

    // Catch up contig by contig: emit whatever is still marked on each earlier contig,
    // then move to the start of the following one.
    while (lastContig < headContig) {
        if (mask != null && mask.get(lastContig) != null) {
            int marked = mask.get(lastContig).nextSetBit(lastPosition + 1);
            while (marked > -1) {
                complete.addLast(new LocusInfo(lastContig, marked));
                lastPosition = marked;
                marked = mask.get(lastContig).nextSetBit(lastPosition + 1);
            }
        }
        lastPosition = 0;
        lastContig++;
    }

    // Same contig now: emit the marked positions strictly before the popped locus.
    if (mask != null && mask.get(lastContig) != null) {
        while (lastPosition + 1 < headPosition) {
            final int marked = mask.get(lastContig).nextSetBit(lastPosition + 1);
            // stop when the mask has nothing more, or nothing before the popped locus
            if (marked == -1 || marked >= headPosition) {
                break;
            }
            complete.addLast(new LocusInfo(lastContig, marked));
            lastPosition = marked;
        }
    }

    // The popped locus is emitted only if the mask contains it (or there is no mask).
    if (mask == null || mask.get(headContig, headPosition)) {
        complete.addLast(head);
    }

    lastContig = headContig;
    lastPosition = headPosition;
}
/**
 * Always fails: removing elements is not supported by this iterator.
 *
 * @throws UnsupportedOperationException on every call
 */
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
// --------------------------------------------------------------------------------------------
// Configuration accessors
// --------------------------------------------------------------------------------------------

/**
 * Sets the genome mask used to restrict and fill in the emitted loci.
 *
 * @param mask the mask to use; {@code null} means no mask
 */
public void setGenomeMask(final GenomeMask mask) {
    this.mask = mask;
}

/** @return the genome mask currently in use, or {@code null} if none is set */
public GenomeMask getGenomeMask() {
    return this.mask;
}

/** @return whether reads failing the vendor quality check are included */
public boolean isIncludeNonPfReads() {
    return includeNonPfReads;
}

/**
 * @param includeNonPfReads true to include reads failing the vendor quality check,
 *                          false to skip them
 */
public void setIncludeNonPfReads(final boolean includeNonPfReads) {
    this.includeNonPfReads = includeNonPfReads;
}

/** @return whether reads flagged as duplicates are included */
public boolean isIncludeDuplicates() {
    return includeDuplicates;
}

/**
 * @param includeDuplicates true to include reads flagged as duplicates, false to skip them
 */
public void setIncludeDuplicates(final boolean includeDuplicates) {
    this.includeDuplicates = includeDuplicates;
}

/** @return the minimum base quality a base needs in order to be accumulated */
public int getQualityScoreCutoff() {
    return qualityScoreCutoff;
}

/**
 * @param qualityScoreCutoff minimum base quality; bases below it are not accumulated
 */
public void setQualityScoreCutoff(final int qualityScoreCutoff) {
    this.qualityScoreCutoff = qualityScoreCutoff;
}
}