From 7c7d4b3a95aef50d94fc2f183fbf8d890ee90576 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 12 Mar 2009 23:31:00 +0000 Subject: [PATCH] Added support for HangingLocusIterator git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@43 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/atk/LocusIteratorByHanger.java | 189 +++++++++++++++ .../sting/atk/SingleLocusIterator.java | 162 +++++++++++++ .../atk/modules/DepthOfCoverageWalker.java | 28 +++ .../broadinstitute/sting/utils/RefHanger.java | 227 ++++++++++++++++++ 4 files changed, 606 insertions(+) create mode 100755 playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java create mode 100755 playground/java/src/org/broadinstitute/sting/atk/SingleLocusIterator.java create mode 100755 playground/java/src/org/broadinstitute/sting/atk/modules/DepthOfCoverageWalker.java create mode 100755 playground/java/src/org/broadinstitute/sting/utils/RefHanger.java diff --git a/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java b/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java new file mode 100755 index 000000000..e3d56977a --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/atk/LocusIteratorByHanger.java @@ -0,0 +1,189 @@ +package org.broadinstitute.sting.atk; + +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.AlignmentBlock; +import org.broadinstitute.sting.utils.*; + +import java.util.List; +import java.util.Iterator; + +import org.broadinstitute.sting.utils.RefHanger; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByHanger extends LocusIterator { + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + private final PushbackIterator it; + + private RefHanger readHanger = new RefHanger(); + private RefHanger offsetHanger = new RefHanger(); + final int INCREMENT_SIZE = 100; + final boolean DEBUG = false; + + /** + * Useful class for forwarding on locusContext data from this iterator + */ + public class MyLocusContext implements LocusContext { + GenomeLoc loc = null; + private List reads = null; + private List offsets = null; + + private MyLocusContext(GenomeLoc loc, List reads, List offsets) { + this.loc = loc; + this.reads = reads; + this.offsets = offsets; + } + + public String getContig() { return getLocation().getContig(); } + public long getPosition() { return getLocation().getStart(); } + public GenomeLoc getLocation() { return loc; } + + public List getReads() { return reads; } + public List getOffsets() { return offsets; } + public int numReads() { return readHanger.getLeft().data.size(); } + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + public LocusIteratorByHanger(final CloseableIterator samIterator) { + this.it = new PushbackIterator(samIterator); + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + return readHanger.hasHangers() || it.hasNext(); + } + + public void printState() { + for ( int i = 0; i < readHanger.size(); i++ ) { + RefHanger.Hanger rhanger = readHanger.getHanger(i); + RefHanger.Hanger ohanger = offsetHanger.getHanger(i); + + System.out.printf(" -> %s:", rhanger.loc); + for ( int j = 0; j < rhanger.size(); j++ ) { + SAMRecord read = (SAMRecord)rhanger.get(j); + int offset = (Integer)ohanger.get(j); + System.out.printf(" %s(%d)=%s", read.getReadName(), offset, read.getReadString().charAt(offset) ); + } + System.out.printf("%n"); + + } + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public MyLocusContext next() { + if ( ! currentPositionIsFullyCovered() ) + expandWindow(INCREMENT_SIZE); + + if ( DEBUG ) { + System.out.printf("in Next:%n"); + printState(); + } + + RefHanger.Hanger rhanger = readHanger.popLeft(); + RefHanger.Hanger ohanger = offsetHanger.popLeft(); + + return new MyLocusContext(rhanger.loc, rhanger.data, ohanger.data); + } + + protected void hangRead(final SAMRecord read) { + GenomeLoc readLoc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); + //System.out.printf("Adding read %s at %d%n", read.getReadName(), read.getAlignmentStart()); + /* + for ( int i = 0; i < read.getReadLength(); i++ ) { + GenomeLoc offset = new GenomeLoc(readLoc.getContig(), readLoc.getStart() + i); + readHanger.expandingPut(offset, read); + offsetHanger.expandingPut(offset, i); + } + */ + + for ( AlignmentBlock block : read.getAlignmentBlocks() ) { + if ( DEBUG ) + System.out.printf("Processing block %s len=%d%n", block, block.getLength()); + for ( int i = 0; i < block.getLength(); i++ ) { + GenomeLoc offset = new GenomeLoc(readLoc.getContig(), block.getReferenceStart() + i); + readHanger.expandingPut(offset, read); + offsetHanger.expandingPut(offset, block.getReadStart() + i - 1); + if ( DEBUG ) + System.out.printf(" # Added %s%n", offset); + } + } + } + + private final boolean currentPositionIsFullyCovered(final GenomeLoc nextReadInStreamLoc) { + if ( readHanger.isEmpty() ) + // If the buffer is empty, we're definitely not done + return false; + + if ( nextReadInStreamLoc.compareTo(readHanger.getLeftLoc()) == 1 ) + // the next read in the stream is beyond the left most position, so it's fully covered + return true; + else + // not fully covered + return false; + } + + private final boolean currentPositionIsFullyCovered() { + final SAMRecord read = it.next(); + GenomeLoc readLoc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); + final boolean coveredP = currentPositionIsFullyCovered(readLoc); + if ( coveredP ) + it.pushback(read); + return coveredP; + } + + private final void expandWindow(final int incrementSize) { + while ( it.hasNext() ) { + if ( DEBUG ) { + System.out.printf("Expanding window%n"); + printState(); + } + + SAMRecord read = it.next(); + + GenomeLoc readLoc = new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); + if ( DEBUG ) { + System.out.printf(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n", + readHanger.size(), incrementSize, + readHanger.hasHangers() ? readHanger.getLeftLoc() : "NA", + readHanger.hasHangers() ? readHanger.getRightLoc() : "NA", + readLoc, + readHanger.hasHangers() ? readLoc.compareTo(readHanger.getLeftLoc()) : -100); + } + //if ( readHanger.size() >= incrementSize ) { + //if ( readHanger.hasHangers() && readLoc.compareTo(readHanger.getLeftLoc()) == 1) { + if ( readHanger.hasHangers() && readLoc.distance(readHanger.getLeftLoc()) >= incrementSize ) { + // We've collected up enough reads + it.pushback(read); + break; + } + else + hangRead(read); + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } +} \ No newline at end of file diff --git a/playground/java/src/org/broadinstitute/sting/atk/SingleLocusIterator.java b/playground/java/src/org/broadinstitute/sting/atk/SingleLocusIterator.java new file mode 100755 index 000000000..139f447dc --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/atk/SingleLocusIterator.java @@ -0,0 +1,162 @@ +package org.broadinstitute.sting.atk; + +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.PushbackIterator; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Predicate; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; + +import edu.mit.broad.picard.filter.SamRecordFilter; +import edu.mit.broad.picard.filter.FilteringIterator; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class SingleLocusIterator extends LocusIterator implements LocusContext { + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + private final PushbackIterator it; + private String contig = null; + private int position = -1; + private List reads = new ArrayList(100); + private List offsets = new ArrayList(100); + + public String getContig() { return contig; } + public long getPosition() { return position; } + public GenomeLoc getLocation() { return new GenomeLoc(contig, position); } + + public List getReads() { return reads; } + public List getOffsets() { return offsets; } + public int numReads() { return reads.size(); } + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + public SingleLocusIterator(final CloseableIterator samIterator) { + FilteringIterator filterIter = new FilteringIterator(samIterator, new locusStreamFilterFunc()); + this.it = new PushbackIterator(filterIter); + + } + + /** + * Class to filter out un-handle-able reads from the stream. + */ + class locusStreamFilterFunc implements SamRecordFilter { + public boolean filterOut(SAMRecord rec) { + return rec.getCigar().numCigarElements() > 1; + } + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + return it.hasNext(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public LocusContext next() { + position += 1; + + if ( position != -1 ) { + cleanReads(); + expandReads(); + } + + if ( reads.isEmpty() ) { + // the window is empty, we need to jump to the first pos of the first read in the stream + SAMRecord read = it.next(); + pushRead(read); + contig = read.getReferenceName(); + position = read.getAlignmentStart() - 1; + return next(); + } + else { + // at this point, window contains all reads covering the pos, we need to return them + // and the offsets into each read for this loci + calcOffsetsOfWindow(position); + return this; + } + } + + private void pushRead(SAMRecord read) { + //System.out.printf(" -> Adding read %s %d-%d flags %s%n", read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), Utils.readFlagsAsString(read)); + reads.add(read); + } + + class KeepReadPFunc implements Predicate { + public boolean apply(SAMRecord read) { + return position >= read.getAlignmentStart() && + position < read.getAlignmentEnd() && + read.getReferenceName().equals(contig); // should be index for efficiency + } + } + Predicate KeepReadP = new SingleLocusIterator.KeepReadPFunc(); + + private void calcOffsetsOfWindow(final int position) { + offsets.clear(); + for ( SAMRecord read : reads ) { +// def calcOffset( read ): +// offset = self.pos - read.start +// return offset +// +// offsets = map(calcOffset, self.window) + final int offset = position - read.getAlignmentStart(); + assert(offset < read.getReadLength() ); + offsets.add(offset); + //System.out.printf("offsets [%d] %s%n", read.getAlignmentStart(), offsets); + } + } + + private void cleanReads() { + // def keepReadP( read ): + // return read.chr == chr and pos >= read.start and pos <= read.end + // self.window = filter( keepReadP, self.window ) + reads = Utils.filter(KeepReadP, reads); + } + + private void expandReads() { +// for read in self.rs: +// #print 'read', read, pos +// if read.chr == chr and read.start <= pos and read.end >= pos: +// self.pushRead(read) +// else: +// self.rs.unget( read ) +// #self.rs = chain( [read], self.rs ) +// break + while ( it.hasNext() ) { + SAMRecord read = it.next(); + if ( KeepReadP.apply( read ) ) { + pushRead(read); + } + else { + it.pushback(read); + break; + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } +} diff --git a/playground/java/src/org/broadinstitute/sting/atk/modules/DepthOfCoverageWalker.java b/playground/java/src/org/broadinstitute/sting/atk/modules/DepthOfCoverageWalker.java new file mode 100755 index 000000000..0e1f45903 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/atk/modules/DepthOfCoverageWalker.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.atk.modules; + +import org.broadinstitute.sting.atk.LocusContext; +import org.broadinstitute.sting.utils.ReferenceOrderedDatum; + +import java.util.List; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public class DepthOfCoverageWalker extends BasicLociWalker { + public Integer map(List rodData, char ref, LocusContext context) { + System.out.printf("%s: %d%n", context.getLocation(), context.getReads().size() ); + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java b/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java new file mode 100755 index 000000000..01f89f0a4 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/utils/RefHanger.java @@ -0,0 +1,227 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Hanging data off the reference sequence + * + * Supports in effect the following data structure + * + * <-- reference bases: A T G C --> + * d d d d + * d d d d + * d d d d + * d d d + * d d + * d + * + * Where the little d's are data associated with each position in the reference. + * Supports adding and removing data to either side of the data structure, as well as + * randomly accessing data anywhere within window. + */ +public class RefHanger { + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + ArrayList hangers; + + // ----------------------------------------------------------------------------------------------------------------- + // + // Info structure + // + // ----------------------------------------------------------------------------------------------------------------- + public class Hanger { + public GenomeLoc loc = null; + public ArrayList data = null; + + public Hanger(GenomeLoc loc, ArrayList data) { + this.loc = loc; + this.data = data; + } + + public final ArrayList getData() { return data; } + public final int size() { return this.data.size(); } + public final T get(int i) { return this.data.get(i); } + } + + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + public RefHanger() { + hangers = new ArrayList(); + } + + protected int getLeftOffset() { return 0; } + protected int getRightOffset() { return hangers.size() - 1; } + protected int getOffset(GenomeLoc loc) { + return loc.minus(getLeftLoc()); + } + + public GenomeLoc getLeftLoc() { return hangers.get(getLeftOffset()).loc; } + public GenomeLoc getRightLoc() { return hangers.get(getRightOffset()).loc; } + + public boolean hasLocation(GenomeLoc loc) { + return ! isEmpty() && loc.isBetween(getLeftLoc(), getRightLoc()); + } + + public boolean isEmpty() { + return hangers.isEmpty(); + } + public boolean hasHangers() { + return ! isEmpty(); + } + + /** + * Pops off the left most data from the structure + * + * @return + */ + public Hanger popLeft() { + assert(hasHangers()); + return hangers.remove(0); + } + + public void dropLeft() { + popLeft(); + } + + /** + * Looks at the left most data from the structure + * + * @return + */ + public Hanger getLeft() { + assert(hasHangers()); + return getHanger(0); + } + + /** + * Returns data at offset relativePos + * + * @return + */ + public Hanger getHanger(int relativePos) { + assert( hangers.contains(relativePos) ); + return hangers.get(relativePos); + } + + /** + * Returns data at GenomeLoc + * + * @return + */ + public Hanger getHanger(GenomeLoc pos) { + return getHanger(getOffset(pos)); + } + + public int size() { + return hangers.size(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // Adding data to the left and right + // + // ----------------------------------------------------------------------------------------------------------------- + + public void pushLeft(GenomeLoc pos) { + pushLeft(pos, new ArrayList()); + } + + public void pushLeft(GenomeLoc pos, T datum) { + pushLeft(pos, new ArrayList(Arrays.asList(datum))); + } + + public void pushLeft(GenomeLoc pos, ArrayList data) { + hangers.add(0, new Hanger(pos, data)); + } + + public void pushRight(GenomeLoc pos) { + pushRight(pos, new ArrayList()); + } + + public void pushRight(GenomeLoc pos, T datum) { + pushRight(pos, new ArrayList(Arrays.asList(datum))); + } + + public void pushRight(GenomeLoc pos, ArrayList data) { + hangers.add(new Hanger(pos, data)); + } + + public boolean ensurePos(GenomeLoc pos) { + if ( hasLocation(pos) ) + return true; + else { + pushRight(pos); + return false; + } + } + + public void extendData(GenomeLoc pos, T datum) { + getHanger(pos).data.add(datum); + } + + public void addData(List positions, List dataByPos) { + assert( positions.size() == dataByPos.size() ); + + for ( int i = 0; i < positions.size(); i++ ) { + GenomeLoc pos = positions.get(i); + T datum = dataByPos.get(i); + expandingPut1(pos, datum); + } + } + + public void expandingPut1(final GenomeLoc loc, T datum) { + ensurePos(loc); + extendData(loc, datum); + } + + public void printState() { + System.out.printf("Hanger:%n"); + for ( Hanger hanger : hangers ) { + System.out.printf(" -> %s => %s:%n", hanger.loc, Utils.join("/", hanger.data) ); + } + } + + /** + * Pushes locations on the right until we reach the expected position for pos. + * + * For example, if we have chr1:1 and 2 in the hanger, and we push 4 into the hangers + * this function will add 3 -> {} to the hanger too + * + * @param pos + * @param datum + */ + public void expandingPut(GenomeLoc pos, T datum) { + //System.out.printf("expandingPut(%s, %s)%n", pos, datum); + //printState(); + if ( isEmpty() ) + // we have nothing, just push right + pushRight(pos, datum); + else { + assert( pos.compareTo(getRightLoc()) == 1 ); + + GenomeLoc nextRight = getRightLoc().nextLoc(); + while ( pos.compareTo(nextRight) == 1 ) { + //printState(); + //System.out.printf(" *** Extending %s, heading for %s%n", nextRight, pos); + ensurePos(nextRight); + nextRight = nextRight.nextLoc(); + } + + ensurePos(pos); + extendData(pos, datum); + } + //printState(); + } +} \ No newline at end of file