package org.broadinstitute.sting.playground.gatk.refdata; import java.io.File; import java.io.IOException; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.Transcript; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.xReadLines; public class rodRefSeq extends BasicReferenceOrderedDatum { private GenomeLoc location = null; private List records = null; public rodRefSeq(String name) { super(name); // location = new GenomeLoc(0,0,-1); } /** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will * call it to populate the ROD at the given genomic location with the data (transcripts) it is holding * @param name * @param location * @param records */ public rodRefSeq(String name, GenomeLoc location, List records) { super(name); this.location = location; this.records = records; } @Override public GenomeLoc getLocation() { return location; } /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), * since this rod provides its own iterator for reading underlying data files. */ @Override public boolean parseLine(Object header, String[] parts) { return false; // this rod has its own iterator } /** Returns true if the current position this ROD is associated with is within the coding interval for at least * one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a * transcript, or internal exons that are not spliced into a given transcript. @see isExon(). * @return */ public boolean isCoding() { if ( records == null ) return false; for ( Transcript t : records) { if ( t.overlapsCodingP(location) ) return true; } return false; } /** Returns true if the current position this ROD is associated with is within an exon for at least * one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding() * @return */ public boolean isExon() { if ( records == null ) return false; for ( Transcript t : records) { if ( t.overlapsExonP(location) ) return true; } return false; } @Override public String repl() { throw new StingException("repl() is not implemented yet"); } /** Will print the genomic location of this rod, followed by (space separated) ids of all the * annotated transcripts overlapping with this position. */ @Override public String toSimpleString() { if ( records == null ) return new String(getName()+": "); StringBuilder b = new StringBuilder(); b.append(getName()); b.append(":"); for ( Transcript t : records ) { b.append(' '); b.append(t.getTranscriptId()); } return b.toString(); } @Override public String toString() { return toSimpleString(); } public static Iterator createIterator(String trackName, File f) throws IOException { return new refSeqIterator(trackName,f); } } class refSeqIterator implements Iterator { // private xReadLines reader = null; private long curr_position = 0; private long max_position = 0; private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in private List records; // will keep the list of all transcripts overlapping with the current position private PushbackIterator reader; private String name = null; public refSeqIterator(String trackName, File f) throws IOException { reader = new PushbackIterator( new refSeqRecordIterator(f) ); records = new LinkedList(); name = trackName; } @Override public boolean hasNext() { // if we did not walk to the very end of currently loaded transcripts, then we // definitely have next if ( curr_position < max_position ) return true; // we are past currently loaded stuff; we have next if there are more lines to load: return reader.hasNext(); } @Override public rodRefSeq next() { curr_position++; if ( curr_position <= max_position ) { // we still have bases covered by at least one currently loaded transcript; // we have to purge only subset of transcripts, on which we moved past the end Iterator i = records.iterator(); while ( i.hasNext() ) { Transcript t = i.next(); if ( t.getLocation().getStop() < curr_position ) { i.remove(); // we moved past the end of transcript r, purge it forever } } } else { // ooops, we are past the end of all loaded transcripts - kill them all, // load next transcript and fastforward current position to its start records.clear(); Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe records.add( t ); curr_contig_name = t.getLocation().getContig(); curr_position = t.getLocation().getStart(); max_position = t.getLocation().getStop(); } // 'records' only keeps those transcripts now, on which we did not reach the end yet // (we might have reloaded records completely if it was necessary); current position is correctly set. // lets check if we walked into additional new transcripts so we'd need to load them too: while ( reader.hasNext() ) { Transcript t = reader.peek(); int ci1 = GenomeLoc.getContigIndex(curr_contig_name); int ci2 = GenomeLoc.getContigIndex( t.getLocation().getContig() ); if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered"); if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet... if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done t = reader.next(); // we do need next record, time to load it for real long stop = t.getLocation().getStop(); if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier"); if ( stop > max_position ) max_position = stop; records.add(t); } // 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list // of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list) rodRefSeq rod = new rodRefSeq(name,new GenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records)); return rod; } @Override public void remove() { throw new UnsupportedOperationException(); } } /** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns * pre-processed input lines as RefSeqRecord objects. */ class refSeqRecordIterator implements Iterator { private xReadLines reader; public refSeqRecordIterator(File f) throws IOException { reader = new xReadLines(f); } @Override public boolean hasNext() { return reader.hasNext(); } @Override public Transcript next() { Transcript t = new Transcript(); t.parseLine( reader.next() ); return t; } @Override public void remove() { throw new UnsupportedOperationException(); } }