251 lines
9.1 KiB
Java
251 lines
9.1 KiB
Java
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.Collections;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedList;
|
|
import java.util.List;
|
|
|
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
|
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
|
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
|
import org.broadinstitute.sting.utils.StingException;
|
|
import org.broadinstitute.sting.utils.xReadLines;
|
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
|
|
public class rodRefSeq extends BasicReferenceOrderedDatum {
|
|
|
|
private GenomeLoc location = null;
|
|
private List<Transcript> records = null;
|
|
|
|
public rodRefSeq(String name) {
|
|
super(name);
|
|
// location = GenomeLoc.parseGenomeLoc(0,0,-1);
|
|
}
|
|
|
|
/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
|
|
* call it to populate the ROD at the given genomic location with the data (transcripts) it is holding
|
|
* @param name
|
|
* @param location
|
|
* @param records
|
|
*/
|
|
public rodRefSeq(String name, GenomeLoc location, List<Transcript> records) {
|
|
super(name);
|
|
this.location = location;
|
|
this.records = records;
|
|
}
|
|
|
|
@Override
|
|
public GenomeLoc getLocation() {
|
|
return location;
|
|
}
|
|
|
|
/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
|
|
* since this rod provides its own iterator for reading underlying data files.
|
|
*/
|
|
@Override
|
|
public boolean parseLine(Object header, String[] parts) {
|
|
return false; // this rod has its own iterator
|
|
}
|
|
|
|
/** Returns true if the current position this ROD is associated with is within the coding interval for at least
|
|
* one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it
|
|
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
|
|
* transcript, or internal exons that are not spliced into a given transcript. To check that a position is
|
|
* indeed within an exon but not in UTR, use isExon() && isCoding(). @see isExon().
|
|
* @return
|
|
*/
|
|
public boolean isCoding() {
|
|
if ( records == null ) return false;
|
|
for ( Transcript t : records) {
|
|
if ( t.overlapsCodingP(location) ) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
/** Returns true if the current position this ROD is associated with is within an exon for at least
|
|
* one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding()
|
|
* @return
|
|
*/
|
|
public boolean isExon() {
|
|
if ( records == null ) return false;
|
|
for ( Transcript t : records) {
|
|
if ( t.overlapsExonP(location) ) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/** Returns all annotated transcripts overlapping with the current position as an immutable list.
|
|
* "Overlap" is defined as position being within genomic interval corresponding to the whole
|
|
* transcript start/transcript stop coordinates, thus the position can be still in a UTR, in intron, or
|
|
* in an internal exon that is actually not spliced into the specific transcript. Use isExon(), isCoding() of
|
|
* this ROD class, or query individual Transcript objects returned by this method to get more details.
|
|
* @return
|
|
*/
|
|
public List<Transcript> getTranscripts() { return Collections.unmodifiableList(records) ; }
|
|
|
|
@Override
|
|
public String repl() {
|
|
throw new StingException("repl() is not implemented yet");
|
|
}
|
|
|
|
/** Will print the genomic location of this rod, followed by (space separated) ids of all the
|
|
* annotated transcripts overlapping with this position.
|
|
*/
|
|
@Override
|
|
public String toSimpleString() {
|
|
if ( records == null ) return new String(getName()+": <NULL>");
|
|
StringBuilder b = new StringBuilder();
|
|
b.append(getName());
|
|
b.append(":");
|
|
for ( Transcript t : records ) {
|
|
b.append(' ');
|
|
b.append(t.getTranscriptId());
|
|
}
|
|
return b.toString();
|
|
}
|
|
|
|
@Override
|
|
public String toString() {
|
|
return toSimpleString();
|
|
}
|
|
|
|
public static Iterator<rodRefSeq> createIterator(String trackName, File f) throws IOException {
|
|
// System.out.println("REFSEQ ITERATOR CREATED");
|
|
return new refSeqIterator(trackName,f);
|
|
}
|
|
|
|
}
|
|
|
|
class refSeqIterator implements Iterator<rodRefSeq> {
|
|
// private xReadLines reader = null;
|
|
private long curr_position = 0;
|
|
private long max_position = 0;
|
|
private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in
|
|
private List<Transcript> records; // will keep the list of all transcripts overlapping with the current position
|
|
private PushbackIterator<Transcript> reader;
|
|
private String name = null;
|
|
// private long z = 0; // counter, for debugging only
|
|
// private long t = 0; // for debug timer only
|
|
|
|
public refSeqIterator(String trackName, File f) throws IOException {
|
|
reader = new PushbackIterator<Transcript>( new refSeqRecordIterator(f) );
|
|
records = new LinkedList<Transcript>();
|
|
name = trackName;
|
|
// System.out.println("REFSEQ ITERATOR CONSTRUCTOR");
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
// if we did not walk to the very end of currently loaded transcripts, then we
|
|
// definitely have next
|
|
if ( curr_position < max_position ) return true;
|
|
|
|
// we are past currently loaded stuff; we have next if there are more lines to load:
|
|
return reader.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public rodRefSeq next() {
|
|
// if ( z == 0 ) t = System.currentTimeMillis();
|
|
// System.out.println("NEXT on REFSEQ ITERATOR");
|
|
curr_position++;
|
|
if ( curr_position <= max_position ) {
|
|
|
|
// we still have bases covered by at least one currently loaded transcript;
|
|
// we have to purge only subset of transcripts, on which we moved past the end
|
|
Iterator<Transcript> i = records.iterator();
|
|
while ( i.hasNext() ) {
|
|
Transcript t = i.next();
|
|
if ( t.getLocation().getStop() < curr_position ) {
|
|
i.remove(); // we moved past the end of transcript r, purge it forever
|
|
}
|
|
}
|
|
} else {
|
|
// ooops, we are past the end of all loaded transcripts - kill them all,
|
|
// load next transcript and fastforward current position to its start
|
|
records.clear();
|
|
Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe
|
|
records.add( t );
|
|
curr_contig_name = t.getLocation().getContig();
|
|
curr_position = t.getLocation().getStart();
|
|
max_position = t.getLocation().getStop();
|
|
}
|
|
|
|
// System.out.println("curr pos="+curr_position+"; max pos="+max_position);
|
|
|
|
// 'records' only keeps those transcripts now, on which we did not reach the end yet
|
|
// (we might have reloaded records completely if it was necessary); current position is correctly set.
|
|
// lets check if we walked into additional new transcripts so we'd need to load them too:
|
|
|
|
while ( reader.hasNext() ) {
|
|
Transcript t = reader.peek();
|
|
// System.out.println("next transcript at "+t.getLocation()+"; curr contig is "+curr_contig_name);
|
|
int ci1 = GenomeLocParser.getContigIndex(curr_contig_name);
|
|
int ci2 = GenomeLocParser.getContigIndex( t.getLocation().getContig() );
|
|
// System.out.println("Contigs: "+ci1+":"+ci2);
|
|
if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered");
|
|
if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet...
|
|
if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done
|
|
t = reader.next(); // we do need next record, time to load it for real
|
|
// System.out.println("loaded next transcript");
|
|
long stop = t.getLocation().getStop();
|
|
if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier");
|
|
if ( stop > max_position ) max_position = stop;
|
|
records.add(t);
|
|
}
|
|
|
|
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
|
|
// of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list)
|
|
rodRefSeq rod = new rodRefSeq(name, GenomeLocParser.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
|
|
// System.out.println("Returning rod at "+rod.getLocation());
|
|
// if ( (++z) % 1000000 == 0 ) {
|
|
// System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s");
|
|
// z = 0;
|
|
// }
|
|
return rod;
|
|
}
|
|
|
|
@Override
|
|
public void remove() {
|
|
throw new UnsupportedOperationException();
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns
|
|
* pre-processed input lines as RefSeqRecord objects.
|
|
*/
|
|
class refSeqRecordIterator implements Iterator<Transcript> {
|
|
|
|
private xReadLines reader;
|
|
|
|
public refSeqRecordIterator(File f) throws IOException { reader = new xReadLines(f); }
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
return reader.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public Transcript next() {
|
|
Transcript t = new Transcript();
|
|
t.parseLine( reader.next() );
|
|
return t;
|
|
}
|
|
|
|
|
|
@Override
|
|
public void remove() {
|
|
throw new UnsupportedOperationException();
|
|
}
|
|
|
|
}
|
|
|
|
|