From b76f8c4eb5ada0ab489aa2de328e9204b74eca06 Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 27 May 2009 18:18:33 +0000 Subject: [PATCH] moved from playground to gatk git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@838 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/rodRefSeq.java | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java new file mode 100644 index 000000000..82bc49019 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java @@ -0,0 +1,222 @@ +package org.broadinstitute.sting.playground.gatk.refdata; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; +import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.Transcript; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.xReadLines; + +public class rodRefSeq extends BasicReferenceOrderedDatum { + + private GenomeLoc location = null; + private List records = null; + + public rodRefSeq(String name) { + super(name); +// location = new GenomeLoc(0,0,-1); + } + + /** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will + * call it to populate the ROD at the given genomic location with the data (transcripts) it is holding + * @param name + * @param location + * @param records + */ + public rodRefSeq(String name, GenomeLoc location, List records) { + super(name); + this.location = location; + this.records = records; + } + + @Override + public GenomeLoc getLocation() { + return location; + } + + /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), + * since this rod provides its own iterator for reading underlying data files. + */ + @Override + public boolean parseLine(Object header, String[] parts) { + return false; // this rod has its own iterator + } + + /** Returns true if the current position this ROD is associated with is within the coding interval for at least + * one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it + * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a + * transcript, or internal exons that are not spliced into a given transcript. @see isExon(). + * @return + */ + public boolean isCoding() { + if ( records == null ) return false; + for ( Transcript t : records) { + if ( t.overlapsCodingP(location) ) return true; + } + return false; + } + + + /** Returns true if the current position this ROD is associated with is within an exon for at least + * one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding() + * @return + */ + public boolean isExon() { + if ( records == null ) return false; + for ( Transcript t : records) { + if ( t.overlapsExonP(location) ) return true; + } + return false; + } + + @Override + public String repl() { + throw new StingException("repl() is not implemented yet"); + } + + /** Will print the genomic location of this rod, followed by (space separated) ids of all the + * annotated transcripts overlapping with this position. + */ + @Override + public String toSimpleString() { + if ( records == null ) return new String(getName()+": "); + StringBuilder b = new StringBuilder(); + b.append(getName()); + b.append(":"); + for ( Transcript t : records ) { + b.append(' '); + b.append(t.getTranscriptId()); + } + return b.toString(); + } + + @Override + public String toString() { + return toSimpleString(); + } + + public static Iterator createIterator(String trackName, File f) throws IOException { + return new refSeqIterator(trackName,f); + } + +} + +class refSeqIterator implements Iterator { +// private xReadLines reader = null; + private long curr_position = 0; + private long max_position = 0; + private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in + private List records; // will keep the list of all transcripts overlapping with the current position + private PushbackIterator reader; + private String name = null; + + public refSeqIterator(String trackName, File f) throws IOException { + reader = new PushbackIterator( new refSeqRecordIterator(f) ); + records = new LinkedList(); + name = trackName; + } + + @Override + public boolean hasNext() { + // if we did not walk to the very end of currently loaded transcripts, then we + // definitely have next + if ( curr_position < max_position ) return true; + + // we are past currently loaded stuff; we have next if there are more lines to load: + return reader.hasNext(); + } + + @Override + public rodRefSeq next() { + curr_position++; + if ( curr_position <= max_position ) { + // we still have bases covered by at least one currently loaded transcript; + // we have to purge only subset of transcripts, on which we moved past the end + Iterator i = records.iterator(); + while ( i.hasNext() ) { + Transcript t = i.next(); + if ( t.getLocation().getStop() < curr_position ) { + i.remove(); // we moved past the end of transcript r, purge it forever + } + } + } else { + // ooops, we are past the end of all loaded transcripts - kill them all, + // load next transcript and fastforward current position to its start + records.clear(); + Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe + records.add( t ); + curr_contig_name = t.getLocation().getContig(); + curr_position = t.getLocation().getStart(); + max_position = t.getLocation().getStop(); + } + + // 'records' only keeps those transcripts now, on which we did not reach the end yet + // (we might have reloaded records completely if it was necessary); current position is correctly set. + // lets check if we walked into additional new transcripts so we'd need to load them too: + + while ( reader.hasNext() ) { + Transcript t = reader.peek(); + int ci1 = GenomeLoc.getContigIndex(curr_contig_name); + int ci2 = GenomeLoc.getContigIndex( t.getLocation().getContig() ); + if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered"); + if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet... + if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done + t = reader.next(); // we do need next record, time to load it for real + long stop = t.getLocation().getStop(); + if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier"); + if ( stop > max_position ) max_position = stop; + records.add(t); + } + + // 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list + // of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list) + rodRefSeq rod = new rodRefSeq(name,new GenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records)); + return rod; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + + +} + +/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns + * pre-processed input lines as RefSeqRecord objects. + */ +class refSeqRecordIterator implements Iterator { + + private xReadLines reader; + + public refSeqRecordIterator(File f) throws IOException { reader = new xReadLines(f); } + + @Override + public boolean hasNext() { + return reader.hasNext(); + } + + @Override + public Transcript next() { + Transcript t = new Transcript(); + t.parseLine( reader.next() ); + return t; + } + + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + +} + +