Now that ROD system supports overlapping RODs, we do not need rodRefSeq to be too smart and read in all the overlapping records (transcripts) on its own; leave it to the generic ROD mechanism.

PARTIAL commit; new, simpler rodRefSeq will reappear in a seq. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1694 348d0f76-0448-11de-a6fe-93d51630548a
2009-09-23 18:11:16 +00:00 · 2009-09-23 18:11:16 +00:00 · 1bd4c0077c
parent aa66074a0e
commit 1bd4c0077c
2 changed files with 0 additions and 349 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java
@ -1,99 +0,0 @@
-package org.broadinstitute.sting.gatk.refdata;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-/** Holds a single transcript annotation: refseq id, gene name, genomic locations of the locus, of the coding region 
- * and of all the exons. 
- */
-public class Transcript {
-		private String transcript_id;
-		private int strand;
-		private GenomeLoc transcript_interval;
-		private GenomeLoc transcript_coding_interval;
-		private List<GenomeLoc> exons;
-		private String gene_name;
-		private List<Integer> exon_frames;
-
-		public Transcript() {}
-		
-		public Transcript(String line) {
-			parseLine(line);
-		}
-		
-		/** Returns id of the transcript (RefSeq NM_* id) */
-		public String getTranscriptId() { return transcript_id; }
-		/** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */
-		public int getStrand() { return strand; }
-		/** Returns transcript's full genomic interval (includes all exons with UTRs) */  
-		public GenomeLoc getLocation() { return transcript_interval; }
-		/** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */  
-		public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
-		/** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */
-		public String getGeneName() { return gene_name; }
-		/** Number of exons in this transcript */
-		public int getNumExons() { return exons.size(); }
-		/** Genomic location of the n-th exon; throws an exception if n is out of bounds */
-		public GenomeLoc getExonLocation(int n) {
-			if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
-			return exons.get(n);
-		}
-		/** Returns the list of all exons in this transcript, as genomic intervals */
-		public List<GenomeLoc> getExons() { return exons; }
-		
-		/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ 
-	    public boolean overlapsP (GenomeLoc that) {
-	    	return transcript_interval.overlapsP(that);
-	    }
-		
-		/** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. 
-		 * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons,
-		 * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. 
-		 * @see overlapsExonP()
-		 */ 
-	    public boolean overlapsCodingP (GenomeLoc that) {
-	    	return transcript_coding_interval.overlapsP(that);
-	    }
-
-		/** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ 
-	    public boolean overlapsExonP (GenomeLoc that) {
-	    	for ( GenomeLoc e : exons ) {
-	    		if ( e.overlapsP(that) ) return true;
-	    	}
-	    	return false;
-	    }
-
-	    /** Fills this object from a text line in RefSeq (UCSC) text dump file */
-		public void parseLine(String line) {
-			String[] fields = line.split("\t");
-			transcript_id = fields[1];
-			if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1;
-			else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1;
-			else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
-			
-			String contig_name = fields[2];
-			transcript_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
-			transcript_coding_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
-			gene_name = fields[12];
-			String[] exon_starts = fields[9].split(",");
-			String[] exon_stops = fields[10].split(",");
-			String[] eframes = fields[15].split(",");
-				
-			assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
-			assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
-			
-			exons = new ArrayList<GenomeLoc>(exon_starts.length);
-			exon_frames = new ArrayList<Integer>(eframes.length);
-			
-			for ( int i = 0 ; i < exon_starts.length  ; i++ ) {
-				exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
-				exon_frames.add(Integer.decode(eframes[i]));
-			}
-		}
-		
-}
-
--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java
@ -1,250 +0,0 @@
-package org.broadinstitute.sting.gatk.refdata;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-
-import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
-import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
-import org.broadinstitute.sting.gatk.refdata.Transcript;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.StingException;
-import org.broadinstitute.sting.utils.xReadLines;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-public class rodRefSeq extends BasicReferenceOrderedDatum {
-
-	private GenomeLoc location = null;
-	private List<Transcript> records = null;
-	
-	public rodRefSeq(String name) {
-		super(name);
-//		location = GenomeLoc.parseGenomeLoc(0,0,-1);
-	}
-
-	/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
-	 * call it to populate the ROD at the given genomic location with the data (transcripts) it is holding
-	 * @param name
-	 * @param location
-	 * @param records
-	 */
-	public rodRefSeq(String name, GenomeLoc location, List<Transcript> records) {
-		super(name);
-		this.location = location;
-		this.records = records;
-	}
-
-	@Override
-	public GenomeLoc getLocation() {
-		return location;
-	}
-
-	/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
-	 * since this rod provides its own iterator for reading underlying data files.
-	 */
-	@Override
-	public boolean parseLine(Object header, String[] parts) {
-		return false; // this rod has its own iterator
-	}
-
-	/** Returns true if the current position this ROD is associated with is within the coding interval for at least
-	 * one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it
-	 * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a 
-	 * transcript, or internal exons that are not spliced into a given transcript. To check that a position is
-	 * indeed within an exon but not in UTR, use isExon() && isCoding(). @see isExon().
-	 * @return
-	 */
-	public boolean isCoding() {
-		if ( records == null ) return false;
-		for ( Transcript t : records) {
-			if ( t.overlapsCodingP(location) ) return true;
-		}
-		return false;
-	}
-	
-
-	/** Returns true if the current position this ROD is associated with is within an exon for at least
-	 * one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding()
-	 * @return
-	 */
-	public boolean isExon() {
-		if ( records == null ) return false; 
-		for ( Transcript t : records) {
-			if ( t.overlapsExonP(location) ) return true;
-		}
-		return false;
-	}
-
-	/** Returns all annotated transcripts overlapping with the current position as an immutable list.
-	 * "Overlap" is defined as position being within genomic interval corresponding to the whole 
-	 * transcript start/transcript stop coordinates, thus the position can be still in a UTR, in intron, or
-	 * in an internal exon that is actually not spliced into the specific transcript. Use isExon(), isCoding() of
-	 * this ROD class, or query individual Transcript objects returned by this method to get more details. 
-	 * @return
-	 */
-	public List<Transcript> getTranscripts() { return Collections.unmodifiableList(records) ; }
-	
-	@Override
-	public String repl() {
-		throw new StingException("repl() is not implemented yet");
-	}
-
-	/** Will print the genomic location of this rod, followed by (space separated) ids of all the
-	 * annotated transcripts overlapping with this position. 
-	 */
-	@Override
-	public String toSimpleString() {
-		if ( records == null ) return new String(getName()+": <NULL>");
-		StringBuilder b = new StringBuilder();
-		b.append(getName());
-		b.append(":");
-		for ( Transcript t : records ) { 
-				b.append(' ');
-				b.append(t.getTranscriptId());
-		}
-		return b.toString();
-	}
-
-	@Override
-	public String toString() {
-		return toSimpleString();
-	}
-
-	public static Iterator<rodRefSeq> createIterator(String trackName, File f) throws IOException {
-//		System.out.println("REFSEQ ITERATOR CREATED");
-		return new refSeqIterator(trackName,f);
-	}
-	
-}
-
-class refSeqIterator implements Iterator<rodRefSeq> {
-//	private xReadLines reader = null;
-	private long curr_position = 0;
-	private long max_position = 0;
-	private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in
-	private List<Transcript> records; // will keep the list of all transcripts overlapping with the current position
-	private PushbackIterator<Transcript> reader;
-	private String name = null;
-//	private long z = 0; // counter, for debugging only
-//	private long t = 0; // for debug timer only
-	
-	public refSeqIterator(String trackName, File f) throws IOException {
-		reader = new PushbackIterator<Transcript>( new refSeqRecordIterator(f) );
-		records = new LinkedList<Transcript>();
-		name = trackName;
-//		System.out.println("REFSEQ ITERATOR CONSTRUCTOR");
-	}
-
-	@Override
-	public boolean hasNext() {
-		// if we did not walk to the very end of currently loaded transcripts, then we 
-		// definitely have next
-		if ( curr_position < max_position ) return true;
-		
-		// we are past currently loaded stuff; we have next if there are more lines to load:
-		return reader.hasNext();
-	}
-
-	@Override
-	public rodRefSeq next() {
-//		if ( z == 0 ) t = System.currentTimeMillis();
-//		System.out.println("NEXT on REFSEQ ITERATOR");
-		curr_position++;
-		if ( curr_position <= max_position ) { 
-		
-			// we still have bases covered by at least one currently loaded transcript;
-			// we have to purge only subset of transcripts, on which we moved past the end
-			Iterator<Transcript> i = records.iterator();
-			while ( i.hasNext() ) {
-				Transcript t = i.next();
-				if ( t.getLocation().getStop() < curr_position ) {
-					i.remove(); // we moved past the end of transcript r, purge it forever
-				}
-			}
-		} else {
-			// ooops, we are past the end of all loaded transcripts - kill them all, 
-			// load next transcript and fastforward current position to its start
-			records.clear();
-			Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe
-			records.add( t );  
-			curr_contig_name = t.getLocation().getContig();
-			curr_position = t.getLocation().getStart();
-			max_position = t.getLocation().getStop();
-		}
-		
-//		System.out.println("curr pos="+curr_position+"; max pos="+max_position);
-
-		// 'records' only keeps those transcripts now, on which we did not reach the end yet 
-		// (we might have reloaded records completely if it was necessary); current position is correctly set.
-		// lets check if we walked into additional new transcripts so we'd need to load them too:
-		
-		while ( reader.hasNext() ) {
-			Transcript t = reader.peek();
-//			System.out.println("next transcript at "+t.getLocation()+"; curr contig is "+curr_contig_name);
-			int ci1 = GenomeLocParser.getContigIndex(curr_contig_name);
-			int ci2 = GenomeLocParser.getContigIndex( t.getLocation().getContig() );
-//			System.out.println("Contigs: "+ci1+":"+ci2);
-			if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered");
-			if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet...
-			if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done
-			t = reader.next(); // we do need next record, time to load it for real
-//			System.out.println("loaded next transcript");
-			long stop = t.getLocation().getStop();
-			if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier");
-			if ( stop > max_position ) max_position = stop;
-			records.add(t);
-		}
-		
-		// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
-		// of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list)
-		rodRefSeq rod = new rodRefSeq(name, GenomeLocParser.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records));
-//		System.out.println("Returning rod at "+rod.getLocation());
-//		if ( (++z) % 1000000 == 0 ) {
-//			System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s");
-//			z = 0;
-//		}
-		return rod;
-	}
-
-	@Override
-	public void remove() {
-		throw new UnsupportedOperationException();		
-	}
-	
-	
-
-}
-
-/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns 
- * pre-processed input lines as RefSeqRecord objects.
- */
-class refSeqRecordIterator implements Iterator<Transcript> {
-	
-	private xReadLines reader;
-	
-	public refSeqRecordIterator(File f) throws IOException { reader = new xReadLines(f); }
-
-	@Override
-	public boolean hasNext() {
-		return reader.hasNext();
-	}
-
-	@Override
-	public Transcript next()  {
-		Transcript t = new Transcript();
-		t.parseLine( reader.next() );
-		return t;
-	}
-	
-
-	@Override
-	public void remove() {
-        throw new UnsupportedOperationException();		
-	}
-
-}
-
-