diff --git a/java/src/org/broadinstitute/sting/playground/gatk/refdata/rodRefSeq.java b/java/src/org/broadinstitute/sting/playground/gatk/refdata/rodRefSeq.java index 536dcca38..82bc49019 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/refdata/rodRefSeq.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/refdata/rodRefSeq.java @@ -2,15 +2,14 @@ package org.broadinstitute.sting.playground.gatk.refdata; import java.io.File; import java.io.IOException; -import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import java.util.zip.DataFormatException; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.Transcript; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.xReadLines; @@ -18,14 +17,20 @@ import org.broadinstitute.sting.utils.xReadLines; public class rodRefSeq extends BasicReferenceOrderedDatum { private GenomeLoc location = null; - private List records = null; + private List records = null; public rodRefSeq(String name) { super(name); // location = new GenomeLoc(0,0,-1); } - public rodRefSeq(String name, GenomeLoc location, List records) { + /** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will + * call it to populate the ROD at the given genomic location with the data (transcripts) it is holding + * @param name + * @param location + * @param records + */ + public rodRefSeq(String name, GenomeLoc location, List records) { super(name); this.location = location; this.records = records; @@ -36,28 +41,37 @@ public class rodRefSeq extends BasicReferenceOrderedDatum { return location; } + /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), + * since this rod provides its own iterator for reading underlying data files. + */ @Override public boolean parseLine(Object header, String[] parts) { return false; // this rod has its own iterator } - + /** Returns true if the current position this ROD is associated with is within the coding interval for at least + * one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it + * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a + * transcript, or internal exons that are not spliced into a given transcript. @see isExon(). + * @return + */ public boolean isCoding() { if ( records == null ) return false; - for ( RefSeqRecord r : records) { - if (location.getStart() >= r.getCodingLocation().getStart() && location.getStart() <= r.getCodingLocation().getStop() ) return true; + for ( Transcript t : records) { + if ( t.overlapsCodingP(location) ) return true; } return false; } + /** Returns true if the current position this ROD is associated with is within an exon for at least + * one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding() + * @return + */ public boolean isExon() { if ( records == null ) return false; - for ( RefSeqRecord r : records) { - for ( GenomeLoc e : r.getExons() ) { -// System.err.println("EXON: "+e); - if (location.getStart() >= e.getStart() && location.getStart() <= e.getStop() ) return true; - } + for ( Transcript t : records) { + if ( t.overlapsExonP(location) ) return true; } return false; } @@ -67,17 +81,18 @@ public class rodRefSeq extends BasicReferenceOrderedDatum { throw new StingException("repl() is not implemented yet"); } + /** Will print the genomic location of this rod, followed by (space separated) ids of all the + * annotated transcripts overlapping with this position. + */ @Override public String toSimpleString() { if ( records == null ) return new String(getName()+": "); StringBuilder b = new StringBuilder(); b.append(getName()); b.append(":"); - for ( RefSeqRecord r : records ) { + for ( Transcript t : records ) { b.append(' '); - b.append(r.getTranscriptId()); -// b.append("; "+ r.getNumExons() + " exons: "); -// for ( GenomeLoc e : r.getExons() ) b.append(" " + e); + b.append(t.getTranscriptId()); } return b.toString(); } @@ -87,8 +102,8 @@ public class rodRefSeq extends BasicReferenceOrderedDatum { return toSimpleString(); } - public static Iterator createIterator(String trackName, File f) throws IOException, DataFormatException { - return new refSeqIterator(f, trackName); + public static Iterator createIterator(String trackName, File f) throws IOException { + return new refSeqIterator(trackName,f); } } @@ -98,13 +113,13 @@ class refSeqIterator implements Iterator { private long curr_position = 0; private long max_position = 0; private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in - private List records; // will keep the list of all transcripts overlapping with the current position - private PushbackIterator reader; + private List records; // will keep the list of all transcripts overlapping with the current position + private PushbackIterator reader; private String name = null; - public refSeqIterator(File f, String trackName) throws IOException { - reader = new PushbackIterator( new refSeqRecordIterator(f) ); - records = new LinkedList(); + public refSeqIterator(String trackName, File f) throws IOException { + reader = new PushbackIterator( new refSeqRecordIterator(f) ); + records = new LinkedList(); name = trackName; } @@ -124,11 +139,10 @@ class refSeqIterator implements Iterator { if ( curr_position <= max_position ) { // we still have bases covered by at least one currently loaded transcript; // we have to purge only subset of transcripts, on which we moved past the end - Iterator i = records.iterator(); + Iterator i = records.iterator(); while ( i.hasNext() ) { - RefSeqRecord r = i.next(); - if ( r.getLocation().getStop() < curr_position ) { - System.out.println("Throwing away: " + r.toString()); + Transcript t = i.next(); + if ( t.getLocation().getStop() < curr_position ) { i.remove(); // we moved past the end of transcript r, purge it forever } } @@ -136,29 +150,29 @@ class refSeqIterator implements Iterator { // ooops, we are past the end of all loaded transcripts - kill them all, // load next transcript and fastforward current position to its start records.clear(); - RefSeqRecord r = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe - records.add( r ); - curr_contig_name = r.getLocation().getContig(); - curr_position = r.getLocation().getStart(); - max_position = r.getLocation().getStop(); + Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe + records.add( t ); + curr_contig_name = t.getLocation().getContig(); + curr_position = t.getLocation().getStart(); + max_position = t.getLocation().getStop(); } // 'records' only keeps those transcripts now, on which we did not reach the end yet // (we might have reloaded records completely if it was necessary); current position is correctly set. - // lets check if we walked into new transcripts so we'd need to load them too: + // lets check if we walked into additional new transcripts so we'd need to load them too: while ( reader.hasNext() ) { - RefSeqRecord r = reader.peek(); + Transcript t = reader.peek(); int ci1 = GenomeLoc.getContigIndex(curr_contig_name); - int ci2 = GenomeLoc.getContigIndex( r.getLocation().getContig() ); + int ci2 = GenomeLoc.getContigIndex( t.getLocation().getContig() ); if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered"); if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet... - if ( r.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done - r = reader.next(); // we need next record, time to load it for real - long stop = r.getLocation().getStop(); - if ( stop < curr_position ) throw new StingException("DEBUG: encuntered contig that should have been loaded earlier"); + if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done + t = reader.next(); // we do need next record, time to load it for real + long stop = t.getLocation().getStop(); + if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier"); if ( stop > max_position ) max_position = stop; - records.add(r); + records.add(t); } // 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list @@ -179,7 +193,7 @@ class refSeqIterator implements Iterator { /** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns * pre-processed input lines as RefSeqRecord objects. */ -class refSeqRecordIterator implements Iterator { +class refSeqRecordIterator implements Iterator { private xReadLines reader; @@ -191,10 +205,10 @@ class refSeqRecordIterator implements Iterator { } @Override - public RefSeqRecord next() { - RefSeqRecord r = new RefSeqRecord(); - r.parseLine( reader.next() ); - return r; + public Transcript next() { + Transcript t = new Transcript(); + t.parseLine( reader.next() ); + return t; } @@ -206,61 +220,3 @@ class refSeqRecordIterator implements Iterator { } -/** Holds a single transcript annotation: refseq id, gene name, genomic locations of the locus, of the coding region - * and of all the exons. - */ -class RefSeqRecord { - private String transcript_id; - private int strand; - private GenomeLoc transcript_interval; - private GenomeLoc transcript_coding_interval; - private List exons; - private String gene_name; - private List exon_frames; - - public RefSeqRecord() {} - - public RefSeqRecord(String line) throws DataFormatException { - parseLine(line); - } - - public String getTranscriptId() { return transcript_id; } - public int getStrand() { return strand; } - public GenomeLoc getLocation() { return transcript_interval; } - public GenomeLoc getCodingLocation() { return transcript_coding_interval; } - public String getGeneName() { return gene_name; } - public int getNumExons() { return exons.size(); } - public GenomeLoc getExonLocation(int n) { - if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n); - return exons.get(n); - } - public List getExons() { return exons; } - - public void parseLine(String line) { - String[] fields = line.split("\t"); - transcript_id = fields[1]; - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1; - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1; - else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]); - - String contig_name = fields[2]; - transcript_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); - transcript_coding_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])); - gene_name = fields[12]; - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ"; - assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ"; - - exons = new ArrayList(exon_starts.length); - exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(new GenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - } - -} \ No newline at end of file