diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java deleted file mode 100644 index f9435ea24..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java +++ /dev/null @@ -1,99 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import java.util.ArrayList; -import java.util.List; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; - -/** Holds a single transcript annotation: refseq id, gene name, genomic locations of the locus, of the coding region - * and of all the exons. - */ -public class Transcript { - private String transcript_id; - private int strand; - private GenomeLoc transcript_interval; - private GenomeLoc transcript_coding_interval; - private List exons; - private String gene_name; - private List exon_frames; - - public Transcript() {} - - public Transcript(String line) { - parseLine(line); - } - - /** Returns id of the transcript (RefSeq NM_* id) */ - public String getTranscriptId() { return transcript_id; } - /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ - public int getStrand() { return strand; } - /** Returns transcript's full genomic interval (includes all exons with UTRs) */ - public GenomeLoc getLocation() { return transcript_interval; } - /** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */ - public GenomeLoc getCodingLocation() { return transcript_coding_interval; } - /** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */ - public String getGeneName() { return gene_name; } - /** Number of exons in this transcript */ - public int getNumExons() { return exons.size(); } - /** Genomic location of the n-th exon; throws an exception if n is out of bounds */ - public GenomeLoc getExonLocation(int n) { - if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n); - return exons.get(n); - } - /** Returns the list of all exons in this transcript, as genomic intervals */ - public List getExons() { return exons; } - - /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ - public boolean overlapsP (GenomeLoc that) { - return transcript_interval.overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. - * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, - * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. - * @see overlapsExonP() - */ - public boolean overlapsCodingP (GenomeLoc that) { - return transcript_coding_interval.overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ - public boolean overlapsExonP (GenomeLoc that) { - for ( GenomeLoc e : exons ) { - if ( e.overlapsP(that) ) return true; - } - return false; - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - public void parseLine(String line) { - String[] fields = line.split("\t"); - transcript_id = fields[1]; - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1; - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1; - else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]); - - String contig_name = fields[2]; - transcript_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5])); - transcript_coding_interval = GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7])); - gene_name = fields[12]; - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ"; - assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ"; - - exons = new ArrayList(exon_starts.length); - exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(GenomeLocParser.parseGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - exon_frames.add(Integer.decode(eframes[i])); - } - } - -} - diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java deleted file mode 100644 index d73eb9468..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodRefSeq.java +++ /dev/null @@ -1,250 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import java.io.File; -import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.Transcript; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.xReadLines; -import org.broadinstitute.sting.utils.GenomeLocParser; - -public class rodRefSeq extends BasicReferenceOrderedDatum { - - private GenomeLoc location = null; - private List records = null; - - public rodRefSeq(String name) { - super(name); -// location = GenomeLoc.parseGenomeLoc(0,0,-1); - } - - /** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will - * call it to populate the ROD at the given genomic location with the data (transcripts) it is holding - * @param name - * @param location - * @param records - */ - public rodRefSeq(String name, GenomeLoc location, List records) { - super(name); - this.location = location; - this.records = records; - } - - @Override - public GenomeLoc getLocation() { - return location; - } - - /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), - * since this rod provides its own iterator for reading underlying data files. - */ - @Override - public boolean parseLine(Object header, String[] parts) { - return false; // this rod has its own iterator - } - - /** Returns true if the current position this ROD is associated with is within the coding interval for at least - * one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it - * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a - * transcript, or internal exons that are not spliced into a given transcript. To check that a position is - * indeed within an exon but not in UTR, use isExon() && isCoding(). @see isExon(). - * @return - */ - public boolean isCoding() { - if ( records == null ) return false; - for ( Transcript t : records) { - if ( t.overlapsCodingP(location) ) return true; - } - return false; - } - - - /** Returns true if the current position this ROD is associated with is within an exon for at least - * one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding() - * @return - */ - public boolean isExon() { - if ( records == null ) return false; - for ( Transcript t : records) { - if ( t.overlapsExonP(location) ) return true; - } - return false; - } - - /** Returns all annotated transcripts overlapping with the current position as an immutable list. - * "Overlap" is defined as position being within genomic interval corresponding to the whole - * transcript start/transcript stop coordinates, thus the position can be still in a UTR, in intron, or - * in an internal exon that is actually not spliced into the specific transcript. Use isExon(), isCoding() of - * this ROD class, or query individual Transcript objects returned by this method to get more details. - * @return - */ - public List getTranscripts() { return Collections.unmodifiableList(records) ; } - - @Override - public String repl() { - throw new StingException("repl() is not implemented yet"); - } - - /** Will print the genomic location of this rod, followed by (space separated) ids of all the - * annotated transcripts overlapping with this position. - */ - @Override - public String toSimpleString() { - if ( records == null ) return new String(getName()+": "); - StringBuilder b = new StringBuilder(); - b.append(getName()); - b.append(":"); - for ( Transcript t : records ) { - b.append(' '); - b.append(t.getTranscriptId()); - } - return b.toString(); - } - - @Override - public String toString() { - return toSimpleString(); - } - - public static Iterator createIterator(String trackName, File f) throws IOException { -// System.out.println("REFSEQ ITERATOR CREATED"); - return new refSeqIterator(trackName,f); - } - -} - -class refSeqIterator implements Iterator { -// private xReadLines reader = null; - private long curr_position = 0; - private long max_position = 0; - private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in - private List records; // will keep the list of all transcripts overlapping with the current position - private PushbackIterator reader; - private String name = null; -// private long z = 0; // counter, for debugging only -// private long t = 0; // for debug timer only - - public refSeqIterator(String trackName, File f) throws IOException { - reader = new PushbackIterator( new refSeqRecordIterator(f) ); - records = new LinkedList(); - name = trackName; -// System.out.println("REFSEQ ITERATOR CONSTRUCTOR"); - } - - @Override - public boolean hasNext() { - // if we did not walk to the very end of currently loaded transcripts, then we - // definitely have next - if ( curr_position < max_position ) return true; - - // we are past currently loaded stuff; we have next if there are more lines to load: - return reader.hasNext(); - } - - @Override - public rodRefSeq next() { -// if ( z == 0 ) t = System.currentTimeMillis(); -// System.out.println("NEXT on REFSEQ ITERATOR"); - curr_position++; - if ( curr_position <= max_position ) { - - // we still have bases covered by at least one currently loaded transcript; - // we have to purge only subset of transcripts, on which we moved past the end - Iterator i = records.iterator(); - while ( i.hasNext() ) { - Transcript t = i.next(); - if ( t.getLocation().getStop() < curr_position ) { - i.remove(); // we moved past the end of transcript r, purge it forever - } - } - } else { - // ooops, we are past the end of all loaded transcripts - kill them all, - // load next transcript and fastforward current position to its start - records.clear(); - Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe - records.add( t ); - curr_contig_name = t.getLocation().getContig(); - curr_position = t.getLocation().getStart(); - max_position = t.getLocation().getStop(); - } - -// System.out.println("curr pos="+curr_position+"; max pos="+max_position); - - // 'records' only keeps those transcripts now, on which we did not reach the end yet - // (we might have reloaded records completely if it was necessary); current position is correctly set. - // lets check if we walked into additional new transcripts so we'd need to load them too: - - while ( reader.hasNext() ) { - Transcript t = reader.peek(); -// System.out.println("next transcript at "+t.getLocation()+"; curr contig is "+curr_contig_name); - int ci1 = GenomeLocParser.getContigIndex(curr_contig_name); - int ci2 = GenomeLocParser.getContigIndex( t.getLocation().getContig() ); -// System.out.println("Contigs: "+ci1+":"+ci2); - if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered"); - if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet... - if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done - t = reader.next(); // we do need next record, time to load it for real -// System.out.println("loaded next transcript"); - long stop = t.getLocation().getStop(); - if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier"); - if ( stop > max_position ) max_position = stop; - records.add(t); - } - - // 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list - // of pre-loaded records is meddled with by the clients between iterations, so we return them as unmodifiable list) - rodRefSeq rod = new rodRefSeq(name, GenomeLocParser.parseGenomeLoc(curr_contig_name,curr_position, curr_position),Collections.unmodifiableList(records)); -// System.out.println("Returning rod at "+rod.getLocation()); -// if ( (++z) % 1000000 == 0 ) { -// System.out.println(rod.getLocation()+": holding "+records.size()+ "; time per 1M ref positions: "+((double)(System.currentTimeMillis()-t)/1000.0)+" s"); -// z = 0; -// } - return rod; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - - -} - -/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns - * pre-processed input lines as RefSeqRecord objects. - */ -class refSeqRecordIterator implements Iterator { - - private xReadLines reader; - - public refSeqRecordIterator(File f) throws IOException { reader = new xReadLines(f); } - - @Override - public boolean hasNext() { - return reader.hasNext(); - } - - @Override - public Transcript next() { - Transcript t = new Transcript(); - t.parseLine( reader.next() ); - return t; - } - - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - -} - -