changed to use factored out Transcript class; some docs added (not much)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@836 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ae0bac5696
commit
5b310e48f5
|
|
@ -2,15 +2,14 @@ package org.broadinstitute.sting.playground.gatk.refdata;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.zip.DataFormatException;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.BasicReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
import org.broadinstitute.sting.utils.xReadLines;
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
|
|
@ -18,14 +17,20 @@ import org.broadinstitute.sting.utils.xReadLines;
|
||||||
public class rodRefSeq extends BasicReferenceOrderedDatum {
|
public class rodRefSeq extends BasicReferenceOrderedDatum {
|
||||||
|
|
||||||
private GenomeLoc location = null;
|
private GenomeLoc location = null;
|
||||||
private List<RefSeqRecord> records = null;
|
private List<Transcript> records = null;
|
||||||
|
|
||||||
public rodRefSeq(String name) {
|
public rodRefSeq(String name) {
|
||||||
super(name);
|
super(name);
|
||||||
// location = new GenomeLoc(0,0,-1);
|
// location = new GenomeLoc(0,0,-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public rodRefSeq(String name, GenomeLoc location, List<RefSeqRecord> records) {
|
/** Despite this constructor is public, it is meant primarily for the internal use; RefSeq iterator will
|
||||||
|
* call it to populate the ROD at the given genomic location with the data (transcripts) it is holding
|
||||||
|
* @param name
|
||||||
|
* @param location
|
||||||
|
* @param records
|
||||||
|
*/
|
||||||
|
public rodRefSeq(String name, GenomeLoc location, List<Transcript> records) {
|
||||||
super(name);
|
super(name);
|
||||||
this.location = location;
|
this.location = location;
|
||||||
this.records = records;
|
this.records = records;
|
||||||
|
|
@ -36,28 +41,37 @@ public class rodRefSeq extends BasicReferenceOrderedDatum {
|
||||||
return location;
|
return location;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false),
|
||||||
|
* since this rod provides its own iterator for reading underlying data files.
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean parseLine(Object header, String[] parts) {
|
public boolean parseLine(Object header, String[] parts) {
|
||||||
return false; // this rod has its own iterator
|
return false; // this rod has its own iterator
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns true if the current position this ROD is associated with is within the coding interval for at least
|
||||||
|
* one of the annotated transcripts. NOTE: "coding" interval is defined as a single genomic interval, so it
|
||||||
|
* does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a
|
||||||
|
* transcript, or internal exons that are not spliced into a given transcript. @see isExon().
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
public boolean isCoding() {
|
public boolean isCoding() {
|
||||||
if ( records == null ) return false;
|
if ( records == null ) return false;
|
||||||
for ( RefSeqRecord r : records) {
|
for ( Transcript t : records) {
|
||||||
if (location.getStart() >= r.getCodingLocation().getStart() && location.getStart() <= r.getCodingLocation().getStop() ) return true;
|
if ( t.overlapsCodingP(location) ) return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Returns true if the current position this ROD is associated with is within an exon for at least
|
||||||
|
* one of the annotated transcripts. NOTE: position can be still within a UTR, see @isCoding()
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
public boolean isExon() {
|
public boolean isExon() {
|
||||||
if ( records == null ) return false;
|
if ( records == null ) return false;
|
||||||
for ( RefSeqRecord r : records) {
|
for ( Transcript t : records) {
|
||||||
for ( GenomeLoc e : r.getExons() ) {
|
if ( t.overlapsExonP(location) ) return true;
|
||||||
// System.err.println("EXON: "+e);
|
|
||||||
if (location.getStart() >= e.getStart() && location.getStart() <= e.getStop() ) return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
@ -67,17 +81,18 @@ public class rodRefSeq extends BasicReferenceOrderedDatum {
|
||||||
throw new StingException("repl() is not implemented yet");
|
throw new StingException("repl() is not implemented yet");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Will print the genomic location of this rod, followed by (space separated) ids of all the
|
||||||
|
* annotated transcripts overlapping with this position.
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public String toSimpleString() {
|
public String toSimpleString() {
|
||||||
if ( records == null ) return new String(getName()+": <NULL>");
|
if ( records == null ) return new String(getName()+": <NULL>");
|
||||||
StringBuilder b = new StringBuilder();
|
StringBuilder b = new StringBuilder();
|
||||||
b.append(getName());
|
b.append(getName());
|
||||||
b.append(":");
|
b.append(":");
|
||||||
for ( RefSeqRecord r : records ) {
|
for ( Transcript t : records ) {
|
||||||
b.append(' ');
|
b.append(' ');
|
||||||
b.append(r.getTranscriptId());
|
b.append(t.getTranscriptId());
|
||||||
// b.append("; "+ r.getNumExons() + " exons: ");
|
|
||||||
// for ( GenomeLoc e : r.getExons() ) b.append(" " + e);
|
|
||||||
}
|
}
|
||||||
return b.toString();
|
return b.toString();
|
||||||
}
|
}
|
||||||
|
|
@ -87,8 +102,8 @@ public class rodRefSeq extends BasicReferenceOrderedDatum {
|
||||||
return toSimpleString();
|
return toSimpleString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Iterator<rodRefSeq> createIterator(String trackName, File f) throws IOException, DataFormatException {
|
public static Iterator<rodRefSeq> createIterator(String trackName, File f) throws IOException {
|
||||||
return new refSeqIterator(f, trackName);
|
return new refSeqIterator(trackName,f);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -98,13 +113,13 @@ class refSeqIterator implements Iterator<rodRefSeq> {
|
||||||
private long curr_position = 0;
|
private long curr_position = 0;
|
||||||
private long max_position = 0;
|
private long max_position = 0;
|
||||||
private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in
|
private String curr_contig_name = null; // will keep the name of the contig the iterator is currently in
|
||||||
private List<RefSeqRecord> records; // will keep the list of all transcripts overlapping with the current position
|
private List<Transcript> records; // will keep the list of all transcripts overlapping with the current position
|
||||||
private PushbackIterator<RefSeqRecord> reader;
|
private PushbackIterator<Transcript> reader;
|
||||||
private String name = null;
|
private String name = null;
|
||||||
|
|
||||||
public refSeqIterator(File f, String trackName) throws IOException {
|
public refSeqIterator(String trackName, File f) throws IOException {
|
||||||
reader = new PushbackIterator<RefSeqRecord>( new refSeqRecordIterator(f) );
|
reader = new PushbackIterator<Transcript>( new refSeqRecordIterator(f) );
|
||||||
records = new LinkedList<RefSeqRecord>();
|
records = new LinkedList<Transcript>();
|
||||||
name = trackName;
|
name = trackName;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -124,11 +139,10 @@ class refSeqIterator implements Iterator<rodRefSeq> {
|
||||||
if ( curr_position <= max_position ) {
|
if ( curr_position <= max_position ) {
|
||||||
// we still have bases covered by at least one currently loaded transcript;
|
// we still have bases covered by at least one currently loaded transcript;
|
||||||
// we have to purge only subset of transcripts, on which we moved past the end
|
// we have to purge only subset of transcripts, on which we moved past the end
|
||||||
Iterator<RefSeqRecord> i = records.iterator();
|
Iterator<Transcript> i = records.iterator();
|
||||||
while ( i.hasNext() ) {
|
while ( i.hasNext() ) {
|
||||||
RefSeqRecord r = i.next();
|
Transcript t = i.next();
|
||||||
if ( r.getLocation().getStop() < curr_position ) {
|
if ( t.getLocation().getStop() < curr_position ) {
|
||||||
System.out.println("Throwing away: " + r.toString());
|
|
||||||
i.remove(); // we moved past the end of transcript r, purge it forever
|
i.remove(); // we moved past the end of transcript r, purge it forever
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -136,29 +150,29 @@ class refSeqIterator implements Iterator<rodRefSeq> {
|
||||||
// ooops, we are past the end of all loaded transcripts - kill them all,
|
// ooops, we are past the end of all loaded transcripts - kill them all,
|
||||||
// load next transcript and fastforward current position to its start
|
// load next transcript and fastforward current position to its start
|
||||||
records.clear();
|
records.clear();
|
||||||
RefSeqRecord r = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe
|
Transcript t = reader.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe
|
||||||
records.add( r );
|
records.add( t );
|
||||||
curr_contig_name = r.getLocation().getContig();
|
curr_contig_name = t.getLocation().getContig();
|
||||||
curr_position = r.getLocation().getStart();
|
curr_position = t.getLocation().getStart();
|
||||||
max_position = r.getLocation().getStop();
|
max_position = t.getLocation().getStop();
|
||||||
}
|
}
|
||||||
|
|
||||||
// 'records' only keeps those transcripts now, on which we did not reach the end yet
|
// 'records' only keeps those transcripts now, on which we did not reach the end yet
|
||||||
// (we might have reloaded records completely if it was necessary); current position is correctly set.
|
// (we might have reloaded records completely if it was necessary); current position is correctly set.
|
||||||
// lets check if we walked into new transcripts so we'd need to load them too:
|
// lets check if we walked into additional new transcripts so we'd need to load them too:
|
||||||
|
|
||||||
while ( reader.hasNext() ) {
|
while ( reader.hasNext() ) {
|
||||||
RefSeqRecord r = reader.peek();
|
Transcript t = reader.peek();
|
||||||
int ci1 = GenomeLoc.getContigIndex(curr_contig_name);
|
int ci1 = GenomeLoc.getContigIndex(curr_contig_name);
|
||||||
int ci2 = GenomeLoc.getContigIndex( r.getLocation().getContig() );
|
int ci2 = GenomeLoc.getContigIndex( t.getLocation().getContig() );
|
||||||
if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered");
|
if ( ci1 > ci2 ) throw new StingException("RefSeq track seems to be not contig-ordered");
|
||||||
if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet...
|
if ( ci1 < ci2 ) break; // next transcript is on the next contig, we do not need it yet...
|
||||||
if ( r.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done
|
if ( t.getLocation().getStart() > curr_position ) break; // next transcript is on the same contig but starts after the current position; we are done
|
||||||
r = reader.next(); // we need next record, time to load it for real
|
t = reader.next(); // we do need next record, time to load it for real
|
||||||
long stop = r.getLocation().getStop();
|
long stop = t.getLocation().getStop();
|
||||||
if ( stop < curr_position ) throw new StingException("DEBUG: encuntered contig that should have been loaded earlier");
|
if ( stop < curr_position ) throw new StingException("DEBUG: encountered contig that should have been loaded earlier");
|
||||||
if ( stop > max_position ) max_position = stop;
|
if ( stop > max_position ) max_position = stop;
|
||||||
records.add(r);
|
records.add(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
|
// 'records' and current position are fully updated. We can now create new rod and return it (NOTE: this iterator will break if the list
|
||||||
|
|
@ -179,7 +193,7 @@ class refSeqIterator implements Iterator<rodRefSeq> {
|
||||||
/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns
|
/** Low-level iterator for reading refseq annotation file record by record (i.e. line by line). Returns
|
||||||
* pre-processed input lines as RefSeqRecord objects.
|
* pre-processed input lines as RefSeqRecord objects.
|
||||||
*/
|
*/
|
||||||
class refSeqRecordIterator implements Iterator<RefSeqRecord> {
|
class refSeqRecordIterator implements Iterator<Transcript> {
|
||||||
|
|
||||||
private xReadLines reader;
|
private xReadLines reader;
|
||||||
|
|
||||||
|
|
@ -191,10 +205,10 @@ class refSeqRecordIterator implements Iterator<RefSeqRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public RefSeqRecord next() {
|
public Transcript next() {
|
||||||
RefSeqRecord r = new RefSeqRecord();
|
Transcript t = new Transcript();
|
||||||
r.parseLine( reader.next() );
|
t.parseLine( reader.next() );
|
||||||
return r;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -206,61 +220,3 @@ class refSeqRecordIterator implements Iterator<RefSeqRecord> {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Holds a single transcript annotation: refseq id, gene name, genomic locations of the locus, of the coding region
|
|
||||||
* and of all the exons.
|
|
||||||
*/
|
|
||||||
class RefSeqRecord {
|
|
||||||
private String transcript_id;
|
|
||||||
private int strand;
|
|
||||||
private GenomeLoc transcript_interval;
|
|
||||||
private GenomeLoc transcript_coding_interval;
|
|
||||||
private List<GenomeLoc> exons;
|
|
||||||
private String gene_name;
|
|
||||||
private List<Integer> exon_frames;
|
|
||||||
|
|
||||||
public RefSeqRecord() {}
|
|
||||||
|
|
||||||
public RefSeqRecord(String line) throws DataFormatException {
|
|
||||||
parseLine(line);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getTranscriptId() { return transcript_id; }
|
|
||||||
public int getStrand() { return strand; }
|
|
||||||
public GenomeLoc getLocation() { return transcript_interval; }
|
|
||||||
public GenomeLoc getCodingLocation() { return transcript_coding_interval; }
|
|
||||||
public String getGeneName() { return gene_name; }
|
|
||||||
public int getNumExons() { return exons.size(); }
|
|
||||||
public GenomeLoc getExonLocation(int n) {
|
|
||||||
if ( n >= exons.size() || n < 0 ) throw new StingException("Index out-of-bounds. Transcript has " + exons.size() +" exons; requested: "+n);
|
|
||||||
return exons.get(n);
|
|
||||||
}
|
|
||||||
public List<GenomeLoc> getExons() { return exons; }
|
|
||||||
|
|
||||||
public void parseLine(String line) {
|
|
||||||
String[] fields = line.split("\t");
|
|
||||||
transcript_id = fields[1];
|
|
||||||
if ( fields[3].length()==1 && fields[3].charAt(0)=='+') strand = 1;
|
|
||||||
else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') strand = -1;
|
|
||||||
else throw new StingException("Expected strand symbol (+/-), found: "+fields[3]);
|
|
||||||
|
|
||||||
String contig_name = fields[2];
|
|
||||||
transcript_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]));
|
|
||||||
transcript_coding_interval = new GenomeLoc(contig_name, Integer.parseInt(fields[6])+1, Integer.parseInt(fields[7]));
|
|
||||||
gene_name = fields[12];
|
|
||||||
String[] exon_starts = fields[9].split(",");
|
|
||||||
String[] exon_stops = fields[10].split(",");
|
|
||||||
String[] eframes = fields[15].split(",");
|
|
||||||
|
|
||||||
assert exon_starts.length == exon_stops.length : "Data format error: numbers of exon start and stop positions differ";
|
|
||||||
assert exon_starts.length == eframes.length : "Data format error: numbers of exons and exon frameshifts differ";
|
|
||||||
|
|
||||||
exons = new ArrayList<GenomeLoc>(exon_starts.length);
|
|
||||||
exon_frames = new ArrayList<Integer>(eframes.length);
|
|
||||||
|
|
||||||
for ( int i = 0 ; i < exon_starts.length ; i++ ) {
|
|
||||||
exons.add(new GenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) );
|
|
||||||
exon_frames.add(Integer.decode(eframes[i]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue