From a6bd5095937212687fd2c8de3700ea5e15af1fe7 Mon Sep 17 00:00:00 2001 From: asivache Date: Mon, 21 Sep 2009 16:55:22 +0000 Subject: [PATCH] Changing the carpet under your feet!! New incremental update to the ROD system has arrived. All the updated classes now make use of new SeekableRODIterator instead of RODIterator. RODIterator class deleted. This batch makes only trivial updates to tests dictated by the change in the ROD system interface. A few less trivial updates to follow. This is a partial commit; a few walkers also still need to be updated, hold on... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1667 348d0f76-0448-11de-a6fe-93d51630548a --- .../ManagingReferenceOrderedView.java | 8 +- .../datasources/providers/RodLocusView.java | 41 ++-- .../ReferenceOrderedDataSource.java | 18 +- .../sting/gatk/refdata/RODIterator.java | 149 ------------ .../gatk/refdata/RefMetaDataTracker.java | 98 ++++++-- .../gatk/refdata/ReferenceOrderedData.java | 225 +++++++++--------- .../sting/gatk/walkers/CountRodWalker.java | 7 +- .../ReferenceOrderedDataPoolTest.java | 33 ++- .../sting/gatk/refdata/TabularRODTest.java | 53 +++-- 9 files changed, 280 insertions(+), 352 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java index 2da895d01..d17dc06ce 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.RODIterator; +import 
org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.GenomeLoc; @@ -43,7 +43,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { public ManagingReferenceOrderedView( ShardDataProvider provider ) { //this.provider = provider; for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) - states.add( new ReferenceOrderedDataState( dataSource, (RODIterator)dataSource.seek(provider.getShard()) ) ); + states.add( new ReferenceOrderedDataState( dataSource, (SeekableRODIterator)dataSource.seek(provider.getShard()) ) ); provider.register(this); } @@ -78,9 +78,9 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView { */ private class ReferenceOrderedDataState { public final ReferenceOrderedDataSource dataSource; - public final RODIterator iterator; + public final SeekableRODIterator iterator; - public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, RODIterator iterator ) { + public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, SeekableRODIterator iterator ) { this.dataSource = dataSource; this.iterator = iterator; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java index d9483d2bc..14ead32fb 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.RODIterator; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.*; import 
org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; @@ -37,11 +35,11 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { /** * The data sources along with their current states. */ - private MergingIterator rodQueue = null; + private MergingIterator> rodQueue = null; RefMetaDataTracker tracker = null; GenomeLoc lastLoc = null; - ReferenceOrderedDatum interval = null; + RODRecordList interval = null; // broken support for multi-locus rods //List multiLocusRODs = new LinkedList(); @@ -63,11 +61,11 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { GenomeLoc loc = provider.getShard().getGenomeLoc(); - List> iterators = new LinkedList>(); + List< Iterator> > iterators = new LinkedList< Iterator> >(); for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { if ( DEBUG ) System.out.printf("Shard is %s%n", loc); - RODIterator it = (RODIterator)dataSource.seek(provider.getShard()); - ReferenceOrderedDatum x = it.seekForward(loc); + SeekableRODIterator it = (SeekableRODIterator)dataSource.seek(provider.getShard()); + RODRecordList x = it.seekForward(loc); // we need to special case the interval so we don't always think there's a rod at the first location if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { @@ -75,11 +73,11 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { throw new RuntimeException("BUG: interval local variable already assigned " + interval); interval = x; } else { - iterators.add( (Iterator)it ); + iterators.add( it ); } } - rodQueue = new MergingIterator(iterators); + rodQueue = new MergingIterator>(iterators); } public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { @@ -90,7 +88,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { if ( ! 
rodQueue.hasNext() ) return false; else { - ReferenceOrderedDatum peeked = rodQueue.peek(); + RODRecordList peeked = rodQueue.peek(); return ! peeked.getLocation().isPast(shard.getGenomeLoc()); } } @@ -102,14 +100,14 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { */ public AlignmentContext next() { if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); - ReferenceOrderedDatum datum = rodQueue.next(); + RODRecordList datum = rodQueue.next(); if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); // Update the tracker here for use - Collection allRODsHere = getSpanningRods(datum); - tracker = createTracker(allRODsHere); + Collection> allTracksHere = getSpanningTracks(datum); + tracker = createTracker(allTracksHere); GenomeLoc rodSite = datum.getLocation(); GenomeLoc site = GenomeLocParser.createGenomeLoc( rodSite.getContigIndex(), rodSite.getStart(), rodSite.getStart()); @@ -122,11 +120,11 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { return new AlignmentContext(site, new ArrayList(), new ArrayList(), skippedBases); } - private RefMetaDataTracker createTracker( Collection allRodsHere ) { + private RefMetaDataTracker createTracker( Collection> allTracksHere ) { RefMetaDataTracker t = new RefMetaDataTracker(); - for ( ReferenceOrderedDatum element : allRodsHere ) { - if ( ! t.hasROD(element.getName()) ) - t.bind(element.getName(), element); + for ( RODRecordList track : allTracksHere ) { + if ( ! 
t.hasROD(track.getName()) ) + t.bind(track.getName(), track); } // special case the interval again -- add it into the ROD @@ -135,7 +133,12 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView { return t; } - private Collection getSpanningRods(ReferenceOrderedDatum marker) { + private Collection> getSpanningTracks(ReferenceOrderedDatum marker) { + RODRecordList wrapper = new RODRecordList(marker.getName(),Collections.singletonList(marker)); + return rodQueue.allElementsLTE(wrapper); + } + + private Collection> getSpanningTracks(RODRecordList marker) { return rodQueue.allElementsLTE(marker); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index a4ce5039e..e73458fbc 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.RODIterator; +import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; @@ -59,7 +59,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * @return Iterator through the data. 
*/ public Iterator seek( Shard shard ) { - RODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(shard.getGenomeLoc()) ); + SeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(shard.getGenomeLoc()) ); return iterator; } @@ -67,7 +67,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { * Close the specified iterator, returning it to the pool. * @param iterator Iterator to close. */ - public void close( RODIterator iterator ) { + public void close( SeekableRODIterator iterator ) { this.iteratorPool.release(iterator); } @@ -76,7 +76,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { /** * A pool of reference-ordered data iterators. */ -class ReferenceOrderedDataPool extends ResourcePool { +class ReferenceOrderedDataPool extends ResourcePool { private final ReferenceOrderedData rod; public ReferenceOrderedDataPool( ReferenceOrderedData rod ) { @@ -88,7 +88,7 @@ class ReferenceOrderedDataPool extends ResourcePool { * to be completely independent of any other iterator. * @return The newly created resource. 
*/ - public RODIterator createNewResource() { + public SeekableRODIterator createNewResource() { return rod.iterator(); } @@ -99,13 +99,13 @@ class ReferenceOrderedDataPool extends ResourcePool { * @param resources @{inheritedDoc} * @return @{inheritedDoc} */ - public RODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { + public SeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { if( !(segment instanceof MappedStreamSegment) ) throw new StingException("Reference-ordered data cannot utilitize unmapped segments."); GenomeLoc position = ((MappedStreamSegment)segment).locus; - for( RODIterator iterator: resources ) { + for( SeekableRODIterator iterator: resources ) { if( (iterator.position() == null && iterator.hasNext()) || (iterator.position() != null && iterator.position().isBefore(position)) ) return iterator; @@ -116,14 +116,14 @@ class ReferenceOrderedDataPool extends ResourcePool { /** * In this case, the iterator is the resource. Pass it through. */ - public RODIterator createIteratorFromResource( DataStreamSegment segment, RODIterator resource ) { + public SeekableRODIterator createIteratorFromResource( DataStreamSegment segment, SeekableRODIterator resource ) { return resource; } /** * Don't worry about closing the resource; let the file handles expire naturally for the moment. 
*/ - public void closeResource( RODIterator resource ) { + public void closeResource( SeekableRODIterator resource ) { } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java deleted file mode 100755 index f689076af..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RODIterator.java +++ /dev/null @@ -1,149 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.Iterator; - -/** - * Adapter (decorator) class for rod iterators. The "raw" rod iterator wrapped into this class - * should be capable of reading the underlying ROD data file and iterating over successive - * genomic locations. The purpose of this adapter is to provide additional seekForward() method: - * upon a call to this method, the decorated iterator will fastforward to the specified position. - * NOTE 1: if a particular ROD data file is allowed to have multiple records (lines) - * associated with the same location, the "raw" iterator must be capable of dealing with this situation - * by loading all such records at once on a call to next(). - * NOTE 2: the object represented by this class is still a unidirectional iterator: after a call to seekForward(), - * subsequent calls to seekForward() or next() will work from the position the iterator was fastforwarded to. 
- * @author asivache - * - * @param - */ -public class RODIterator implements Iterator { - private PushbackIterator it; - private ROD current = null; - private GenomeLoc position = null; - - public RODIterator(Iterator it) { - this.it = new PushbackIterator(it); - } - - @Override - public boolean hasNext() { return it.hasNext(); } - - @Override - public ROD next() { - ROD next = it.next(); - if( next != null ) { - position = next.getLocation().clone(); - current = next; - } - return next; - } - -// @Override -// public boolean hasNext() { return current != null || it.hasNext(); } -// -// @Override -// public ROD next() { -// if ( current != null ) { -// ROD prev = current; -// current = null; -// return prev; -// } else { -// ROD next = it.next(); -// if( next != null ) { -// position = next.getLocation().clone(); -// //current = next; -// } -// -// return next; -// } -// } - - /** - * Returns the current position of this iterator. - * @return Current position of the iterator, or null if no position exists. - */ - public GenomeLoc position() { - return position; - } - - /** - * Seeks forward in the file until we reach (or cross) a record at contig / pos - * If we don't find anything and cross beyond contig / pos, we return null; - * subsequent call to next() will return the first record located after the specified - * position in this case. Otherwise, the first ROD record at or overlapping with - * the specified position is returned; the subsequent call to next() will return the - * next ROD record. - * - * NOTE 1: the location object loc should be a single point (not an interval); - * ROD locations, however, can be extended intervals, in which case first ROD that overlaps the specified - * position will be returned. - * - * NOTE 2: seekForward() is not exactly like next(): if we are strictly past a record, seekForward will not - * see it, but it will be returning the "current" record (i.e. 
the record returned by last call to next() or - * seekForward()) over and over again and will NOT advance the iterator for as long as the current record's location - * overlaps with the query position. - * - * @param loc point-like genomic location to fastforward to. - * @return ROD object at (or overlapping with) the specified position, or null if no such ROD exists. - */ - public ROD seekForward(final GenomeLoc loc) { - final boolean DEBUG = false; - - ROD result = null; - - //if (current != null && current.getName().equals("interval")) { - // boolean contains = current.getLocation().containsP(loc); - // System.out.printf(" %s : current is %s, seeking to %s, contains %b%n", current.getName(), current.getLocation(), loc, contains); - //} - - if ( current != null && current.getLocation().containsP(loc) ) - return current; - - if ( DEBUG ) System.out.printf(" *** starting seek to %s %d (contig %d) from current location %s %d%n", loc.getContig(), loc.getStart(), - loc.getContigIndex(),current==null?"null":current.getLocation().getContig(), current==null?-1:current.getLocation().getStart()); - while ( hasNext() ) { - ROD proposed = next(); - if( proposed == null ) - continue; - //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); - if ( DEBUG ) System.out.println(" proposed at "+proposed.getLocation()+"; contig index="+proposed.getLocation().getContigIndex()); - boolean containedP = proposed.getLocation().containsP(loc); - //System.out.printf(" %s -> Seeking to %s, at %s => contains = %b%n", current.getName(), loc, current.getLocation(), containedP); - int cmp = proposed.getLocation().compareTo(loc); - if ( cmp < 0 ) { - if ( DEBUG ) System.out.println(" we are before..."); - // current occurs before loc, continue searching - continue; - } - else if ( cmp == 0 || containedP ) { - if ( DEBUG ) System.out.println(" we found overlap..."); - result = proposed; - break; - } else { - if ( DEBUG ) 
System.out.println(" we are after..."); - // current is after loc - it.pushback(proposed); - break; - } - } - - if ( DEBUG ) { - if ( result != null ) - System.out.printf(" ### Found %s%n", result.getLocation()); - } - - // make a note that the iterator last seeked to the specified position - current = result; - position = loc.clone(); - - // we ran out of elements or found something - return result; - } - - public void remove() { - throw new UnsupportedOperationException(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 79661b899..b5f314c96 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -2,9 +2,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.apache.log4j.Logger; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedList; +import java.util.*; /** * This class represents the Reference Metadata available at a particular site in the genome. It can be @@ -23,31 +21,68 @@ import java.util.LinkedList; * Time: 3:05:23 PM */ public class RefMetaDataTracker { - final HashMap map = new HashMap(); + final HashMap> map = new HashMap>(); protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); /** - * Finds the reference meta data named name, if it exists, otherwise returns the defaultValue - * + * Finds the reference meta data named name, if it exists, otherwise returns the defaultValue. + * This is a legacy method that works with "singleton" tracks, in which a single ROD record can be associated + * with any given site. If track provides multiple records associated with a site, this method will return + * the first one. 
* @param name * @param defaultValue * @return */ + @Deprecated public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) { + //logger.debug(String.format("Lookup %s%n", name)); + final String luName = canonicalName(name); + if ( map.containsKey(luName) ) { + RODRecordList value = map.get(luName) ; + if ( value != null ) { + List l = value.getRecords(); + if ( l != null & l.size() > 0 ) return value.getRecords().get(0); + } + } + return defaultValue; + } + + /** + * Finds the reference metadata track named 'name' and returns all ROD records from that track associated + * with the current site as a RODRecordList collection object. If no data track with specified name is available, + * returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up + * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and + * location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution, + * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: + * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, + * regardless of the presence of "extended" RODs overlapping with that location). 
+ * @param name + * @param defaultValue + * @return + */ + public RODRecordList getTrackData(final String name, ReferenceOrderedDatum defaultValue) { //logger.debug(String.format("Lookup %s%n", name)); final String luName = canonicalName(name); if ( map.containsKey(luName) ) return map.get(luName); - else - return defaultValue; - } + else { + if ( defaultValue == null ) { + return new RODRecordList(luName, Collections.singletonList(defaultValue), null); + } else { + return new RODRecordList(defaultValue.getName(), + Collections.singletonList(defaultValue), + defaultValue.getLocation()); + } + } + } /** * @see this.lookup * @param name * @param defaultValue * @return */ + @Deprecated public Object lookup(final String name, Object defaultValue) { final String luName = canonicalName(name); if ( map.containsKey(luName) ) @@ -68,7 +103,7 @@ public class RefMetaDataTracker { } /** - * Is there a binding at this site to a ROD with name? + * Is there a binding at this site to a ROD/track with the specified name? * * @param name the name of the rod * @return true if it has the rod @@ -78,39 +113,64 @@ public class RefMetaDataTracker { } /** - * Get all of the RODs at the current site + * Get all of the RODs at the current site. The collection is "flattened": for any track that has multiple records + * at the current site, they all will be added to the list as separate elements. * * @return */ public Collection getAllRods() { - return map.values(); + List l = new ArrayList(); + for ( RODRecordList rl : map.values() ) { + if ( rl == null ) continue; // how do we get null value stored for a track? shouldn't the track be missing from the map alltogether? + l.addAll(rl.getRecords()); + } + return l; + } /** - * Get all of the RODs at the current site + * Get all of the ROD tracks at the current site. Each track is returned as a single compound + * object (RODRecordList) that may contain multiple ROD records associated with the current site. 
* * @return */ - public Collection getBoundRods() { - LinkedList bound = new LinkedList(); + public Collection> getBoundRodTracks() { + LinkedList> bound = new LinkedList>(); - for ( ReferenceOrderedDatum value : map.values() ) { - if ( value != null ) - bound.add(value); + for ( RODRecordList value : map.values() ) { + if ( value != null && value.size() != 0 ) bound.add(value); } return bound; } + public Collection getBoundRodRecords() { + LinkedList bound = new LinkedList(); + + for ( RODRecordList valueList : map.values() ) { + for ( ReferenceOrderedDatum value : valueList ) { + if ( value != null ) + bound.add(value); + } + } + + return bound; + } /** - * Binds the reference ordered datum ROD to name at this site. Should be used only but the traversal + * Binds the list of reference ordered data records (RODs) to track name at this site. Should be used only by the traversal * system to provide access to RODs in a structured way to the walkers. * * @param name * @param rod */ + public void bind(final String name, RODRecordList rod) { + //logger.debug(String.format("Binding %s to %s", name, rod)); + map.put(canonicalName(name), rod); + } +/* public void bind(final String name, ReferenceOrderedDatum rod) { //logger.debug(String.format("Binding %s to %s", name, rod)); map.put(canonicalName(name), rod); } + */ } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 2371fd02e..9bec6e50f 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -1,14 +1,10 @@ package org.broadinstitute.sting.gatk.refdata; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MalformedGenomeLocException; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; -import 
org.broadinstitute.sting.utils.xReadLines; import java.io.*; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; @@ -20,13 +16,13 @@ import java.util.*; * Time: 10:47:14 AM * To change this template use File | Settings | File Templates. */ -public class ReferenceOrderedData implements Iterable { +public class ReferenceOrderedData implements Iterable> { private String name; private File file = null; - private String fieldDelimiter; +// private String fieldDelimiter; /** Header object returned from the datum */ - private Object header = null; +// private Object header = null; private Class type = null; // runtime type information for object construction @@ -180,8 +176,8 @@ public class ReferenceOrderedData implements this.file = file; this.type = type; this.name = name; - this.header = initializeROD(name, file, type); - this.fieldDelimiter = newROD(name, type).delimiterRegex(); +// this.header = initializeROD(name, file, type); +// this.fieldDelimiter = newROD(name, type).delimiterRegex(); } public String getName() { return name; } @@ -200,13 +196,13 @@ public class ReferenceOrderedData implements return this.name.equals(name) && type.isAssignableFrom(this.type); } - public RODIterator iterator() { + public SeekableRODIterator iterator() { Iterator it; try { Method m = type.getDeclaredMethod("createIterator", String.class, java.io.File.class); it = (Iterator) m.invoke(null, name, file); } catch (java.lang.NoSuchMethodException e) { - it = new SimpleRODIterator(); + it = new RODRecordIterator(file,name,type); } catch (java.lang.NullPointerException e) { throw new RuntimeException(e); } catch (java.lang.SecurityException e) { @@ -218,7 +214,8 @@ public class ReferenceOrderedData implements } catch (java.lang.reflect.InvocationTargetException e) { throw new RuntimeException(e); } - return new RODIterator(it); + // return new RODIterator(it); + return new SeekableRODIterator(it); 
} // ---------------------------------------------------------------------- @@ -227,10 +224,10 @@ public class ReferenceOrderedData implements // // ---------------------------------------------------------------------- public void testMe() { - for (ReferenceOrderedDatum rec : this) { - System.out.println(rec.toString()); + for (RODRecordList rec : this) { + System.out.println(rec.getRecords().get(0).toString()); - RodGenotypeChipAsGFF gff = (RodGenotypeChipAsGFF) rec; + RodGenotypeChipAsGFF gff = (RodGenotypeChipAsGFF) rec.getRecords().get(0); String[] keys = {"LENGTH", "ALT", "FOBARBAR"}; for (String key : keys) { System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none"); @@ -246,8 +243,10 @@ public class ReferenceOrderedData implements // ---------------------------------------------------------------------- public ArrayList readAll() { ArrayList elts = new ArrayList(); - for (ReferenceOrderedDatum rec : this) { - elts.add(rec); + for ( RODRecordList l : this ) { + for (ReferenceOrderedDatum rec : l) { + elts.add(rec); + } } elts.trimToSize(); return elts; @@ -269,12 +268,14 @@ public class ReferenceOrderedData implements public boolean validateFile() throws Exception { ReferenceOrderedDatum last = null; - for (ReferenceOrderedDatum rec : this) { - if (last != null && last.compareTo(rec) == 1) { - // It's out of order - throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString()); + for ( RODRecordList l : this ) { + for (ReferenceOrderedDatum rec : l) { + if (last != null && last.compareTo(rec) > 1) { + // It's out of order + throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString()); + } + last = rec; } - last = rec; } return true; } @@ -288,103 +289,103 @@ public class ReferenceOrderedData implements // Iteration // // ---------------------------------------------------------------------- - private class SimpleRODIterator implements Iterator { - 
private xReadLines parser = null; - - public SimpleRODIterator() { - try { - parser = new xReadLines(file); - } catch (FileNotFoundException e) { - Utils.scareUser("Couldn't open file: " + file); - } - } - - public boolean hasNext() { - //System.out.printf("Parser has next: %b%n", parser.hasNext()); - return parser.hasNext(); - } - - public ROD next() { - ROD n = null; - boolean success = false; - boolean firstFailure = true; - - do { - final String line = parser.next(); - //System.out.printf("Line is '%s'%n", line); - String parts[] = line.split(fieldDelimiter); - - try { - n = parseLine(parts); - // Two failure conditions: - // 1) parseLine throws an exception. - // 2) parseLine returns null. - // 3) parseLine throws a RuntimeException. - // TODO: Clean this up so that all errors are handled in one spot. - success = (n != null); - } - catch (MalformedGenomeLocException ex) { - if (firstFailure) { - Utils.warnUser("Failed to parse contig on line '" + line + "'. The reason given was: " + ex.getMessage() + " Skipping ahead to the next recognized GenomeLoc. "); - firstFailure = false; - } - if (!parser.hasNext()) - Utils.warnUser("Unable to find more valid reference-ordered data. 
Giving up."); - } - - } while (!success && parser.hasNext()); - - return n; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } +// private class SimpleRODIterator implements Iterator { +// private xReadLines parser = null; +// +// public SimpleRODIterator() { +// try { +// parser = new xReadLines(file); +// } catch (FileNotFoundException e) { +// Utils.scareUser("Couldn't open file: " + file); +// } +// } +// +// public boolean hasNext() { +// //System.out.printf("Parser has next: %b%n", parser.hasNext()); +// return parser.hasNext(); +// } +// +// public ROD next() { +// ROD n = null; +// boolean success = false; +// boolean firstFailure = true; +// +// do { +// final String line = parser.next(); +// //System.out.printf("Line is '%s'%n", line); +// String parts[] = line.split(fieldDelimiter); +// +// try { +// n = parseLine(parts); +// // Two failure conditions: +// // 1) parseLine throws an exception. +// // 2) parseLine returns null. +// // 3) parseLine throws a RuntimeException. +// // TODO: Clean this up so that all errors are handled in one spot. +// success = (n != null); +// } +// catch (MalformedGenomeLocException ex) { +// if (firstFailure) { +// Utils.warnUser("Failed to parse contig on line '" + line + "'. The reason given was: " + ex.getMessage() + " Skipping ahead to the next recognized GenomeLoc. "); +// firstFailure = false; +// } +// if (!parser.hasNext()) +// Utils.warnUser("Unable to find more valid reference-ordered data. 
Giving up."); +// } +// +// } while (!success && parser.hasNext()); +// +// return n; +// } +// +// public void remove() { +// throw new UnsupportedOperationException(); +// } +// } // ---------------------------------------------------------------------- // // Parsing // // ---------------------------------------------------------------------- - private Constructor parsing_constructor; +// private Constructor parsing_constructor; - private ROD newROD(final String name, final Class type) { - try { - return (ROD) parsing_constructor.newInstance(name); - } catch (java.lang.InstantiationException e) { - throw new RuntimeException(e); - } catch (java.lang.IllegalAccessException e) { - throw new RuntimeException(e); - } catch (InvocationTargetException e) { - throw new RuntimeException(e); - } - } +// private ROD newROD(final String name, final Class type) { +// try { +// return (ROD) parsing_constructor.newInstance(name); +// } catch (java.lang.InstantiationException e) { +// throw new RuntimeException(e); +// } catch (java.lang.IllegalAccessException e) { +// throw new RuntimeException(e); +// } catch (InvocationTargetException e) { +// throw new RuntimeException(e); +// } +// } - private Object initializeROD(final String name, final File file, final Class type) { - try { - parsing_constructor = type.getConstructor(String.class); - } - catch (java.lang.NoSuchMethodException e) { - throw new RuntimeException(e); - } - ROD rod = newROD(name, type); - try { - return rod.initialize(file); - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } - } +// private Object initializeROD(final String name, final File file, final Class type) { +// try { +// parsing_constructor = type.getConstructor(String.class); +// } +// catch (java.lang.NoSuchMethodException e) { +// throw new RuntimeException(e); +// } +// ROD rod = newROD(name, type); +// try { +// return rod.initialize(file); +// } catch (FileNotFoundException e) { +// throw new RuntimeException(e); +// } 
+// } - private ROD parseLine(final String[] parts) { - //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); - ROD obj = newROD(name, type); - try { - if (!obj.parseLine(header, parts)) - obj = null; - } catch (IOException e) { - throw new RuntimeException("Badly formed ROD: " + e); - } - return obj; - } +// private ROD parseLine(final String[] parts) { +// //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); +// ROD obj = newROD(name, type); +// try { +// if (!obj.parseLine(header, parts)) +// obj = null; +// } catch (IOException e) { +// throw new RuntimeException("Badly formed ROD: " + e); +// } +// return obj; +// } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java index fe778235c..829bfbdb9 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/CountRodWalker.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RODRecordList; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.cmdLine.Argument; @@ -55,8 +56,8 @@ public class CountRodWalker extends RodWalker rods = new LinkedList(); - for ( ReferenceOrderedDatum rod : tracker.getBoundRods() ) { + Collection> rods = new LinkedList>(); + for ( RODRecordList rod : tracker.getBoundRodTracks() ) { //System.out.printf("Considering rod %s%n", rod); if ( rod.getLocation().getStart() == context.getLocation().getStart() && ! 
rod.getName().equals("interval") ) { // only consider the first element @@ -70,7 +71,7 @@ public class CountRodWalker extends RodWalker 0 ) { if ( verbose ) { List names = new ArrayList(); - for ( ReferenceOrderedDatum rod : rods ) { + for ( RODRecordList rod : rods ) { names.add(rod.getName()); } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolTest.java index 17e3e4d53..8ab81c41d 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolTest.java @@ -9,10 +9,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.TabularROD; -import org.broadinstitute.sting.gatk.refdata.RODIterator; +import org.broadinstitute.sting.gatk.refdata.*; import java.io.File; import java.io.FileNotFoundException; @@ -58,12 +55,12 @@ public class ReferenceOrderedDataPoolTest extends BaseTest { @Test public void testCreateSingleIterator() { ResourcePool iteratorPool = new ReferenceOrderedDataPool(rod); - RODIterator iterator = (RODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + SeekableRODIterator iterator = (SeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - 
TabularROD datum = (TabularROD)iterator.next(); + TabularROD datum = (TabularROD)iterator.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); @@ -79,36 +76,36 @@ public class ReferenceOrderedDataPoolTest extends BaseTest { @Test public void testCreateMultipleIterators() { ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); - RODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + SeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); // Create a new iterator at position 2. - RODIterator iterator2 = iteratorPool.iterator( new MappedStreamSegment(testSite2) ); + SeekableRODIterator iterator2 = iteratorPool.iterator( new MappedStreamSegment(testSite2) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); // Test out-of-order access: first iterator2, then iterator1. // Ugh...first call to a region needs to be a seek. - TabularROD datum = (TabularROD)iterator2.seekForward(testSite2); + TabularROD datum = (TabularROD)iterator2.seekForward(testSite2).getRecords().get(0); assertTrue(datum.getLocation().equals(testSite2)); assertTrue(datum.get("COL1").equals("C")); assertTrue(datum.get("COL2").equals("D")); assertTrue(datum.get("COL3").equals("E")); - datum = (TabularROD)iterator1.next(); + datum = (TabularROD)iterator1.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); assertTrue(datum.get("COL3").equals("C")); // Advance iterator2, and make sure both iterator's contents are still correct. 
- datum = (TabularROD)iterator2.next(); + datum = (TabularROD)iterator2.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); assertTrue(datum.get("COL3").equals("H")); - datum = (TabularROD)iterator1.next(); + datum = (TabularROD)iterator1.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite2)); assertTrue(datum.get("COL1").equals("C")); assertTrue(datum.get("COL2").equals("D")); @@ -129,12 +126,12 @@ public class ReferenceOrderedDataPoolTest extends BaseTest { @Test public void testIteratorConservation() { ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); - RODIterator iterator = (RODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + SeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - TabularROD datum = (TabularROD)iterator.next(); + TabularROD datum = (TabularROD)iterator.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); @@ -149,7 +146,7 @@ public class ReferenceOrderedDataPoolTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - datum = (TabularROD)iterator.seekForward(testSite3); + datum = (TabularROD)iterator.seekForward(testSite3).getRecords().get(0); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); @@ -164,12 +161,12 @@ public class 
ReferenceOrderedDataPoolTest extends BaseTest { @Test public void testIteratorCreation() { ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(rod); - RODIterator iterator = (RODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite3) ); + SeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) ); Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - TabularROD datum = (TabularROD)iterator.seekForward(testSite3); + TabularROD datum = (TabularROD)iterator.seekForward(testSite3).getRecords().get(0); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); @@ -184,7 +181,7 @@ public class ReferenceOrderedDataPoolTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators()); - datum = (TabularROD)iterator.next(); + datum = (TabularROD)iterator.next().getRecords().get(0); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java index e1f866ed2..bd41876a4 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -26,7 +26,7 @@ import net.sf.picard.reference.ReferenceSequenceFile; public class TabularRODTest extends BaseTest { private static ReferenceSequenceFile seq; private ReferenceOrderedData ROD; - private RODIterator iter; + private SeekableRODIterator iter; 
@BeforeClass @@ -48,7 +48,8 @@ public class TabularRODTest extends BaseTest { @Test public void test1() { logger.warn("Executing test1"); - TabularROD one = (TabularROD)iter.next(); + RODRecordList oneList = iter.next(); + TabularROD one = oneList.getRecords().get(0); assertTrue(one.size() == 4); assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); assertTrue(one.get("COL1").equals("A")); @@ -59,8 +60,10 @@ public class TabularRODTest extends BaseTest { @Test public void test2() { logger.warn("Executing test2"); - TabularROD one = (TabularROD)iter.next(); - TabularROD two = (TabularROD)iter.next(); + RODRecordList oneList = iter.next(); + RODRecordList twoList = iter.next(); + TabularROD one = oneList.getRecords().get(0); + TabularROD two = twoList.getRecords().get(0); assertTrue(two.size() == 4); assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20))); assertTrue(two.get("COL1").equals("C")); @@ -71,9 +74,12 @@ public class TabularRODTest extends BaseTest { @Test public void test3() { logger.warn("Executing test3"); - TabularROD one = (TabularROD)iter.next(); - TabularROD two = (TabularROD)iter.next(); - TabularROD three = (TabularROD)iter.next(); + RODRecordList oneList = iter.next(); + RODRecordList twoList = iter.next(); + RODRecordList threeList = iter.next(); + TabularROD one = oneList.getRecords().get(0); + TabularROD two = twoList.getRecords().get(0); + TabularROD three = threeList.getRecords().get(0); assertTrue(three.size() == 4); assertTrue(three.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 30))); assertTrue(three.get("COL1").equals("F")); @@ -84,16 +90,20 @@ public class TabularRODTest extends BaseTest { @Test public void testDone() { logger.warn("Executing testDone"); - TabularROD one = (TabularROD)iter.next(); - TabularROD two = (TabularROD)iter.next(); - TabularROD three = (TabularROD)iter.next(); + RODRecordList oneList = iter.next(); + RODRecordList twoList = iter.next(); + 
RODRecordList threeList = iter.next(); + TabularROD one = oneList.getRecords().get(0); + TabularROD two = twoList.getRecords().get(0); + TabularROD three = threeList.getRecords().get(0); assertTrue(!iter.hasNext()); } @Test public void testSeek() { logger.warn("Executing testSeek"); - TabularROD two = (TabularROD)iter.seekForward(GenomeLocParser.createGenomeLoc("chrM", 20)); + RODRecordList twoList = iter.seekForward(GenomeLocParser.createGenomeLoc("chrM", 20)); + TabularROD two = twoList.getRecords().get(0); assertTrue(two.size() == 4); assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20))); assertTrue(two.get("COL1").equals("C")); @@ -104,7 +114,8 @@ public class TabularRODTest extends BaseTest { @Test public void testToString() { logger.warn("Executing testToString"); - TabularROD one = (TabularROD)iter.next(); + RODRecordList oneList = iter.next(); + TabularROD one = oneList.getRecords().get(0); assertTrue(one.toString().equals("chrM:10\tA\tB\tC")); } @@ -113,10 +124,11 @@ public class TabularRODTest extends BaseTest { public void testDelim1() { File file2 = new File(testDir + "TabularDataTest2.dat"); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - RODIterator iter_commas = ROD_commas.iterator(); + SeekableRODIterator iter_commas = ROD_commas.iterator(); logger.warn("Executing testDelim1"); - TabularROD one2 = (TabularROD)iter_commas.next(); + RODRecordList one2List = iter_commas.next(); + TabularROD one2 = one2List.getRecords().get(0); assertTrue(one2.size() == 5); assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); assertTrue(one2.get("COL1").equals("A")); @@ -130,10 +142,11 @@ public class TabularRODTest extends BaseTest { TabularROD.setDelimiter(",",","); File file2 = new File(testDir + "TabularDataTest2.dat"); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - RODIterator iter_commas = 
ROD_commas.iterator(); + SeekableRODIterator iter_commas = ROD_commas.iterator(); logger.warn("Executing testDelim1"); - TabularROD one2 = (TabularROD)iter_commas.next(); + RODRecordList one2List = iter_commas.next(); + TabularROD one2 = one2List.getRecords().get(0); assertTrue(one2.size() == 5); assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); assertTrue(one2.get("COL1").equals("A")); @@ -174,16 +187,18 @@ public class TabularRODTest extends BaseTest { out.println(row.toString()); ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class); - RODIterator iter_commas = ROD_commas.iterator(); + SeekableRODIterator iter_commas = ROD_commas.iterator(); - TabularROD one = (TabularROD)iter_commas.next(); + RODRecordList oneList = iter_commas.next(); + TabularROD one = oneList.getRecords().get(0); assertTrue(one.size() == 4); assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 1))); assertTrue(one.get("col1").equals("1")); assertTrue(one.get("col2").equals("2")); assertTrue(one.get("col3").equals("3")); - TabularROD two = (TabularROD)iter_commas.next(); + RODRecordList twoList = iter_commas.next(); + TabularROD two = twoList.getRecords().get(0); assertTrue(two.size() == 4); assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 2))); assertTrue(two.get("col1").equals("3"));