2009-04-04 03:54:54 +08:00
package org.broadinstitute.sting.gatk.refdata ;
import org.apache.log4j.Logger ;
2010-08-06 02:47:53 +08:00
import org.broad.tribble.util.variantcontext.VariantContext ;
2010-04-19 13:47:17 +08:00
import org.broadinstitute.sting.gatk.contexts.ReferenceContext ;
2010-04-01 06:39:56 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature ;
2010-02-26 06:48:55 +08:00
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList ;
2010-09-12 23:07:38 +08:00
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
2010-02-05 23:42:54 +08:00
import org.broadinstitute.sting.utils.GenomeLoc ;
2010-09-12 22:02:43 +08:00
import org.broadinstitute.sting.utils.exceptions.UserException ;
2009-04-04 03:54:54 +08:00
2009-09-22 00:55:22 +08:00
import java.util.* ;
2009-04-04 03:54:54 +08:00
/**
 * This class represents the Reference Metadata available at a particular site in the genome.  It can be
 * used to conveniently look up the RMDs at this site, as well as just getting a list of all of the RMDs.
 *
 * The standard interaction model is:
 *
 * Traversal system arrives at a site, which has a bunch of RMDs covering it
 * Traversal calls tracker.bind(name, RMD) for each RMD in RMDs
 * Traversal passes tracker to the walker
 * walker calls lookup(name, clazz) to obtain the RMD value at this site, or gets null if none was
 * bound at this site.
 *
 * User: mdepristo
 * Date: Apr 3, 2009
 * Time: 3:05:23 PM
 */
public class RefMetaDataTracker {
2010-02-25 06:11:53 +08:00
// Bindings for the current site: canonical (lower-cased) track name -> records bound under that name by bind().
final HashMap < String , RODRecordList > map = new HashMap < String , RODRecordList > ( ) ;
2009-04-04 03:54:54 +08:00
// Class-wide log4j logger; lookup() uses it to report that it picked one record out of several.
protected static Logger logger = Logger . getLogger ( RefMetaDataTracker . class ) ;
2010-04-01 06:39:56 +08:00
2009-04-04 03:54:54 +08:00
/ * *
2010-04-01 06:39:56 +08:00
* get all the reference meta data associated with a track name .
* @param name the name of the track we ' re looking for
* @return a list of objects , representing the underlying objects that the tracks produce . I . e . for a
* dbSNP RMD this will be a RodDbSNP , etc .
*
* Important : The list returned by this function is guaranteed not to be null , but may be empty !
2009-04-04 03:54:54 +08:00
* /
2010-04-01 06:39:56 +08:00
public List < Object > getReferenceMetaData ( final String name ) {
RODRecordList list = getTrackDataByName ( name , true ) ;
List < Object > objects = new ArrayList < Object > ( ) ;
if ( list = = null ) return objects ;
for ( GATKFeature feature : list )
objects . add ( feature . getUnderlyingObject ( ) ) ;
return objects ;
2009-09-22 00:55:22 +08:00
}
/ * *
2010-04-01 06:39:56 +08:00
* get all the reference meta data associated with a track name .
* @param name the name of the track we ' re looking for
* @param requireExactMatch do we require an exact match for the name ( true ) or do we require only that the name starts with
* the passed in parameter ( false ) .
* @return a list of objects , representing the underlying objects that the tracks produce . I . e . for a
* dbSNP rod this will be a RodDbSNP , etc .
*
* Important : The list returned by this function is guaranteed not to be null , but may be empty !
2009-09-22 00:55:22 +08:00
* /
2010-04-01 06:39:56 +08:00
public List < Object > getReferenceMetaData ( final String name , boolean requireExactMatch ) {
RODRecordList list = getTrackDataByName ( name , requireExactMatch ) ;
List < Object > objects = new ArrayList < Object > ( ) ;
if ( list = = null ) return objects ;
for ( GATKFeature feature : list )
objects . add ( feature . getUnderlyingObject ( ) ) ;
return objects ;
2010-01-20 05:33:13 +08:00
}
2010-05-20 01:40:20 +08:00
/ * *
* get all the GATK features associated with a specific track name
* @param name the name of the track we ' re looking for
* @param requireExactMatch do we require an exact match for the name ( true ) or do we require only that the name starts with
* the passed in parameter ( false ) .
* @return a list of GATKFeatures for the target rmd
*
* Important : The list returned by this function is guaranteed not to be null , but may be empty !
* /
public List < GATKFeature > getGATKFeatureMetaData ( final String name , boolean requireExactMatch ) {
List < GATKFeature > feat = getTrackDataByName ( name , requireExactMatch ) ;
return ( feat = = null ) ? new ArrayList < GATKFeature > ( ) : feat ; // to satisfy the above requirement that we don't return null
}
2009-04-10 06:04:59 +08:00
/ * *
2010-04-01 06:39:56 +08:00
* get a singleton record , given the name and a type . This function will return the first record at the current position seen ,
* and emit a logger warning if there were more than one option .
*
* WARNING : this method is deprecated , since we now suppport more than one RMD at a single position for all tracks . If there are
* are multiple RMD objects at this location , there is no contract for which object this method will pick , and which object gets
* picked may change from time to time ! BE WARNED !
*
* @param name the name of the track
* @param clazz the underlying type to return
* @param < T > the type to parameterize on , matching the clazz argument
* @return a record of type T , or null if no record is present .
2009-04-10 06:04:59 +08:00
* /
2009-09-22 00:55:22 +08:00
@Deprecated
2010-04-01 06:39:56 +08:00
public < T > T lookup ( final String name , Class < T > clazz ) {
RODRecordList objects = getTrackDataByName ( name , true ) ;
2009-04-04 03:54:54 +08:00
2010-04-01 06:39:56 +08:00
// if emtpy or null return null;
if ( objects = = null | | objects . size ( ) < 1 ) return null ;
if ( objects . size ( ) > 1 )
logger . info ( "lookup is choosing the first record from " + ( objects . size ( ) - 1 ) + " options" ) ;
Object obj = objects . get ( 0 ) . getUnderlyingObject ( ) ;
if ( ! ( clazz . isAssignableFrom ( obj . getClass ( ) ) ) )
2010-09-12 22:02:43 +08:00
throw new UserException . CommandLineException ( "Unable to case track named " + name + " to type of " + clazz . toString ( )
2010-09-10 07:21:17 +08:00
+ " it's of type " + obj . getClass ( ) ) ;
2010-04-01 06:39:56 +08:00
return ( T ) obj ;
2009-04-10 06:04:59 +08:00
}
/ * *
2009-09-22 00:55:22 +08:00
* Is there a binding at this site to a ROD / track with the specified name ?
2009-04-10 06:04:59 +08:00
*
2009-07-21 08:55:52 +08:00
* @param name the name of the rod
* @return true if it has the rod
2009-04-10 06:04:59 +08:00
* /
2009-07-21 08:55:52 +08:00
public boolean hasROD ( final String name ) {
2010-04-01 06:39:56 +08:00
return map . containsKey ( canonicalName ( name ) ) ;
2009-04-04 03:54:54 +08:00
}
2010-04-01 06:39:56 +08:00
2009-04-10 06:04:59 +08:00
/ * *
2010-04-01 06:39:56 +08:00
* Get all of the RMDs at the current site . The collection is "flattened" : for any track that has multiple records
2009-09-22 00:55:22 +08:00
* at the current site , they all will be added to the list as separate elements .
2010-04-01 06:39:56 +08:00
*
2010-04-19 13:47:17 +08:00
* @return collection of all rods
2009-04-10 06:04:59 +08:00
* /
2010-04-01 06:39:56 +08:00
public Collection < GATKFeature > getAllRods ( ) {
List < GATKFeature > l = new ArrayList < GATKFeature > ( ) ;
2010-02-25 06:11:53 +08:00
for ( RODRecordList rl : map . values ( ) ) {
2009-09-22 00:55:22 +08:00
if ( rl = = null ) continue ; // how do we get null value stored for a track? shouldn't the track be missing from the map alltogether?
2010-02-26 06:48:55 +08:00
l . addAll ( rl ) ;
2009-09-22 00:55:22 +08:00
}
return l ;
2009-04-04 03:54:54 +08:00
}
2009-09-13 03:07:57 +08:00
/ * *
2010-04-01 06:39:56 +08:00
* Get all of the RMD tracks at the current site . Each track is returned as a single compound
* object ( RODRecordList ) that may contain multiple RMD records associated with the current site .
2009-09-13 03:07:57 +08:00
*
2010-04-19 13:47:17 +08:00
* @return collection of all tracks
2009-09-13 03:07:57 +08:00
* /
2010-02-25 06:11:53 +08:00
public Collection < RODRecordList > getBoundRodTracks ( ) {
LinkedList < RODRecordList > bound = new LinkedList < RODRecordList > ( ) ;
2010-04-01 06:39:56 +08:00
2010-02-25 06:11:53 +08:00
for ( RODRecordList value : map . values ( ) ) {
2010-04-01 06:39:56 +08:00
if ( value ! = null & & value . size ( ) ! = 0 ) bound . add ( value ) ;
2009-09-13 03:07:57 +08:00
}
return bound ;
}
2009-10-07 07:40:30 +08:00
/**
 * Counts the tracks that have at least one record at the current site, excluding none of them.
 *
 * @return the number of bound, non-empty tracks
 */
public int getNBoundRodTracks() {
    final String noExclusion = null;
    return getNBoundRodTracks(noExclusion);
}
public int getNBoundRodTracks ( final String excludeIn ) {
final String exclude = excludeIn = = null ? null : canonicalName ( excludeIn ) ;
int n = 0 ;
2010-02-25 06:11:53 +08:00
for ( RODRecordList value : map . values ( ) ) {
2010-04-01 06:39:56 +08:00
if ( value ! = null & & ! value . isEmpty ( ) ) {
if ( exclude = = null | | ! value . getName ( ) . equals ( exclude ) )
2009-10-07 07:40:30 +08:00
n + + ;
2010-04-01 06:39:56 +08:00
}
2009-10-07 07:40:30 +08:00
}
return n ;
}
2009-09-22 00:55:22 +08:00
2010-04-01 06:39:56 +08:00
/ * *
* Binds the list of reference ordered data records ( RMDs ) to track name at this site . Should be used only by the traversal
* system to provide access to RMDs in a structured way to the walkers .
*
* @param name the name of the track
* @param rod the collection of RMD data
* /
public void bind ( final String name , RODRecordList rod ) {
//logger.debug(String.format("Binding %s to %s", name, rod));
map . put ( canonicalName ( name ) , rod ) ;
2009-09-22 00:55:22 +08:00
}
2010-02-05 23:42:54 +08:00
2010-02-07 00:26:06 +08:00
/ * *
* Converts all possible ROD tracks to VariantContexts objects , of all types , allowing any start and any number
* of entries per ROD .
2010-04-19 13:47:17 +08:00
* The name of each VariantContext corresponds to the ROD name .
*
* @param ref reference context
* @return variant context
2010-02-07 00:26:06 +08:00
* /
2010-04-19 13:47:17 +08:00
public Collection < VariantContext > getAllVariantContexts ( ReferenceContext ref ) {
return getAllVariantContexts ( ref , null , null , false , false ) ;
2010-02-05 23:42:54 +08:00
}
2010-07-01 04:13:03 +08:00
/ * *
* Returns all of the variant contexts that start at the current location
* @param ref
* @param curLocation
* @return
* /
public Collection < VariantContext > getAllVariantContexts ( ReferenceContext ref , GenomeLoc curLocation ) {
return getAllVariantContexts ( ref , null , curLocation , true , false ) ;
}
2010-02-07 00:26:06 +08:00
/ * *
* Converts all possible ROD tracks to VariantContexts objects . If allowedTypes ! = null , then only
* VariantContexts in the allow set of types will be returned . If requireStartsHere is true , then curLocation
* must not be null , and only records whose start position is = = to curLocation . getStart ( ) will be returned .
* If takeFirstOnly is true , then only a single VariantContext will be converted from any individual ROD . Of course ,
* this single object must pass the allowed types and start here options if provided . Note that the result
* may return multiple VariantContexts with the same name if that particular track contained multiple RODs spanning
* the current location .
*
* The name of each VariantContext corresponds to the ROD name .
*
2010-04-19 13:47:17 +08:00
* @param ref reference context
2010-03-27 02:34:59 +08:00
* @param allowedTypes allowed types
2010-04-19 13:47:17 +08:00
* @param curLocation location
2010-03-27 02:34:59 +08:00
* @param requireStartHere do we require the rod to start at this location ?
* @param takeFirstOnly do we take the first rod only ?
* @return variant context
2010-02-07 00:26:06 +08:00
* /
2010-04-19 13:47:17 +08:00
public Collection < VariantContext > getAllVariantContexts ( ReferenceContext ref , EnumSet < VariantContext . Type > allowedTypes , GenomeLoc curLocation , boolean requireStartHere , boolean takeFirstOnly ) {
2010-02-05 23:42:54 +08:00
List < VariantContext > contexts = new ArrayList < VariantContext > ( ) ;
2010-02-25 06:11:53 +08:00
for ( RODRecordList rodList : getBoundRodTracks ( ) ) {
2010-04-19 13:47:17 +08:00
addVariantContexts ( contexts , rodList , ref , allowedTypes , curLocation , requireStartHere , takeFirstOnly ) ;
2010-02-05 23:42:54 +08:00
}
return contexts ;
}
2010-02-07 00:26:06 +08:00
/ * *
* Gets the variant contexts associated with track name name
*
* see getVariantContexts for more information .
*
2010-05-20 07:27:55 +08:00
* @param ref ReferenceContext to enable conversion to variant context
2010-03-27 02:34:59 +08:00
* @param name name
* @param curLocation location
* @param allowedTypes allowed types
* @param requireStartHere do we require the rod to start at this location ?
* @param takeFirstOnly do we take the first rod only ?
* @return variant context
2010-02-07 00:26:06 +08:00
* /
2010-05-20 07:27:55 +08:00
// public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
// return getVariantContexts(null, Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly);
// }
2010-03-27 02:34:59 +08:00
2010-04-19 13:47:17 +08:00
public Collection < VariantContext > getVariantContexts ( ReferenceContext ref , String name , EnumSet < VariantContext . Type > allowedTypes , GenomeLoc curLocation , boolean requireStartHere , boolean takeFirstOnly ) {
return getVariantContexts ( ref , Arrays . asList ( name ) , allowedTypes , curLocation , requireStartHere , takeFirstOnly ) ;
2010-02-11 00:12:29 +08:00
}
2010-05-20 07:27:55 +08:00
// public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
// return getVariantContexts(null, names, allowedTypes, curLocation, requireStartHere, takeFirstOnly);
// }
2010-03-27 02:34:59 +08:00
2010-04-19 13:47:17 +08:00
public Collection < VariantContext > getVariantContexts ( ReferenceContext ref , Collection < String > names , EnumSet < VariantContext . Type > allowedTypes , GenomeLoc curLocation , boolean requireStartHere , boolean takeFirstOnly ) {
2010-02-07 00:26:06 +08:00
Collection < VariantContext > contexts = new ArrayList < VariantContext > ( ) ;
2010-02-11 00:12:29 +08:00
for ( String name : names ) {
2010-04-01 06:39:56 +08:00
RODRecordList rodList = getTrackDataByName ( name , true ) ; // require that the name is an exact match
2010-02-11 00:12:29 +08:00
if ( rodList ! = null )
2010-04-19 13:47:17 +08:00
addVariantContexts ( contexts , rodList , ref , allowedTypes , curLocation , requireStartHere , takeFirstOnly ) ;
2010-02-11 00:12:29 +08:00
}
2010-02-07 00:26:06 +08:00
return contexts ;
}
/ * *
* Gets the variant context associated with name , and assumes the system only has a single bound track at this location . Throws an exception if not .
* see getVariantContexts for more information .
*
2010-03-27 02:34:59 +08:00
* @param name name
* @param curLocation location
* @param allowedTypes allowed types
* @param requireStartHere do we require the rod to start at this location ?
* @return variant context
2010-02-07 00:26:06 +08:00
* /
2010-05-20 07:27:55 +08:00
public VariantContext getVariantContext ( ReferenceContext ref , String name , EnumSet < VariantContext . Type > allowedTypes , GenomeLoc curLocation , boolean requireStartHere ) {
Collection < VariantContext > contexts = getVariantContexts ( ref , name , allowedTypes , curLocation , requireStartHere , false ) ;
2010-02-07 00:26:06 +08:00
if ( contexts . size ( ) > 1 )
2010-09-12 23:07:38 +08:00
throw new ReviewedStingException ( "Requested a single VariantContext object for track " + name + " but multiple variants were present at position " + curLocation ) ;
2010-02-11 00:12:29 +08:00
else if ( contexts . size ( ) = = 0 )
return null ;
else
return contexts . iterator ( ) . next ( ) ;
2010-02-07 00:26:06 +08:00
}
2010-04-01 06:39:56 +08:00
2010-04-19 13:47:17 +08:00
private void addVariantContexts ( Collection < VariantContext > contexts , RODRecordList rodList , ReferenceContext ref , EnumSet < VariantContext . Type > allowedTypes , GenomeLoc curLocation , boolean requireStartHere , boolean takeFirstOnly ) {
2010-04-01 06:39:56 +08:00
for ( GATKFeature rec : rodList ) {
if ( VariantContextAdaptors . canBeConvertedToVariantContext ( rec . getUnderlyingObject ( ) ) ) {
2010-02-07 00:26:06 +08:00
// ok, we might actually be able to turn this record in a variant context
2010-04-19 13:47:17 +08:00
VariantContext vc = VariantContextAdaptors . toVariantContext ( rodList . getName ( ) , rec . getUnderlyingObject ( ) , ref ) ;
2010-02-07 00:26:06 +08:00
2010-04-01 06:39:56 +08:00
if ( vc = = null ) // sometimes the track has odd stuff in it that can't be converted
2010-02-07 00:26:06 +08:00
continue ;
// now, let's decide if we want to keep it
boolean goodType = allowedTypes = = null | | allowedTypes . contains ( vc . getType ( ) ) ;
boolean goodPos = ! requireStartHere | | rec . getLocation ( ) . getStart ( ) = = curLocation . getStart ( ) ;
if ( goodType & & goodPos ) { // ok, we are going to keep this thing
contexts . add ( vc ) ;
if ( takeFirstOnly )
// we only want the first passing instance, so break the loop over records in rodList
break ;
}
}
}
}
2010-02-05 23:42:54 +08:00
2009-04-10 06:04:59 +08:00
/ * *
2010-04-01 06:39:56 +08:00
* Finds the reference metadata track named ' name ' and returns all ROD records from that track associated
* with the current site as a RODRecordList collection object . If no data track with specified name is available ,
* returns defaultValue wrapped as RODRecordList object . NOTE : if defaultValue is null , it will be wrapped up
* with track name set to ' name ' and location set to null ; otherwise the wrapper object will have name and
* location set to defaultValue . getName ( ) and defaultValue . getLocation ( ) , respectively ( use caution ,
* defaultValue . getLocation ( ) may be not equal to what RODRecordList ' s location would be expected to be otherwise :
* for instance , on locus traversal , location is usually expected to be a single base we are currently looking at ,
* regardless of the presence of "extended" RODs overlapping with that location ) .
2010-04-19 13:47:17 +08:00
* @param name track name
* @param requireExactMatch do we require an exact match of the rod name ?
* @return track data for the given rod
2009-04-10 06:04:59 +08:00
* /
2010-04-01 06:39:56 +08:00
private RODRecordList getTrackDataByName ( final String name , boolean requireExactMatch ) {
//logger.debug(String.format("Lookup %s%n", name));
final String luName = canonicalName ( name ) ;
RODRecordList trackData = null ;
if ( requireExactMatch ) {
if ( map . containsKey ( luName ) )
trackData = map . get ( luName ) ;
} else {
for ( Map . Entry < String , RODRecordList > datum : map . entrySet ( ) ) {
final String rodName = datum . getKey ( ) ;
2010-04-26 13:02:09 +08:00
if ( datum . getValue ( ) ! = null & & rodName . startsWith ( luName ) ) {
2010-04-01 06:39:56 +08:00
if ( trackData = = null ) trackData = new RODRecordListImpl ( name ) ;
//System.out.printf("Adding bindings from %s to %s at %s%n", rodName, name, datum.getValue().getLocation());
( ( RODRecordListImpl ) trackData ) . add ( datum . getValue ( ) , true ) ;
}
}
}
2010-04-26 13:02:09 +08:00
return trackData ;
2010-04-01 06:39:56 +08:00
}
/ * *
* Returns the canonical name of the rod name ( lowercases it )
* @param name the name of the rod
2010-04-19 13:47:17 +08:00
* @return canonical name of the rod
2010-04-01 06:39:56 +08:00
* /
private final String canonicalName ( final String name ) {
return name . toLowerCase ( ) ;
2009-09-22 00:55:22 +08:00
}
2009-04-04 03:54:54 +08:00
}