2009-04-04 03:54:54 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import org.apache.log4j.Logger;
|
2010-02-05 23:42:54 +08:00
|
|
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2010-02-07 00:26:06 +08:00
|
|
|
import org.broadinstitute.sting.utils.StingException;
|
2009-04-04 03:54:54 +08:00
|
|
|
|
2009-09-22 00:55:22 +08:00
|
|
|
import java.util.*;
|
2009-04-04 03:54:54 +08:00
|
|
|
|
|
|
|
|
/**
|
2009-04-10 06:04:59 +08:00
|
|
|
* This class represents the Reference Metadata available at a particular site in the genome. It can be
|
|
|
|
|
* used to conveniently look up the RODs at this site, as well as to get a list of all of the RODs.
|
|
|
|
|
*
|
|
|
|
|
* The standard interaction model is:
|
|
|
|
|
*
|
|
|
|
|
* Traversal system arrives at a site, which has a bunch of rods covering it
|
2010-02-07 00:26:06 +08:00
|
|
|
* Traversal calls tracker.bind(name, rod) for each rod in rods
|
2009-04-10 06:04:59 +08:00
|
|
|
* Traversal passes tracker to the walker
|
|
|
|
|
* walker calls lookup(name, default) to obtain the rod values at this site, or default if none was
|
|
|
|
|
* bound at this site.
|
|
|
|
|
*
|
2009-04-04 03:54:54 +08:00
|
|
|
* User: mdepristo
|
|
|
|
|
* Date: Apr 3, 2009
|
|
|
|
|
* Time: 3:05:23 PM
|
|
|
|
|
*/
|
|
|
|
|
public class RefMetaDataTracker {
|
2009-09-22 00:55:22 +08:00
|
|
|
// Track bindings for the current site. Keys are the canonical (lower-cased) track names
// produced by canonicalName(); bind() is the only writer in this class. Several readers
// explicitly tolerate null values being stored here.
final HashMap<String, RODRecordList<ReferenceOrderedDatum>> map = new HashMap<String, RODRecordList<ReferenceOrderedDatum>>();
|
2009-04-04 03:54:54 +08:00
|
|
|
// Class-wide log4j logger. No active statement in this class logs through it; being
// protected and static, it is also visible to subclasses and to classes in the same package.
protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class);
|
|
|
|
|
|
|
|
|
|
/**
|
2009-09-22 00:55:22 +08:00
|
|
|
* Finds the reference meta data named name, if it exists, otherwise returns the defaultValue.
|
|
|
|
|
* This is a legacy method that works with "singleton" tracks, in which a single ROD record can be associated
|
|
|
|
|
* with any given site. If track provides multiple records associated with a site, this method will return
|
|
|
|
|
* the first one.
|
2009-04-04 03:54:54 +08:00
|
|
|
* @param name
|
|
|
|
|
* @param defaultValue
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2009-09-22 00:55:22 +08:00
|
|
|
@Deprecated
|
2009-04-04 03:54:54 +08:00
|
|
|
public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) {
|
2009-09-22 00:55:22 +08:00
|
|
|
//logger.debug(String.format("Lookup %s%n", name));
|
|
|
|
|
final String luName = canonicalName(name);
|
|
|
|
|
if ( map.containsKey(luName) ) {
|
|
|
|
|
RODRecordList<ReferenceOrderedDatum> value = map.get(luName) ;
|
|
|
|
|
if ( value != null ) {
|
|
|
|
|
List<ReferenceOrderedDatum> l = value.getRecords();
|
|
|
|
|
if ( l != null & l.size() > 0 ) return value.getRecords().get(0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return defaultValue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Finds the reference metadata track named 'name' and returns all ROD records from that track associated
|
|
|
|
|
* with the current site as a RODRecordList collection object. If no data track with specified name is available,
|
|
|
|
|
* returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up
|
|
|
|
|
* with track name set to 'name' and location set to null; otherwise the wrapper object will have name and
|
|
|
|
|
* location set to defaultValue.getName() and defaultValue.getLocation(), respectively (use caution,
|
|
|
|
|
* defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise:
|
|
|
|
|
* for instance, on locus traversal, location is usually expected to be a single base we are currently looking at,
|
|
|
|
|
* regardless of the presence of "extended" RODs overlapping with that location).
|
|
|
|
|
* @param name
|
|
|
|
|
* @param defaultValue
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2010-01-20 05:33:13 +08:00
|
|
|
public RODRecordList<ReferenceOrderedDatum> getTrackData(final String name, ReferenceOrderedDatum defaultValue, boolean requireExactMatch) {
|
2009-04-04 03:54:54 +08:00
|
|
|
//logger.debug(String.format("Lookup %s%n", name));
|
2010-01-20 05:33:13 +08:00
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
final String luName = canonicalName(name);
|
2010-01-20 05:33:13 +08:00
|
|
|
RODRecordList<ReferenceOrderedDatum> trackData = null;
|
2009-04-04 03:54:54 +08:00
|
|
|
|
2010-01-20 05:33:13 +08:00
|
|
|
if ( requireExactMatch ) {
|
|
|
|
|
if ( map.containsKey(luName) )
|
|
|
|
|
trackData = map.get(luName);
|
|
|
|
|
} else {
|
|
|
|
|
for ( Map.Entry<String, RODRecordList<ReferenceOrderedDatum>> datum : map.entrySet() ) {
|
|
|
|
|
final String rodName = datum.getKey();
|
|
|
|
|
if ( rodName.startsWith(luName) ) {
|
|
|
|
|
if ( trackData == null ) trackData = new RODRecordList<ReferenceOrderedDatum>(name);
|
|
|
|
|
//System.out.printf("Adding bindings from %s to %s at %s%n", rodName, name, datum.getValue().getLocation());
|
|
|
|
|
trackData.add(datum.getValue(), true);
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-09-22 00:55:22 +08:00
|
|
|
}
|
2010-01-20 05:33:13 +08:00
|
|
|
|
|
|
|
|
if ( trackData != null )
|
|
|
|
|
return trackData;
|
|
|
|
|
else if ( defaultValue == null )
|
|
|
|
|
return null;
|
|
|
|
|
else
|
|
|
|
|
return new RODRecordList<ReferenceOrderedDatum>(defaultValue.getName(),
|
|
|
|
|
Collections.singletonList(defaultValue),
|
|
|
|
|
defaultValue.getLocation());
|
2009-09-22 00:55:22 +08:00
|
|
|
}
|
2010-01-20 05:33:13 +08:00
|
|
|
|
|
|
|
|
public RODRecordList<ReferenceOrderedDatum> getTrackData(final String name, ReferenceOrderedDatum defaultValue) {
|
|
|
|
|
return getTrackData(name, defaultValue, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
/**
|
|
|
|
|
* @see this.lookup
|
|
|
|
|
* @param name
|
|
|
|
|
* @param defaultValue
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2009-09-22 00:55:22 +08:00
|
|
|
@Deprecated
|
2009-04-04 03:54:54 +08:00
|
|
|
public Object lookup(final String name, Object defaultValue) {
|
2009-04-10 06:04:59 +08:00
|
|
|
final String luName = canonicalName(name);
|
|
|
|
|
if ( map.containsKey(luName) )
|
|
|
|
|
return map.get(luName);
|
2009-04-04 03:54:54 +08:00
|
|
|
else
|
|
|
|
|
return defaultValue;
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the canonical name of the rod name
|
|
|
|
|
* @param name
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
private final String canonicalName(final String name)
|
|
|
|
|
{
|
2009-05-28 06:02:24 +08:00
|
|
|
//return name; // .toLowerCase();
|
2009-04-10 06:04:59 +08:00
|
|
|
return name.toLowerCase();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2009-09-22 00:55:22 +08:00
|
|
|
* Is there a binding at this site to a ROD/track with the specified name?
|
2009-04-10 06:04:59 +08:00
|
|
|
*
|
2009-07-21 08:55:52 +08:00
|
|
|
* @param name the name of the rod
|
|
|
|
|
* @return true if it has the rod
|
2009-04-10 06:04:59 +08:00
|
|
|
*/
|
2009-07-21 08:55:52 +08:00
|
|
|
public boolean hasROD(final String name) {
|
2009-04-10 06:04:59 +08:00
|
|
|
return map.containsKey(canonicalName(name));
|
2009-04-04 03:54:54 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
/**
|
2009-09-22 00:55:22 +08:00
|
|
|
* Get all of the RODs at the current site. The collection is "flattened": for any track that has multiple records
|
|
|
|
|
* at the current site, they all will be added to the list as separate elements.
|
2009-04-10 06:04:59 +08:00
|
|
|
*
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2009-04-04 03:54:54 +08:00
|
|
|
public Collection<ReferenceOrderedDatum> getAllRods() {
|
2009-09-22 00:55:22 +08:00
|
|
|
List<ReferenceOrderedDatum> l = new ArrayList<ReferenceOrderedDatum>();
|
|
|
|
|
for ( RODRecordList<ReferenceOrderedDatum> rl : map.values() ) {
|
|
|
|
|
if ( rl == null ) continue; // how do we get null value stored for a track? shouldn't the track be missing from the map alltogether?
|
|
|
|
|
l.addAll(rl.getRecords());
|
|
|
|
|
}
|
|
|
|
|
return l;
|
|
|
|
|
|
2009-04-04 03:54:54 +08:00
|
|
|
}
|
|
|
|
|
|
2009-09-13 03:07:57 +08:00
|
|
|
/**
|
2009-09-22 00:55:22 +08:00
|
|
|
* Get all of the ROD tracks at the current site. Each track is returned as a single compound
|
|
|
|
|
* object (RODRecordList) that may contain multiple ROD records associated with the current site.
|
2009-09-13 03:07:57 +08:00
|
|
|
*
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2009-09-22 00:55:22 +08:00
|
|
|
public Collection<RODRecordList<ReferenceOrderedDatum>> getBoundRodTracks() {
|
|
|
|
|
LinkedList<RODRecordList<ReferenceOrderedDatum>> bound = new LinkedList<RODRecordList<ReferenceOrderedDatum>>();
|
2009-09-13 03:07:57 +08:00
|
|
|
|
2009-09-22 00:55:22 +08:00
|
|
|
for ( RODRecordList<ReferenceOrderedDatum> value : map.values() ) {
|
|
|
|
|
if ( value != null && value.size() != 0 ) bound.add(value);
|
2009-09-13 03:07:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return bound;
|
|
|
|
|
}
|
|
|
|
|
|
2009-10-07 07:40:30 +08:00
|
|
|
public int getNBoundRodTracks() {
|
|
|
|
|
return getNBoundRodTracks(null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int getNBoundRodTracks(final String excludeIn ) {
|
|
|
|
|
final String exclude = excludeIn == null ? null : canonicalName(excludeIn);
|
|
|
|
|
|
|
|
|
|
int n = 0;
|
|
|
|
|
for ( RODRecordList<ReferenceOrderedDatum> value : map.values() ) {
|
|
|
|
|
if ( value != null && ! value.isEmpty() ) {
|
|
|
|
|
if ( exclude == null || ! value.getName().equals(exclude) )
|
|
|
|
|
n++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return n;
|
|
|
|
|
}
|
|
|
|
|
|
2009-09-22 00:55:22 +08:00
|
|
|
public Collection<ReferenceOrderedDatum> getBoundRodRecords() {
|
|
|
|
|
LinkedList<ReferenceOrderedDatum> bound = new LinkedList<ReferenceOrderedDatum>();
|
|
|
|
|
|
|
|
|
|
for ( RODRecordList<ReferenceOrderedDatum> valueList : map.values() ) {
|
|
|
|
|
for ( ReferenceOrderedDatum value : valueList ) {
|
|
|
|
|
if ( value != null )
|
|
|
|
|
bound.add(value);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return bound;
|
|
|
|
|
}
|
2010-02-05 23:42:54 +08:00
|
|
|
|
|
|
|
|
|
2010-02-07 00:26:06 +08:00
|
|
|
/**
|
|
|
|
|
* Converts all possible ROD tracks to VariantContexts objects, of all types, allowing any start and any number
|
|
|
|
|
* of entries per ROD.
|
|
|
|
|
*/
|
|
|
|
|
public Collection<VariantContext> getAllVariantContexts() {
|
|
|
|
|
return getAllVariantContexts(null, null, false, false);
|
2010-02-05 23:42:54 +08:00
|
|
|
}
|
|
|
|
|
|
2010-02-07 00:26:06 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Converts all possible ROD tracks to VariantContexts objects. If allowedTypes != null, then only
|
|
|
|
|
* VariantContexts in the allow set of types will be returned. If requireStartsHere is true, then curLocation
|
|
|
|
|
* must not be null, and only records whose start position is == to curLocation.getStart() will be returned.
|
|
|
|
|
* If takeFirstOnly is true, then only a single VariantContext will be converted from any individual ROD. Of course,
|
|
|
|
|
* this single object must pass the allowed types and start here options if provided. Note that the result
|
|
|
|
|
* may return multiple VariantContexts with the same name if that particular track contained multiple RODs spanning
|
|
|
|
|
* the current location.
|
|
|
|
|
*
|
|
|
|
|
* The name of each VariantContext corresponds to the ROD name.
|
|
|
|
|
*
|
|
|
|
|
* @param curLocation
|
|
|
|
|
* @param allowedTypes
|
|
|
|
|
* @param requireStartHere
|
|
|
|
|
* @param takeFirstOnly
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public Collection<VariantContext> getAllVariantContexts(EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
2010-02-05 23:42:54 +08:00
|
|
|
List<VariantContext> contexts = new ArrayList<VariantContext>();
|
|
|
|
|
|
|
|
|
|
for ( RODRecordList<ReferenceOrderedDatum> rodList : getBoundRodTracks() ) {
|
2010-02-07 00:26:06 +08:00
|
|
|
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly);
|
2010-02-05 23:42:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return contexts;
|
|
|
|
|
}
|
|
|
|
|
|
2010-02-07 00:26:06 +08:00
|
|
|
/**
|
|
|
|
|
* Gets the variant contexts associated with track name name
|
|
|
|
|
*
|
|
|
|
|
* see getVariantContexts for more information.
|
|
|
|
|
*
|
|
|
|
|
* @param name
|
|
|
|
|
* @param curLocation
|
|
|
|
|
* @param allowedTypes
|
|
|
|
|
* @param requireStartHere
|
|
|
|
|
* @param takeFirstOnly
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public Collection<VariantContext> getVariantContexts(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
2010-02-11 00:12:29 +08:00
|
|
|
return getVariantContexts(Arrays.asList(name), allowedTypes, curLocation, requireStartHere, takeFirstOnly);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Collection<VariantContext> getVariantContexts(Collection<String> names, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
2010-02-07 00:26:06 +08:00
|
|
|
Collection<VariantContext> contexts = new ArrayList<VariantContext>();
|
|
|
|
|
|
2010-02-11 00:12:29 +08:00
|
|
|
for ( String name : names ) {
|
|
|
|
|
RODRecordList<ReferenceOrderedDatum> rodList = getTrackData(name, null);
|
|
|
|
|
|
|
|
|
|
if ( rodList != null )
|
|
|
|
|
addVariantContexts(contexts, rodList, allowedTypes, curLocation, requireStartHere, takeFirstOnly );
|
|
|
|
|
}
|
2010-02-07 00:26:06 +08:00
|
|
|
|
|
|
|
|
return contexts;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Gets the variant context associated with name, and assumes the system only has a single bound track at this location. Throws an exception if not.
|
|
|
|
|
* see getVariantContexts for more information.
|
|
|
|
|
*
|
|
|
|
|
* @param name
|
|
|
|
|
* @param curLocation
|
|
|
|
|
* @param allowedTypes
|
|
|
|
|
* @param requireStartHere
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public VariantContext getVariantContext(String name, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere ) {
|
|
|
|
|
Collection<VariantContext> contexts = getVariantContexts(name, allowedTypes, curLocation, requireStartHere, false );
|
|
|
|
|
|
|
|
|
|
if ( contexts.size() > 1 )
|
|
|
|
|
throw new StingException("Requested a single VariantContext object for track " + name + " but multiple variants were present at position " + curLocation);
|
2010-02-11 00:12:29 +08:00
|
|
|
else if ( contexts.size() == 0 )
|
|
|
|
|
return null;
|
|
|
|
|
else
|
|
|
|
|
return contexts.iterator().next();
|
2010-02-07 00:26:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void addVariantContexts(Collection<VariantContext> contexts, RODRecordList<ReferenceOrderedDatum> rodList, EnumSet<VariantContext.Type> allowedTypes, GenomeLoc curLocation, boolean requireStartHere, boolean takeFirstOnly ) {
|
|
|
|
|
for ( ReferenceOrderedDatum rec : rodList.getRecords() ) {
|
|
|
|
|
if ( VariantContextAdaptors.canBeConvertedToVariantContext(rec) ) {
|
|
|
|
|
// ok, we might actually be able to turn this record in a variant context
|
|
|
|
|
VariantContext vc = VariantContextAdaptors.toVariantContext(rodList.getName(), rec);
|
|
|
|
|
|
|
|
|
|
if ( vc == null ) // sometimes the track has odd stuff in it that can't be converted
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
// now, let's decide if we want to keep it
|
|
|
|
|
boolean goodType = allowedTypes == null || allowedTypes.contains(vc.getType());
|
|
|
|
|
boolean goodPos = ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart();
|
|
|
|
|
|
|
|
|
|
if ( goodType && goodPos ) { // ok, we are going to keep this thing
|
|
|
|
|
contexts.add(vc);
|
|
|
|
|
|
|
|
|
|
if ( takeFirstOnly )
|
|
|
|
|
// we only want the first passing instance, so break the loop over records in rodList
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2010-02-05 23:42:54 +08:00
|
|
|
|
|
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
/**
|
2009-09-22 00:55:22 +08:00
|
|
|
* Binds the list of reference ordered data records (RODs) to track name at this site. Should be used only by the traversal
|
2009-04-10 06:04:59 +08:00
|
|
|
* system to provide access to RODs in a structured way to the walkers.
|
|
|
|
|
*
|
|
|
|
|
* @param name
|
|
|
|
|
* @param rod
|
|
|
|
|
*/
|
2009-09-22 00:55:22 +08:00
|
|
|
public void bind(final String name, RODRecordList<ReferenceOrderedDatum> rod) {
|
|
|
|
|
//logger.debug(String.format("Binding %s to %s", name, rod));
|
|
|
|
|
map.put(canonicalName(name), rod);
|
|
|
|
|
}
|
2009-04-04 03:54:54 +08:00
|
|
|
}
|