2009-03-16 06:37:20 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileWriter;
|
|
|
|
|
import java.io.IOException;
|
2009-04-02 06:54:38 +08:00
|
|
|
import java.io.FileNotFoundException;
|
2009-04-10 06:04:59 +08:00
|
|
|
import java.util.*;
|
2009-04-04 00:41:33 +08:00
|
|
|
import java.lang.reflect.Constructor;
|
|
|
|
|
import java.lang.reflect.InvocationTargetException;
|
2009-05-21 23:23:22 +08:00
|
|
|
import java.lang.reflect.Method;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2009-04-02 06:54:38 +08:00
|
|
|
import org.broadinstitute.sting.utils.xReadLines;
|
|
|
|
|
import org.broadinstitute.sting.utils.Utils;
|
2009-04-10 06:04:59 +08:00
|
|
|
import org.apache.log4j.Logger;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
/**
 * Class for representing arbitrary reference-ordered data sets.
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 * To change this template use File | Settings | File Templates.
 */
|
|
|
|
|
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<ROD> {
|
2009-04-04 00:41:33 +08:00
|
|
|
private String name;
|
2009-03-16 06:37:20 +08:00
|
|
|
private File file = null;
|
2009-05-15 05:06:28 +08:00
|
|
|
private String fieldDelimiter;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Header object returned from the datum
|
|
|
|
|
*/
|
|
|
|
|
private Object header = null;
|
|
|
|
|
|
2009-03-16 06:37:20 +08:00
|
|
|
private Class<ROD> type = null; // runtime type information for object construction
|
|
|
|
|
|
2009-04-10 06:04:59 +08:00
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Static ROD type management
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public static class RODBinding {
|
|
|
|
|
public final String name;
|
|
|
|
|
public final Class<? extends ReferenceOrderedDatum> type;
|
|
|
|
|
public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
|
|
|
|
|
this.name = name;
|
|
|
|
|
this.type = type;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Registry of the known ROD types. Keys are the lower-cased module names (see addModule);
 * values carry the original name together with the datum class.
 */
public static HashMap<String, RODBinding> Types = new HashMap<String, RODBinding>();
|
|
|
|
|
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
|
|
|
|
|
System.out.printf("* Adding rod class %s%n", name);
|
|
|
|
|
Types.put(name.toLowerCase(), new RODBinding(name, rodType));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static {
    // Register every built-in ROD type. Each call prints one start-up line,
    // so the order below is also the order of those messages.
    addModule("GFF",                     rodGFF.class);
    addModule("dbSNP",                   rodDbSNP.class);
    addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
    addModule("SAMPileup",               rodSAMPileup.class);
    addModule("Table",                   TabularROD.class);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Parse the ROD bindings. These are of the form of a single list of strings, each triplet of the
|
2009-05-08 23:28:19 +08:00
|
|
|
* form <name>,<type>,<file>. After this function, the List of RODs contains new RODs bound to each of
|
2009-04-10 06:04:59 +08:00
|
|
|
* name, of type, ready to read from the file. This function does check for the strings to be well formed
|
|
|
|
|
* and such.
|
|
|
|
|
*
|
|
|
|
|
* @param logger
|
|
|
|
|
* @param bindings
|
|
|
|
|
* @param rods
|
|
|
|
|
*/
|
|
|
|
|
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
|
|
|
|
|
{
|
|
|
|
|
// Loop over triplets
|
2009-05-08 23:28:19 +08:00
|
|
|
for( String binding: bindings ) {
|
|
|
|
|
String[] bindingTokens = binding.split(",");
|
|
|
|
|
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));
|
|
|
|
|
if( bindingTokens.length != 3 )
|
|
|
|
|
Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings)));
|
|
|
|
|
|
|
|
|
|
final String name = bindingTokens[0];
|
|
|
|
|
final String typeName = bindingTokens[1];
|
|
|
|
|
final String fileName = bindingTokens[2];
|
2009-04-10 06:04:59 +08:00
|
|
|
|
|
|
|
|
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
|
|
|
|
|
|
|
|
|
|
// check that we're not generating duplicate bindings
|
|
|
|
|
for ( ReferenceOrderedData rod2 : rods )
|
|
|
|
|
if ( rod2.getName().equals(rod.getName()) )
|
|
|
|
|
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
|
|
|
|
|
|
2009-05-08 23:28:19 +08:00
|
|
|
rods.add(rod);
|
2009-04-10 06:04:59 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Helpful function that parses a single triplet of <name> <type> <file> and returns the corresponding ROD with
|
|
|
|
|
* <name>, of type <type> that reads its input from <file>.
|
|
|
|
|
*
|
|
|
|
|
* @param logger
|
|
|
|
|
* @param trackName
|
|
|
|
|
* @param typeName
|
|
|
|
|
* @param fileName
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
private static ReferenceOrderedData<?> parse1Binding( Logger logger, final String trackName, final String typeName, final String fileName )
|
|
|
|
|
{
|
|
|
|
|
// Gracefully fail if we don't have the type
|
|
|
|
|
if ( ReferenceOrderedData.Types.get(typeName.toLowerCase()) == null )
|
|
|
|
|
Utils.scareUser(String.format("Unknown ROD type: %s", typeName));
|
|
|
|
|
|
|
|
|
|
// Lookup the type
|
|
|
|
|
Class rodClass = ReferenceOrderedData.Types.get(typeName.toLowerCase()).type;
|
|
|
|
|
|
|
|
|
|
// Create the ROD
|
|
|
|
|
ReferenceOrderedData<?> rod = new ReferenceOrderedData<ReferenceOrderedDatum>(trackName.toLowerCase(), new File(fileName), rodClass );
|
|
|
|
|
logger.info(String.format("Created binding from %s to %s of type %s", trackName.toLowerCase(), fileName, rodClass));
|
|
|
|
|
return rod;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Constructors
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
2009-04-04 00:41:33 +08:00
|
|
|
public ReferenceOrderedData(final String name, File file, Class<ROD> type ) {
|
2009-03-16 06:37:20 +08:00
|
|
|
this.file = file;
|
|
|
|
|
this.type = type;
|
2009-04-04 00:41:33 +08:00
|
|
|
this.name = name;
|
2009-05-15 05:06:28 +08:00
|
|
|
this.header = initializeROD(name, file, type);
|
2009-05-15 07:20:11 +08:00
|
|
|
this.fieldDelimiter = newROD(name, type).delimiterRegex();
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-04 03:54:54 +08:00
|
|
|
/**
 * @return the name this data set was constructed with
 */
public String getName() {
    return name;
}
|
|
|
|
|
|
2009-05-20 07:26:17 +08:00
|
|
|
/**
|
|
|
|
|
* Special equals override to see if this ROD is of type name, type.
|
|
|
|
|
* Implemented to preserve data hiding whenever possible.
|
|
|
|
|
* @param name Name to check.
|
|
|
|
|
* @param type Type to check.
|
|
|
|
|
* @return True if these parameters imply this rod. False otherwise.
|
|
|
|
|
*/
|
|
|
|
|
public boolean matches( String name, Class<? extends ReferenceOrderedDatum> type ) {
|
|
|
|
|
return this.name.equals(name) && this.type.equals(type);
|
|
|
|
|
}
|
|
|
|
|
|
2009-03-16 06:37:20 +08:00
|
|
|
public RODIterator iterator() {
|
2009-05-21 23:23:22 +08:00
|
|
|
Iterator<ROD> it;
|
|
|
|
|
try {
|
|
|
|
|
Method m = type.getDeclaredMethod("createIterator", String.class,java.io.File.class);
|
|
|
|
|
it = (Iterator<ROD>) m.invoke(null, name, file);
|
|
|
|
|
} catch ( java.lang.NoSuchMethodException e ) {
|
|
|
|
|
it = new SimpleRODIterator();
|
|
|
|
|
} catch ( java.lang.NullPointerException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
} catch ( java.lang.SecurityException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
} catch ( java.lang.IllegalAccessException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
} catch ( java.lang.IllegalArgumentException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
} catch ( java.lang.reflect.InvocationTargetException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
}
|
|
|
|
|
return new RODIterator(it);
|
|
|
|
|
}
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Testing
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public void testMe() {
|
|
|
|
|
for ( ReferenceOrderedDatum rec : this ) {
|
2009-04-02 06:54:38 +08:00
|
|
|
System.out.println(rec.toString());
|
|
|
|
|
|
|
|
|
|
rodGFF gff = (rodGFF)rec;
|
|
|
|
|
String[] keys = {"LENGTH", "ALT", "FOBARBAR"};
|
|
|
|
|
for ( String key : keys) {
|
|
|
|
|
System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none");
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
System.exit(1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Manipulations of all of the data
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public ArrayList<ReferenceOrderedDatum> readAll() {
|
|
|
|
|
ArrayList<ReferenceOrderedDatum> elts = new ArrayList<ReferenceOrderedDatum>();
|
|
|
|
|
for ( ReferenceOrderedDatum rec : this ) {
|
|
|
|
|
elts.add(rec);
|
|
|
|
|
}
|
|
|
|
|
elts.trimToSize();
|
|
|
|
|
return elts;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
|
|
|
|
|
Collections.sort(data);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
|
|
|
|
|
final FileWriter out = new FileWriter(output);
|
|
|
|
|
|
|
|
|
|
for ( ReferenceOrderedDatum rec : data ) {
|
|
|
|
|
out.write(rec.repl() + "\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out.close();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean validateFile() throws Exception {
|
|
|
|
|
ReferenceOrderedDatum last = null;
|
|
|
|
|
for ( ReferenceOrderedDatum rec : this ) {
|
|
|
|
|
if ( last != null && last.compareTo(rec) == 1 ) {
|
|
|
|
|
// It's out of order
|
|
|
|
|
throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
|
|
|
|
|
}
|
|
|
|
|
last = rec;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void indexFile() {
|
|
|
|
|
// Fixme -- get access to the linear index system from Jim
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Iteration
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
private class SimpleRODIterator implements Iterator<ROD> {
|
2009-04-02 06:54:38 +08:00
|
|
|
private xReadLines parser = null;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
public SimpleRODIterator() {
|
2009-04-02 06:54:38 +08:00
|
|
|
try {
|
|
|
|
|
parser = new xReadLines(file);
|
|
|
|
|
} catch ( FileNotFoundException e ) {
|
|
|
|
|
Utils.scareUser("Couldn't open file: " + file);
|
|
|
|
|
}
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean hasNext() {
|
2009-04-02 06:54:38 +08:00
|
|
|
//System.out.printf("Parser has next: %b%n", parser.hasNext());
|
2009-03-16 06:37:20 +08:00
|
|
|
return parser.hasNext();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public ROD next() {
|
2009-04-02 06:54:38 +08:00
|
|
|
final String line = parser.next();
|
|
|
|
|
//System.out.printf("Line is %s%n", line);
|
2009-05-15 05:06:28 +08:00
|
|
|
String parts[] = line.split(fieldDelimiter);
|
|
|
|
|
ROD n = parseLine(parts);
|
|
|
|
|
return n != null ? n : next();
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void remove() {
|
|
|
|
|
throw new UnsupportedOperationException();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-02 06:54:38 +08:00
|
|
|
// private class SimpleRODIterator implements Iterator<ROD> {
|
|
|
|
|
// //private WhitespaceTextFileParser parser = null;
|
|
|
|
|
// private TabbedTextFileParser parser = null;
|
|
|
|
|
//
|
|
|
|
|
// public SimpleRODIterator() {
|
|
|
|
|
// parser = new TabbedTextFileParser(true, file);
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// public boolean hasNext() {
|
|
|
|
|
// return parser.hasNext();
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// public ROD next() {
|
|
|
|
|
// String parts[] = parser.next();
|
|
|
|
|
// return parseLine(parts);
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// public void remove() {
|
|
|
|
|
// throw new UnsupportedOperationException();
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
|
2009-03-16 06:37:20 +08:00
|
|
|
public class RODIterator implements Iterator<ROD> {
|
|
|
|
|
private PushbackIterator<ROD> it;
|
|
|
|
|
private ROD prev = null;
|
|
|
|
|
|
2009-05-21 23:23:22 +08:00
|
|
|
public RODIterator(Iterator<ROD> it) {
|
2009-03-16 06:37:20 +08:00
|
|
|
this.it = new PushbackIterator<ROD>(it);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean hasNext() { return it.hasNext(); }
|
|
|
|
|
public ROD next() {
|
|
|
|
|
prev = it.next();
|
|
|
|
|
return prev;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-22 04:09:32 +08:00
|
|
|
/**
|
|
|
|
|
* Returns the current position of this iterator.
|
|
|
|
|
* @return Current position of the iterator, or null if no position exists.
|
|
|
|
|
*/
|
|
|
|
|
public GenomeLoc position() {
|
|
|
|
|
if( prev != null )
|
|
|
|
|
return prev.getLocation();
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-15 05:06:28 +08:00
|
|
|
/**
|
|
|
|
|
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
|
|
|
|
* If we don't find anything and cross beyond contig / pos, we return null
|
|
|
|
|
* Otherwise we return the first object who's start is at pos
|
|
|
|
|
*
|
|
|
|
|
* @param loc
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
2009-03-16 06:37:20 +08:00
|
|
|
public ROD seekForward(final GenomeLoc loc) {
|
2009-05-15 05:06:28 +08:00
|
|
|
final boolean DEBUG = false;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
ROD result = null;
|
|
|
|
|
|
2009-05-15 05:06:28 +08:00
|
|
|
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", loc.getContig(), loc.getStart(), prev);
|
2009-03-16 06:37:20 +08:00
|
|
|
while ( hasNext() ) {
|
|
|
|
|
ROD current = next();
|
|
|
|
|
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
2009-05-15 05:06:28 +08:00
|
|
|
int cmp = current.getLocation().compareTo(loc);
|
|
|
|
|
if ( cmp < 0 ) {
|
|
|
|
|
// current occurs before loc, continue searching
|
|
|
|
|
continue;
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
2009-05-15 05:06:28 +08:00
|
|
|
else if ( cmp == 0 ) {
|
|
|
|
|
result = current;
|
|
|
|
|
break;
|
|
|
|
|
} else {
|
|
|
|
|
// current is after loc
|
2009-03-16 06:37:20 +08:00
|
|
|
it.pushback(current);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( DEBUG ) {
|
2009-05-15 05:06:28 +08:00
|
|
|
if ( result != null )
|
|
|
|
|
System.out.printf(" ### Found %s%n", result.getLocation());
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// we ran out of elements or found something
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void remove() {
|
|
|
|
|
throw new UnsupportedOperationException();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Parsing
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
2009-05-15 05:06:28 +08:00
|
|
|
private ROD newROD( final String name, final Class<ROD> type ) {
|
2009-03-16 06:37:20 +08:00
|
|
|
try {
|
2009-04-05 05:55:02 +08:00
|
|
|
Constructor<ROD> c = type.getConstructor(String.class);
|
2009-05-15 05:06:28 +08:00
|
|
|
return (ROD)c.newInstance(name);
|
2009-03-16 06:37:20 +08:00
|
|
|
} catch ( java.lang.InstantiationException e ) {
|
2009-05-15 05:06:28 +08:00
|
|
|
throw new RuntimeException(e);
|
2009-03-16 06:37:20 +08:00
|
|
|
} catch ( java.lang.IllegalAccessException e ) {
|
2009-05-15 05:06:28 +08:00
|
|
|
throw new RuntimeException(e);
|
2009-04-04 00:41:33 +08:00
|
|
|
} catch ( java.lang.NoSuchMethodException e ) {
|
2009-05-15 05:06:28 +08:00
|
|
|
throw new RuntimeException(e);
|
2009-04-04 00:41:33 +08:00
|
|
|
} catch ( InvocationTargetException e ) {
|
2009-05-15 05:06:28 +08:00
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private Object initializeROD(final String name, final File file, final Class<ROD> type) {
|
|
|
|
|
ROD rod = newROD(name, type);
|
|
|
|
|
try {
|
|
|
|
|
return rod.initialize(file);
|
|
|
|
|
} catch ( FileNotFoundException e ) {
|
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private ROD parseLine(final String[] parts) {
|
|
|
|
|
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
|
|
|
|
ROD obj = newROD(name, type);
|
|
|
|
|
try {
|
|
|
|
|
if ( ! obj.parseLine(header, parts) )
|
|
|
|
|
obj = null;
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
throw new RuntimeException("Badly formed ROD: " + e);
|
2009-04-04 00:41:33 +08:00
|
|
|
}
|
2009-05-15 05:06:28 +08:00
|
|
|
return obj;
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
}
|