package org.broadinstitute.sting.gatk.refdata;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.Utils;
import org.apache.log4j.Logger;

/**
 * Class for representing arbitrary reference ordered data sets.
 *
 * A ReferenceOrderedData (ROD) track binds a name and a backing file to a concrete
 * {@link ReferenceOrderedDatum} implementation.  Records are created reflectively: one
 * instance of the datum class is built per line of the file and asked to parse that line.
 *
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 *
 * @param <ROD> the datum type produced when iterating over this track
 */
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<ROD> {
    private final String name;
    private final File file;
    private final String fieldDelimiter;

    /**
     * Header object returned from the datum
     */
    private final Object header;

    /** Runtime type information for object construction */
    private final Class<? extends ROD> type;

    // ----------------------------------------------------------------------
    //
    // Static ROD type management
    //
    // ----------------------------------------------------------------------

    /**
     * Associates the user-visible name of a ROD type with the class implementing it.
     */
    public static class RODBinding {
        public final String name;
        public final Class<? extends ReferenceOrderedDatum> type;

        public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
            this.name = name;
            this.type = type;
        }
    }

    /** Registry of known ROD types, keyed by the lower-cased type name. */
    public static HashMap<String, RODBinding> Types = new HashMap<String, RODBinding>();

    /**
     * Registers a ROD type under the given name and prints a notice to stdout.
     * The registry key is the lower-cased name; the binding keeps the name as given.
     *
     * @param name    name of the ROD type
     * @param rodType class implementing the ROD type
     */
    public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
        System.out.printf("* Adding rod class %s%n", name);
        Types.put(name.toLowerCase(), new RODBinding(name, rodType));
    }

    static {
        // All known ROD types
        addModule("GFF", rodGFF.class);
        addModule("dbSNP", rodDbSNP.class);
        addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
        addModule("SAMPileup", rodSAMPileup.class);
        addModule("Table", TabularROD.class);
    }

    /**
     * Parse the ROD bindings.  These are of the form of a single list of strings, each a triplet of the
     * form {@code <name>,<type>,<file>}.  After this function, the List of RODs contains new RODs bound
     * to each name, of the given type, ready to read from the file.  Malformed triplets and duplicate
     * track names are reported through {@code Utils.scareUser}.
     *
     * @param logger   logger receiving progress messages
     * @param bindings the binding strings to parse
     * @param rods     list the newly created RODs are appended to
     */
    public static void parseBindings(Logger logger,
                                     ArrayList<String> bindings,
                                     List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        // Loop over triplets
        for ( String binding : bindings ) {
            String[] bindingTokens = binding.split(",");
            logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));
            if ( bindingTokens.length != 3 )
                Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings)));

            final String name = bindingTokens[0];
            final String typeName = bindingTokens[1];
            final String fileName = bindingTokens[2];

            ReferenceOrderedData<? extends ReferenceOrderedDatum> rod = parse1Binding(logger, name, typeName, fileName);

            // check that we're not generating duplicate bindings
            for ( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod2 : rods )
                if ( rod2.getName().equals(rod.getName()) )
                    Utils.scareUser(String.format("Found duplicate rod bindings %s", rod.getName()));

            rods.add(rod);
        }
    }

    /**
     * Helpful function that parses a single triplet of {@code <name>,<type>,<file>} and returns the
     * corresponding ROD with the lower-cased track name, of the registered type, that reads its input
     * from the file.
     *
     * @param logger    logger receiving the "created binding" message
     * @param trackName name of the track; stored lower-cased
     * @param typeName  ROD type name, looked up case-insensitively in {@link #Types}
     * @param fileName  path of the file backing the track
     * @return the newly constructed ROD
     */
    private static ReferenceOrderedData<? extends ReferenceOrderedDatum> parse1Binding(Logger logger,
                                                                                        final String trackName,
                                                                                        final String typeName,
                                                                                        final String fileName) {
        // Gracefully fail if we don't have the type
        final RODBinding binding = Types.get(typeName.toLowerCase());
        if ( binding == null )
            Utils.scareUser(String.format("Unknown ROD type: %s", typeName));

        // Lookup the type
        final Class<? extends ReferenceOrderedDatum> rodClass = binding.type;
        final String track = trackName.toLowerCase();

        // Create the ROD
        ReferenceOrderedData<? extends ReferenceOrderedDatum> rod =
                new ReferenceOrderedData<ReferenceOrderedDatum>(track, new File(fileName), rodClass);
        logger.info(String.format("Created binding from %s to %s of type %s", track, fileName, rodClass));
        return rod;
    }

    // ----------------------------------------------------------------------
    //
    // Constructors
    //
    // ----------------------------------------------------------------------

    /**
     * Creates a track and eagerly initializes it: one datum instance is asked to
     * {@code initialize} from the file (its result is kept as the header) and another
     * supplies the field delimiter regex used to split lines.
     *
     * @param name name of the track
     * @param file file backing the track
     * @param type datum class; must expose a public constructor taking a single String
     */
    public ReferenceOrderedData(final String name, File file, Class<? extends ROD> type) {
        this.file = file;
        this.type = type;
        this.name = name;
        this.header = initializeROD(name, file, type);
        this.fieldDelimiter = newROD(name, type).delimiterRegex();
    }

    public String getName() { return name; }

    /**
     * Special equals override to see if this ROD is of type name, type.
     * Implemented to preserve data hiding whenever possible.
     * @param name Name to check.
     * @param type Type to check.
     * @return True if these parameters imply this rod.  False otherwise.
     */
    public boolean matches(String name, Class<?> type) {
        return this.name.equals(name) && this.type.equals(type);
    }

    /**
     * Returns an iterator over the records of this track.  If the datum class declares a static
     * {@code createIterator(String, File)} method it is used to build the underlying iterator;
     * otherwise the file is read line by line.
     *
     * @return a new iterator positioned at the start of the data
     */
    public RODIterator iterator() {
        Iterator<ROD> it;
        try {
            Method m = type.getDeclaredMethod("createIterator", String.class, java.io.File.class);
            // The reflective call is untyped; the datum class is trusted to return an iterator of its own type.
            @SuppressWarnings("unchecked")
            final Iterator<ROD> custom = (Iterator<ROD>) m.invoke(null, name, file);
            it = custom;
        } catch ( java.lang.NoSuchMethodException e ) {
            // No custom iterator provided by the datum class: fall back to line-based parsing
            it = new SimpleRODIterator();
        } catch ( java.lang.NullPointerException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.SecurityException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.IllegalAccessException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.IllegalArgumentException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.reflect.InvocationTargetException e ) {
            throw new RuntimeException(e);
        }
        return new RODIterator(it);
    }

    // ----------------------------------------------------------------------
    //
    // Testing
    //
    // ----------------------------------------------------------------------

    /**
     * Ad-hoc debugging aid: prints every record and a few GFF attributes, then exits the JVM
     * with status 1.  Every record is cast to rodGFF, so this only works on GFF tracks.
     */
    public void testMe() {
        for ( ReferenceOrderedDatum rec : this ) {
            System.out.println(rec.toString());

            rodGFF gff = (rodGFF) rec;
            String[] keys = {"LENGTH", "ALT", "FOBARBAR"};
            for ( String key : keys ) {
                System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none");
            }
        }
        System.exit(1);
    }

    // ----------------------------------------------------------------------
    //
    // Manipulations of all of the data
    //
    // ----------------------------------------------------------------------

    /**
     * Reads every record of this track into memory.
     *
     * @return all records, in iteration order
     */
    public ArrayList<ReferenceOrderedDatum> readAll() {
        ArrayList<ReferenceOrderedDatum> elts = new ArrayList<ReferenceOrderedDatum>();
        for ( ReferenceOrderedDatum rec : this ) {
            elts.add(rec);
        }
        elts.trimToSize();
        return elts;
    }

    /**
     * Sorts the records in place by their natural ordering.
     *
     * @param data records to sort
     */
    public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
        Collections.sort(data);
    }

    /**
     * Writes one line per record, using each record's {@code repl()} text.
     *
     * @param data   records to write
     * @param output destination file
     * @throws IOException if the file cannot be opened or written
     */
    public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
        final FileWriter out = new FileWriter(output);
        try {
            for ( ReferenceOrderedDatum rec : data ) {
                out.write(rec.repl() + "\n");
            }
        } finally {
            // Close even when a write fails so the file handle is not leaked
            out.close();
        }
    }

    /**
     * Checks that the records of this track appear in non-decreasing order.
     *
     * @return true if every record compares less than or equal to its successor
     * @throws Exception naming the first pair of records found out of order
     */
    public boolean validateFile() throws Exception {
        ReferenceOrderedDatum last = null;
        for ( ReferenceOrderedDatum rec : this ) {
            // compareTo may return any positive value, not just 1, when last sorts after rec
            if ( last != null && last.compareTo(rec) > 0 ) {
                // It's out of order
                throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
            }
            last = rec;
        }
        return true;
    }

    public void indexFile() {
        // Fixme -- get access to the linear index system from Jim
    }

    // ----------------------------------------------------------------------
    //
    // Iteration
    //
    // ----------------------------------------------------------------------

    /**
     * Default iterator: reads the file line by line, splits each line on the field delimiter
     * and hands the fields to a fresh datum for parsing.  Lines the datum rejects are skipped.
     */
    private class SimpleRODIterator implements Iterator<ROD> {
        private xReadLines parser = null;

        public SimpleRODIterator() {
            try {
                parser = new xReadLines(file);
            } catch ( FileNotFoundException e ) {
                Utils.scareUser("Couldn't open file: " + file);
            }
        }

        public boolean hasNext() {
            return parser.hasNext();
        }

        public ROD next() {
            // Loop rather than recurse so a long run of rejected lines cannot exhaust the stack
            while ( true ) {
                final String line = parser.next();
                final ROD parsed = parseLine(line.split(fieldDelimiter));
                if ( parsed != null )
                    return parsed;
            }
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Iterator handed out by {@link #iterator()}.  Wraps the underlying record iterator in a
     * pushback iterator so that a forward seek can put back a record that lies past its target.
     */
    public class RODIterator implements Iterator<ROD> {
        private PushbackIterator<ROD> it;
        private ROD prev = null;

        public RODIterator(Iterator<ROD> it) {
            this.it = new PushbackIterator<ROD>(it);
        }

        public boolean hasNext() { return it.hasNext(); }

        public ROD next() {
            prev = it.next();
            return prev;
        }

        /**
         * Returns the current position of this iterator.
         * @return Current position of the iterator, or null if no position exists.
         */
        public GenomeLoc position() {
            if ( prev != null )
                return prev.getLocation();
            return null;
        }

        /**
         * Seeks forward in the file until we reach (or cross) a record at loc.
         * Records located before loc are consumed and discarded.  If a record past loc is
         * reached first, it is pushed back so the next call to next() returns it, and null
         * is returned.  Otherwise the first record whose location compares equal to loc is returned.
         *
         * @param loc the location to seek to
         * @return the first record at loc, or null if there is none
         */
        public ROD seekForward(final GenomeLoc loc) {
            final boolean DEBUG = false;

            ROD result = null;

            if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", loc.getContig(), loc.getStart(), prev);
            while ( hasNext() ) {
                ROD current = next();
                int cmp = current.getLocation().compareTo(loc);
                if ( cmp < 0 ) {
                    // current occurs before loc, continue searching
                    continue;
                }
                else if ( cmp == 0 ) {
                    result = current;
                    break;
                } else {
                    // current is after loc
                    it.pushback(current);
                    break;
                }
            }

            if ( DEBUG ) {
                if ( result != null )
                    System.out.printf(" ### Found %s%n", result.getLocation());
            }

            // we ran out of elements or found something
            return result;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    // ----------------------------------------------------------------------
    //
    // Parsing
    //
    // ----------------------------------------------------------------------

    /**
     * Reflectively instantiates a datum through its public single-String constructor.
     *
     * @param name value passed to the datum constructor
     * @param type datum class to instantiate
     * @return the new, not yet parsed, datum
     */
    private ROD newROD(final String name, final Class<? extends ROD> type) {
        try {
            Constructor<? extends ROD> c = type.getConstructor(String.class);
            return c.newInstance(name);
        } catch ( java.lang.InstantiationException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.IllegalAccessException e ) {
            throw new RuntimeException(e);
        } catch ( java.lang.NoSuchMethodException e ) {
            throw new RuntimeException(e);
        } catch ( InvocationTargetException e ) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds a throwaway datum and lets it initialize itself from the file.
     *
     * @return whatever the datum's initialize method returns; stored as this track's header
     */
    private Object initializeROD(final String name, final File file, final Class<? extends ROD> type) {
        ROD rod = newROD(name, type);
        try {
            return rod.initialize(file);
        } catch ( FileNotFoundException e ) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Parses the fields of one line into a fresh datum.
     *
     * @param parts the fields of the line
     * @return the parsed datum, or null if the datum's parseLine returned false
     */
    private ROD parseLine(final String[] parts) {
        ROD obj = newROD(name, type);
        try {
            if ( ! obj.parseLine(header, parts) )
                obj = null;
        } catch ( IOException e ) {
            // Keep the original exception as the cause so its stack trace is not lost
            throw new RuntimeException("Badly formed ROD: " + e, e);
        }
        return obj;
    }
}