package org.broadinstitute.sting.gatk.refdata;

import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.oneoffprojects.refdata.HapmapVCFROD;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;

import java.io.*;
import java.lang.reflect.Method;
import java.util.*;

/**
 * Class for representing arbitrary reference ordered data sets.
 * <p/>
 * An instance binds a track name to a file and to the datum class used to read that file.
 * The class also keeps a static registry ({@link #Types}) mapping lower-cased type names to
 * the datum classes that implement them, and can turn command-line style
 * <code>&lt;name&gt;,&lt;type&gt;,&lt;file&gt;</code> triplets into bound instances.
 * <p/>
 * User: mdepristo
 * Date: Feb 27, 2009
 * Time: 10:47:14 AM
 * To change this template use File | Settings | File Templates.
 */
// NOTE(review): the Iterable element type is assumed to be RODRecordList, i.e. what the
// LocationAwareSeekableRODIterator returned by iterator() yields -- confirm against that interface.
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<RODRecordList> {
    /** The track name this data set is bound to. */
    private String name;

    /** The file the records are read from. */
    private File file = null;
    // private String fieldDelimiter;

    /** Header object returned from the datum */
    // private Object header = null;

    private Class<ROD> type = null; // runtime type information for object construction

    /** our log, which we want to capture anything from this class */
    private static final Logger logger = Logger.getLogger(ReferenceOrderedData.class);

    // ----------------------------------------------------------------------
    //
    // Static ROD type management
    //
    // ----------------------------------------------------------------------

    /** Pairs the name a ROD type was registered under with the class that implements it. */
    public static class RODBinding {
        public final String name;
        public final Class<? extends ReferenceOrderedDatum> type;

        public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
            this.name = name;
            this.type = type;
        }
    }

    /** Registry of known ROD types, keyed by the lower-cased type name. */
    public static HashMap<String, RODBinding> Types = new HashMap<String, RODBinding>();

    /**
     * Registers a ROD type under the lower-cased form of <code>name</code>.
     *
     * @param name    the type name; lookups are case-insensitive because the key is lower-cased
     * @param rodType the class implementing the type
     * @throws RuntimeException if a type is already registered under the same lower-cased name
     */
    public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
        final String boundName = name.toLowerCase();
        if (Types.containsKey(boundName)) {
            throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
        }
        logger.info(String.format("* Adding rod class %s", name));
        Types.put(boundName, new RODBinding(name, rodType));
    }

    static {
        // All known ROD types
        addModule("GFF", RodGenotypeChipAsGFF.class);
        addModule("dbSNP", rodDbSNP.class);
        addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
        addModule("SAMPileup", rodSAMPileup.class);
        addModule("GELI", rodGELI.class);
        addModule("RefSeq", rodRefSeq.class);
        addModule("Table", TabularROD.class);
        addModule("PooledEM", PooledEMSNPROD.class);
        addModule("CleanedOutSNP", CleanedOutSNPROD.class);
        addModule("Sequenom", SequenomROD.class);
        addModule("SangerSNP", SangerSNPROD.class);
        addModule("SimpleIndel", SimpleIndelROD.class);
        addModule("PointIndel", PointIndelROD.class);
        addModule("HapMapGenotype", HapMapGenotypeROD.class);
        addModule("Intervals", IntervalRod.class);
        addModule("Variants", RodGeliText.class);
        addModule("GLF", RodGLF.class);
        addModule("VCF", RodVCF.class);
        addModule("PicardDbSNP", rodPicardDbSNP.class);
        addModule("HapmapVCF", HapmapVCFROD.class);
        addModule("Beagle", BeagleROD.class);
        addModule("Plink", PlinkRod.class);
    }

    /**
     * Parse the ROD bindings. These are of the form of a single list of strings, each triplet of the
     * form &lt;name&gt;,&lt;type&gt;,&lt;file&gt;. After this function, the List of RODs contains new RODs
     * bound to each of name, of type, ready to read from the file. This function does check for the
     * strings to be well formed and such.
     * <p/>
     * Any entry of <code>bindings</code> that names an existing file is treated as a file of triplet
     * lines: its lines are appended to <code>bindings</code> and the entry itself is removed, so the
     * passed-in list is modified.
     *
     * @param bindings the binding strings (or files of binding strings); modified in place
     * @param rods     the list the newly created RODs are appended to
     */
    public static void parseBindings(ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods) {
        // pre-process out any files that were passed in as rod binding command line options
        for (int x = 0; x < bindings.size(); x++) {
            if (new File(bindings.get(x)).exists()) {
                extractRodsFromFile(bindings, bindings.get(x));
                bindings.remove(x);
                x--; // the list shifted left by one, so revisit this index
            }
        }

        // Loop over triplets
        for (String bindingSets : bindings) {
            String[] bindingTokens = bindingSets.split(",");
            if (bindingTokens.length % 3 != 0)
                Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name>,<type>,<file> but got %s", Utils.join(",", bindings)));

            for (int bindingSet = 0; bindingSet < bindingTokens.length; bindingSet += 3) {
                logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindingTokens));

                final String name = bindingTokens[bindingSet];
                final String typeName = bindingTokens[bindingSet + 1];
                final String fileName = bindingTokens[bindingSet + 2];

                final ReferenceOrderedData<? extends ReferenceOrderedDatum> rod = parse1Binding(name, typeName, fileName);

                // check that we're not generating duplicate bindings
                for (ReferenceOrderedData<?> rod2 : rods)
                    if (rod2.getName().equals(rod.getName()))
                        Utils.scareUser(String.format("Found duplicate rod bindings: %s", rod.getName()));

                rods.add(rod);
            }
        }
    }

    /**
     * given an existing file, open it and append all the valid triplet lines to an existing list
     *
     * @param rodTripletList the list of existing triplets
     * @param filename       the file to attempt to extract ROD triplets from
     * @throws StingException if the file cannot be opened or read
     */
    protected static void extractRodsFromFile(List<String> rodTripletList, String filename) {
        BufferedReader str;
        try {
            str = new BufferedReader(new FileReader(new File(filename)));
        } catch (FileNotFoundException e) {
            throw new StingException("Unable to load the ROD input file " + filename, e);
        }
        String line = "NO LINES READ IN";
        try {
            while ((line = str.readLine()) != null) {
                if (line.matches(".+,.+,.+")) rodTripletList.add(line.trim());
                else logger.warn("the following file line didn't parsing into a triplet -> " + line);
            }
        } catch (IOException e) {
            throw new StingException("Failed reading the input rod file " + filename + " last line read was " + line, e);
        } finally {
            // always release the file handle, whether or not reading succeeded
            try {
                str.close();
            } catch (IOException e) {
                logger.warn("Unable to close the ROD input file " + filename, e);
            }
        }
    }

    /**
     * Helpful function that parses a single triplet of &lt;name&gt; &lt;type&gt; &lt;file&gt; and returns
     * the corresponding ROD with &lt;name&gt;, of type &lt;type&gt; that reads its input from &lt;file&gt;.
     * The track name is lower-cased, and the type name is looked up case-insensitively in {@link #Types}.
     *
     * @param trackName the name to bind the ROD to
     * @param typeName  the name of a registered ROD type
     * @param fileName  the path of the file to read from
     * @return the newly created ROD
     */
    public static ReferenceOrderedData<? extends ReferenceOrderedDatum> parse1Binding(final String trackName, final String typeName, final String fileName) {
        // Lookup the type
        final RODBinding binding = ReferenceOrderedData.Types.get(typeName.toLowerCase());

        // Gracefully fail if we don't have the type
        if (binding == null)
            Utils.scareUser(String.format("Unknown ROD type: %s", typeName));

        final Class<? extends ReferenceOrderedDatum> rodClass = binding.type;

        // Create the ROD
        final ReferenceOrderedData<? extends ReferenceOrderedDatum> rod =
                createBinding(trackName.toLowerCase(), new File(fileName), rodClass);
        logger.info(String.format("Created binding from %s to %s of type %s", trackName.toLowerCase(), fileName, rodClass));
        return rod;
    }

    /** Builds a ROD whose type parameter matches the (wildcard-captured) datum class. */
    private static <T extends ReferenceOrderedDatum> ReferenceOrderedData<T> createBinding(final String name, final File file, final Class<T> type) {
        return new ReferenceOrderedData<T>(name, file, type);
    }

    // ----------------------------------------------------------------------
    //
    // Constructors
    //
    // ----------------------------------------------------------------------

    /**
     * Binds a track name to a file and the datum class used to read it. Nothing is opened here.
     *
     * @param name the track name
     * @param file the file holding the data
     * @param type the datum class for the records in the file
     */
    public ReferenceOrderedData(final String name, File file, Class<ROD> type) {
        this.name = name;
        this.file = file;
        this.type = type;
        // this.header = initializeROD(name, file, type);
        // this.fieldDelimiter = newROD(name, type).delimiterRegex();
    }

    /** @return the track name this ROD is bound to */
    public String getName() { return name; }

    /** @return the file this ROD reads from */
    public File getFile() { return file; }

    /** @return the datum class of this ROD */
    public Class<ROD> getType() { return type; }

    /**
     * Special equals override to see if this ROD is compatible with the given
     * name and type.  'Compatible' means that this ROD has the name that's passed
     * in and its data can fit into the container specified by type.
     *
     * @param name Name to check.
     * @param type Type to check.
     *
     * @return True if these parameters imply this rod.  False otherwise.
     */
    public boolean matches(String name, Class<?> type) {
        return this.name.equals(name) && type.isAssignableFrom(this.type);
    }

    /**
     * Creates an iterator over this data set.
     * <p/>
     * If the datum class declares a <code>createIterator(String, File)</code> method it is invoked
     * reflectively (with a null receiver, so it has to be static) to obtain the underlying iterator;
     * otherwise a {@link RODRecordIterator} over the file is used. Either way the result is wrapped
     * in a {@link SeekableRODIterator}.
     *
     * @return a new iterator over the data
     * @throws RuntimeException wrapping any failure of the reflective call
     */
    @SuppressWarnings("unchecked")
    public LocationAwareSeekableRODIterator iterator() {
        // Raw on purpose: the element type of the reflectively created iterator is not known here.
        Iterator it;
        try {
            Method m = type.getDeclaredMethod("createIterator", String.class, java.io.File.class);
            it = (Iterator) m.invoke(null, name, file);
        } catch (java.lang.NoSuchMethodException e) {
            // the datum class has no custom factory; fall back to the generic record iterator
            it = new RODRecordIterator(file, name, type);
        } catch (java.lang.NullPointerException e) {
            throw new RuntimeException(e);
        } catch (java.lang.SecurityException e) {
            throw new RuntimeException(e);
        } catch (java.lang.IllegalAccessException e) {
            throw new RuntimeException(e);
        } catch (java.lang.IllegalArgumentException e) {
            throw new RuntimeException(e);
        } catch (java.lang.reflect.InvocationTargetException e) {
            throw new RuntimeException(e);
        }
        // return new RODIterator(it);
        return new SeekableRODIterator(it);
    }

    // ----------------------------------------------------------------------
    //
    // Manipulations of all of the data
    //
    // ----------------------------------------------------------------------

    /**
     * Reads every record of this data set into memory.
     *
     * @return all records, in iteration order
     */
    public ArrayList<ReferenceOrderedDatum> readAll() {
        ArrayList<ReferenceOrderedDatum> elts = new ArrayList<ReferenceOrderedDatum>();
        for (List<? extends ReferenceOrderedDatum> l : this) {
            for (ReferenceOrderedDatum rec : l) {
                elts.add(rec);
            }
        }
        elts.trimToSize();
        return elts;
    }

    /**
     * Sorts the records in place using their natural ordering.
     *
     * @param data the records to sort
     */
    public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
        Collections.sort(data);
    }

    /**
     * Writes each record's <code>repl()</code> string to the output file, one per line.
     *
     * @param data   the records to write
     * @param output the file to write to
     * @throws IOException if the file cannot be opened or written
     */
    public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
        final FileWriter out = new FileWriter(output);
        try {
            for (ReferenceOrderedDatum rec : data) {
                out.write(rec.repl() + "\n");
            }
        } finally {
            // close even when a write fails so the file handle is not leaked
            out.close();
        }
    }

    /**
     * Checks that the records of this data set appear in non-decreasing order.
     *
     * @return true if no record compares less than the record before it
     * @throws Exception if two consecutive records are out of order
     */
    public boolean validateFile() throws Exception {
        ReferenceOrderedDatum last = null;
        for (List<? extends ReferenceOrderedDatum> l : this) {
            for (ReferenceOrderedDatum rec : l) {
                // compareTo only promises a sign, so any positive value means "last" sorts after "rec"
                if (last != null && last.compareTo(rec) > 0) {
                    // It's out of order
                    throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
                }
                last = rec;
            }
        }
        return true;
    }

    public void indexFile() {
        // Fixme -- get access to the linear index system from Jim
    }

    // ----------------------------------------------------------------------
    //
    // Iteration
    //
    // ----------------------------------------------------------------------
//    private class SimpleRODIterator implements Iterator {
//        private xReadLines parser = null;
//
//        public SimpleRODIterator() {
//            try {
//                parser = new xReadLines(file);
//            } catch (FileNotFoundException e) {
//                Utils.scareUser("Couldn't open file: " + file);
//            }
//        }
//
//        public boolean hasNext() {
//            //System.out.printf("Parser has next: %b%n", parser.hasNext());
//            return parser.hasNext();
//        }
//
//        public ROD next() {
//            ROD n = null;
//            boolean success = false;
//            boolean firstFailure = true;
//
//            do {
//                final String line = parser.next();
//                //System.out.printf("Line is '%s'%n", line);
//                String parts[] = line.split(fieldDelimiter);
//
//                try {
//                    n = parseLine(parts);
//                    // Two failure conditions:
//                    // 1) parseLine throws an exception.
//                    // 2) parseLine returns null.
//                    // 3) parseLine throws a RuntimeException.
//                    // TODO: Clean this up so that all errors are handled in one spot.
//                    success = (n != null);
//                }
//                catch (MalformedGenomeLocException ex) {
//                    if (firstFailure) {
//                        Utils.warnUser("Failed to parse contig on line '" + line + "'. The reason given was: " + ex.getMessage() + " Skipping ahead to the next recognized GenomeLoc. ");
//                        firstFailure = false;
//                    }
//                    if (!parser.hasNext())
//                        Utils.warnUser("Unable to find more valid reference-ordered data. Giving up.");
//                }
//
//            } while (!success && parser.hasNext());
//
//            return n;
//        }
//
//        public void remove() {
//            throw new UnsupportedOperationException();
//        }
//    }

    // ----------------------------------------------------------------------
    //
    // Parsing
    //
    // ----------------------------------------------------------------------
//    private Constructor parsing_constructor;

//    private ROD newROD(final String name, final Class type) {
//        try {
//            return (ROD) parsing_constructor.newInstance(name);
//        } catch (java.lang.InstantiationException e) {
//            throw new RuntimeException(e);
//        } catch (java.lang.IllegalAccessException e) {
//            throw new RuntimeException(e);
//        } catch (InvocationTargetException e) {
//            throw new RuntimeException(e);
//        }
//    }

//    private Object initializeROD(final String name, final File file, final Class type) {
//        try {
//            parsing_constructor = type.getConstructor(String.class);
//        }
//        catch (java.lang.NoSuchMethodException e) {
//            throw new RuntimeException(e);
//        }
//        ROD rod = newROD(name, type);
//        try {
//            return rod.initialize(file);
//        } catch (FileNotFoundException e) {
//            throw new RuntimeException(e);
//        }
//    }

//    private ROD parseLine(final String[] parts) {
//        //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
//        ROD obj = newROD(name, type);
//        try {
//            if (!obj.parseLine(header, parts))
//                obj = null;
//        } catch (IOException e) {
//            throw new RuntimeException("Badly formed ROD: " + e);
//        }
//        return obj;
//    }
}