From 17b3d5b5543f75e9d0f30065194f016ea4c71fc2 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 9 Apr 2009 22:04:59 +0000 Subject: [PATCH] New ROD accessing system, including a generalized interface for binding ROD on the command line that doesn't require you to chance GenomeAnalysisTK.java git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@355 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 72 ++++++-------- .../gatk/refdata/RefMetaDataTracker.java | 63 ++++++++++-- .../gatk/refdata/ReferenceOrderedData.java | 97 ++++++++++++++++++- .../sting/playground/gatk/PrepareROD.java | 14 +-- .../broadinstitute/sting/utils/GenomeLoc.java | 8 +- 5 files changed, 185 insertions(+), 69 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 1c9cf35ec..27cc67aa4 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -105,7 +105,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { */ private static Logger logger = Logger.getLogger(GenomeAnalysisTK.class); - public static ArrayList ROD_BINDINGS = null; + public static ArrayList ROD_BINDINGS = new ArrayList(); /** @@ -143,9 +143,11 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("daughter", "KID", "Daughter's genotype (SAM pileup)", "DAUGHTER_GENOTYPE_FILE"); // --rodBind + //m_parser.addOptionalArg("rods", "B", "Bind rod with and to ", "ROD_BINDINGS"); + Option rodBinder = OptionBuilder.withArgName("rodBind") .hasArgs() - .withDescription( "Bind rod with and to " ) + .withDescription( "" ) .create("B"); m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS"); } @@ -187,48 +189,38 @@ public class GenomeAnalysisTK extends CommandLineProgram { start(Instance, argv); } + /** + * Convenience function that binds RODs using the old-style command line parser to the new style list for + * a uniform processing. + * + * @param name + * @param type + * @param file + */ + private static void bindConvenienceRods(final String name, final String type, final String file ) + { + ROD_BINDINGS.add(name); + ROD_BINDINGS.add(type); + ROD_BINDINGS.add(file); + } + protected int execute() { final boolean TEST_ROD = false; List > rods = new ArrayList >(); - if ( ROD_BINDINGS != null ) { - System.out.printf("ROD BINDINGS are %s%n", Utils.join(":", ROD_BINDINGS)); - } - - - if ( TEST_ROD ) { - ReferenceOrderedData gff = new ReferenceOrderedData("test", new File("trunk/data/gFFTest.gff"), rodGFF.class ); - gff.testMe(); - - //ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class ); - ReferenceOrderedData dbsnp = new ReferenceOrderedData("dbSNP", new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class ); - //dbsnp.testMe(); - rods.add(dbsnp); // { gff, dbsnp }; - } else { - if ( DBSNP_FILE != null ) { - ReferenceOrderedData dbsnp = new ReferenceOrderedData("dbSNP", new File(DBSNP_FILE), rodDbSNP.class ); - //dbsnp.testMe(); - rods.add(dbsnp); // { gff, dbsnp }; - } - if ( HAPMAP_FILE != null ) { - ReferenceOrderedData hapmap = new ReferenceOrderedData("hapmap", new File(HAPMAP_FILE), HapMapAlleleFrequenciesROD.class ); - //dbsnp.testMe(); - rods.add(hapmap); // { gff, dbsnp }; - } - if ( HAPMAP_CHIP_FILE != null ) { - ReferenceOrderedData hapmapChip = new ReferenceOrderedData("hapmap-chip", new File(HAPMAP_CHIP_FILE), rodGFF.class ); - rods.add(hapmapChip); - } - //TODO: remove when walkers can ask for tracks - if ( MOTHER_GENOTYPE_FILE != null ) - rods.add( new ReferenceOrderedData("mother", new File(MOTHER_GENOTYPE_FILE), rodSAMPileup.class ) ); - if ( FATHER_GENOTYPE_FILE != null ) - rods.add( new ReferenceOrderedData("father", new File(FATHER_GENOTYPE_FILE), rodSAMPileup.class ) ); - if ( DAUGHTER_GENOTYPE_FILE != null ) - rods.add( new ReferenceOrderedData("daughter", new File(DAUGHTER_GENOTYPE_FILE), rodSAMPileup.class ) ); - - } + // + // please don't use these in the future, use the new syntax + // + if ( DBSNP_FILE != null ) bindConvenienceRods("dbSNP", "dbsnp", DBSNP_FILE); + if ( HAPMAP_FILE != null ) bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", HAPMAP_FILE); + if ( HAPMAP_CHIP_FILE != null ) bindConvenienceRods("hapmap-chip", "GFF", HAPMAP_CHIP_FILE); + //TODO: remove when walkers can ask for tracks + if ( MOTHER_GENOTYPE_FILE != null ) bindConvenienceRods("mother", "SAMPileup", MOTHER_GENOTYPE_FILE); + if ( FATHER_GENOTYPE_FILE != null ) bindConvenienceRods("father", "SAMPileup", FATHER_GENOTYPE_FILE); + if ( DAUGHTER_GENOTYPE_FILE != null ) bindConvenienceRods("daughter", "SAMPileup", DAUGHTER_GENOTYPE_FILE); + ReferenceOrderedData.parseBindings(logger, ROD_BINDINGS, rods); + initializeOutputStreams(); Walker my_walker = null; @@ -281,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { // Prepare the sort ordering w.r.t. the sequence dictionary if (REF_FILE_ARG != null) { final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); - GenomeLoc.setupRefContigOrdering(refFile); + //GenomeLoc.setupRefContigOrdering(refFile); } // Determine the validation stringency. Default to ValidationStringency.STRICT. diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 11ac91856..0f581eb49 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -7,11 +7,20 @@ import java.util.List; import java.util.Collection; /** - * Created by IntelliJ IDEA. + * This class represents the Reference Metadata available at a particular site in the genome. It can be + * used to conveniently lookup the RODs at this site, as well just getting a list of all of the RODs + * + * The standard interaction model is: + * + * Traversal system arrives at a site, which has a bunch of rods covering it + * Traversal calls tracker.bind(name, rod) for each rod in rods + * Traversal passes tracker to the walker + * walker calls lookup(name, default) to obtain the rod values at this site, or default if none was + * bound at this site. + * * User: mdepristo * Date: Apr 3, 2009 * Time: 3:05:23 PM - * To change this template use File | Settings | File Templates. */ public class RefMetaDataTracker { final HashMap map = new HashMap(); @@ -26,29 +35,65 @@ public class RefMetaDataTracker { */ public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) { //logger.debug(String.format("Lookup %s%n", name)); - if ( map.containsKey(name) ) - return map.get(name); + final String luName = canonicalName(name); + if ( map.containsKey(luName) ) + return map.get(luName); else return defaultValue; } + /** + * @see this.lookup + * @param name + * @param defaultValue + * @return + */ public Object lookup(final String name, Object defaultValue) { - if ( map.containsKey(name) ) - return map.get(name); + final String luName = canonicalName(name); + if ( map.containsKey(luName) ) + return map.get(luName); else return defaultValue; } - public Object hasROD(final String name) { - return map.containsKey(name); + /** + * Returns the canonical name of the rod name + * @param name + * @return + */ + private final String canonicalName(final String name) + { + return name.toLowerCase(); } + /** + * Is there a binding at this site to a ROD with name? + * + * @param name + * @return + */ + public Object hasROD(final String name) { + return map.containsKey(canonicalName(name)); + } + + /** + * Get all of the RODs at the current site + * + * @return + */ public Collection getAllRods() { return map.values(); } + /** + * Binds the reference ordered datum ROD to name at this site. Should be used only but the traversal + * system to provide access to RODs in a structured way to the walkers. + * + * @param name + * @param rod + */ public void bind(final String name, ReferenceOrderedDatum rod) { //logger.debug(String.format("Binding %s to %s%n", name, rod)); - map.put(name, rod); + map.put(canonicalName(name), rod); } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index b1b026dbc..593d8545b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -4,9 +4,7 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.FileNotFoundException; -import java.util.Iterator; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; @@ -14,6 +12,7 @@ import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.xReadLines; import org.broadinstitute.sting.utils.Utils; +import org.apache.log4j.Logger; /** * Class for representing arbitrary reference ordered data sets @@ -28,6 +27,98 @@ public class ReferenceOrderedData implements private File file = null; private Class type = null; // runtime type information for object construction + // ---------------------------------------------------------------------- + // + // Static ROD type management + // + // ---------------------------------------------------------------------- + public static class RODBinding { + public final String name; + public final Class type; + public RODBinding(final String name, final Class type) { + this.name = name; + this.type = type; + } + } + + public static HashMap Types = new HashMap(); + public static void addModule(final String name, final Class rodType) { + System.out.printf("* Adding rod class %s%n", name); + Types.put(name.toLowerCase(), new RODBinding(name, rodType)); + } + + static { + // All known ROD types + addModule("GFF", rodGFF.class); + addModule("dbSNP", rodDbSNP.class); + addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class); + addModule("SAMPileup", rodSAMPileup.class); + } + + + /** + * Parse the ROD bindings. These are of the form of a single list of strings, each triplet of the + * form . After this function, the List of RODs contains new RODs bound to each of + * name, of type, ready to read from the file. This function does check for the strings to be well formed + * and such. + * + * @param logger + * @param bindings + * @param rods + */ + public static void parseBindings(Logger logger, ArrayList bindings, List > rods) + { + logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindings)); + if ( bindings.size() % 3 != 0 ) + Utils.scareUser(String.format("Invalid ROD specification: requires triplets of but got %s", Utils.join(" ", bindings))); + + // Loop over triplets + for ( int i = 0; i < bindings.size(); i += 3) { + final String name = bindings.get(i); + final String typeName = bindings.get(i+1); + final String fileName = bindings.get(i+2); + + ReferenceOrderedData rod = parse1Binding(logger, name, typeName, fileName); + + // check that we're not generating duplicate bindings + for ( ReferenceOrderedData rod2 : rods ) + if ( rod2.getName().equals(rod.getName()) ) + Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName())); + + rods.add(rod); + } + } + + /** + * Helpful function that parses a single triplet of and returns the corresponding ROD with + * , of type that reads its input from . + * + * @param logger + * @param trackName + * @param typeName + * @param fileName + * @return + */ + private static ReferenceOrderedData parse1Binding( Logger logger, final String trackName, final String typeName, final String fileName ) + { + // Gracefully fail if we don't have the type + if ( ReferenceOrderedData.Types.get(typeName.toLowerCase()) == null ) + Utils.scareUser(String.format("Unknown ROD type: %s", typeName)); + + // Lookup the type + Class rodClass = ReferenceOrderedData.Types.get(typeName.toLowerCase()).type; + + // Create the ROD + ReferenceOrderedData rod = new ReferenceOrderedData(trackName.toLowerCase(), new File(fileName), rodClass ); + logger.info(String.format("Created binding from %s to %s of type %s", trackName.toLowerCase(), fileName, rodClass)); + return rod; + } + + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- public ReferenceOrderedData(final String name, File file, Class type ) { this.file = file; this.type = type; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java index 2d3ab74b6..36d09f35a 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java @@ -24,18 +24,6 @@ public class PrepareROD extends CommandLineProgram { @Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null; @Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null; - public static HashMap> Types = new HashMap>(); - public static void addModule(final String name, final Class rodType) { - System.out.printf("* Adding rod class %s%n", name); - Types.put(name.toLowerCase(), rodType); - } - - static { - addModule("GFF", rodGFF.class); - addModule("dbSNP", rodDbSNP.class); - addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class); - } - /** Required main method implementation. */ public static void main(String[] argv) { System.exit(new PrepareROD().instanceMain(argv)); @@ -47,7 +35,7 @@ public class PrepareROD extends CommandLineProgram { final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); GenomeLoc.setupRefContigOrdering(refFile); - Class rodClass = Types.get(ROD_TYPE.toLowerCase()); + Class rodClass = ReferenceOrderedData.Types.get(ROD_TYPE.toLowerCase()).type; ReferenceOrderedData rod = new ReferenceOrderedData("ROD", new File(ROD_FILE), rodClass ); try { diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 48b9a9206..8bd1d319c 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -99,9 +99,9 @@ public class GenomeLoc implements Comparable { return false; } else { contigInfo = seqDict; - logger.info(String.format("Prepared reference sequence contig dictionary%n order ->")); + logger.debug(String.format("Prepared reference sequence contig dictionary")); for (SAMSequenceRecord contig : seqDict.getSequences() ) { - logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); + logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); } } @@ -225,7 +225,7 @@ public class GenomeLoc implements Comparable { Collections.sort(locs); //logger.info(String.format("Going to process %d locations", locs.length)); locs = mergeOverlappingLocations(locs); - logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs))); + logger.info("Locations are:" + Utils.join(", ", Functions.map(Operators.toString, locs))); return locs; } catch (Exception e) { e.printStackTrace(); @@ -235,7 +235,7 @@ public class GenomeLoc implements Comparable { } public static ArrayList mergeOverlappingLocations(final ArrayList raw) { - logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw))); + logger.debug(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw))); if ( raw.size() <= 1 ) return raw; else {