New ROD accessing system, including a generalized interface for binding ROD on the command line that doesn't require you to chance GenomeAnalysisTK.java

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@355 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-09 22:04:59 +00:00
parent f5cc2d8b0b
commit 17b3d5b554
5 changed files with 185 additions and 69 deletions

View File

@ -105,7 +105,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
*/ */
private static Logger logger = Logger.getLogger(GenomeAnalysisTK.class); private static Logger logger = Logger.getLogger(GenomeAnalysisTK.class);
public static ArrayList<String> ROD_BINDINGS = null; public static ArrayList<String> ROD_BINDINGS = new ArrayList<String>();
/** /**
@ -143,9 +143,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("daughter", "KID", "Daughter's genotype (SAM pileup)", "DAUGHTER_GENOTYPE_FILE"); m_parser.addOptionalArg("daughter", "KID", "Daughter's genotype (SAM pileup)", "DAUGHTER_GENOTYPE_FILE");
// --rodBind <name> <type> <file> // --rodBind <name> <type> <file>
//m_parser.addOptionalArg("rods", "B", "Bind rod with <name> and <type> to <file>", "ROD_BINDINGS");
Option rodBinder = OptionBuilder.withArgName("rodBind") Option rodBinder = OptionBuilder.withArgName("rodBind")
.hasArgs() .hasArgs()
.withDescription( "Bind rod with <name> and <type> to <file>" ) .withDescription( "" )
.create("B"); .create("B");
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS"); m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
} }
@ -187,48 +189,38 @@ public class GenomeAnalysisTK extends CommandLineProgram {
start(Instance, argv); start(Instance, argv);
} }
/**
* Convenience function that binds RODs using the old-style command line parser to the new style list for
* a uniform processing.
*
* @param name
* @param type
* @param file
*/
private static void bindConvenienceRods(final String name, final String type, final String file )
{
ROD_BINDINGS.add(name);
ROD_BINDINGS.add(type);
ROD_BINDINGS.add(file);
}
protected int execute() { protected int execute() {
final boolean TEST_ROD = false; final boolean TEST_ROD = false;
List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >(); List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >();
if ( ROD_BINDINGS != null ) { //
System.out.printf("ROD BINDINGS are %s%n", Utils.join(":", ROD_BINDINGS)); // please don't use these in the future, use the new syntax
} //
if ( DBSNP_FILE != null ) bindConvenienceRods("dbSNP", "dbsnp", DBSNP_FILE);
if ( HAPMAP_FILE != null ) bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", HAPMAP_FILE);
if ( TEST_ROD ) { if ( HAPMAP_CHIP_FILE != null ) bindConvenienceRods("hapmap-chip", "GFF", HAPMAP_CHIP_FILE);
ReferenceOrderedData<rodGFF> gff = new ReferenceOrderedData<rodGFF>("test", new File("trunk/data/gFFTest.gff"), rodGFF.class ); //TODO: remove when walkers can ask for tracks
gff.testMe(); if ( MOTHER_GENOTYPE_FILE != null ) bindConvenienceRods("mother", "SAMPileup", MOTHER_GENOTYPE_FILE);
if ( FATHER_GENOTYPE_FILE != null ) bindConvenienceRods("father", "SAMPileup", FATHER_GENOTYPE_FILE);
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class ); if ( DAUGHTER_GENOTYPE_FILE != null ) bindConvenienceRods("daughter", "SAMPileup", DAUGHTER_GENOTYPE_FILE);
ReferenceOrderedData<rodDbSNP> dbsnp = new ReferenceOrderedData<rodDbSNP>("dbSNP", new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
} else {
if ( DBSNP_FILE != null ) {
ReferenceOrderedData<rodDbSNP> dbsnp = new ReferenceOrderedData<rodDbSNP>("dbSNP", new File(DBSNP_FILE), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
}
if ( HAPMAP_FILE != null ) {
ReferenceOrderedData<HapMapAlleleFrequenciesROD> hapmap = new ReferenceOrderedData<HapMapAlleleFrequenciesROD>("hapmap", new File(HAPMAP_FILE), HapMapAlleleFrequenciesROD.class );
//dbsnp.testMe();
rods.add(hapmap); // { gff, dbsnp };
}
if ( HAPMAP_CHIP_FILE != null ) {
ReferenceOrderedData<rodGFF> hapmapChip = new ReferenceOrderedData<rodGFF>("hapmap-chip", new File(HAPMAP_CHIP_FILE), rodGFF.class );
rods.add(hapmapChip);
}
//TODO: remove when walkers can ask for tracks
if ( MOTHER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("mother", new File(MOTHER_GENOTYPE_FILE), rodSAMPileup.class ) );
if ( FATHER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("father", new File(FATHER_GENOTYPE_FILE), rodSAMPileup.class ) );
if ( DAUGHTER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("daughter", new File(DAUGHTER_GENOTYPE_FILE), rodSAMPileup.class ) );
}
ReferenceOrderedData.parseBindings(logger, ROD_BINDINGS, rods);
initializeOutputStreams(); initializeOutputStreams();
Walker<?,?> my_walker = null; Walker<?,?> my_walker = null;
@ -281,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
// Prepare the sort ordering w.r.t. the sequence dictionary // Prepare the sort ordering w.r.t. the sequence dictionary
if (REF_FILE_ARG != null) { if (REF_FILE_ARG != null) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile); //GenomeLoc.setupRefContigOrdering(refFile);
} }
// Determine the validation stringency. Default to ValidationStringency.STRICT. // Determine the validation stringency. Default to ValidationStringency.STRICT.

View File

@ -7,11 +7,20 @@ import java.util.List;
import java.util.Collection; import java.util.Collection;
/** /**
* Created by IntelliJ IDEA. * This class represents the Reference Metadata available at a particular site in the genome. It can be
* used to conveniently lookup the RODs at this site, as well just getting a list of all of the RODs
*
* The standard interaction model is:
*
* Traversal system arrives at a site, which has a bunch of rods covering it
* Traversal calls tracker.bind(name, rod) for each rod in rods
* Traversal passes tracker to the walker
* walker calls lookup(name, default) to obtain the rod values at this site, or default if none was
* bound at this site.
*
* User: mdepristo * User: mdepristo
* Date: Apr 3, 2009 * Date: Apr 3, 2009
* Time: 3:05:23 PM * Time: 3:05:23 PM
* To change this template use File | Settings | File Templates.
*/ */
public class RefMetaDataTracker { public class RefMetaDataTracker {
final HashMap<String, ReferenceOrderedDatum> map = new HashMap<String, ReferenceOrderedDatum>(); final HashMap<String, ReferenceOrderedDatum> map = new HashMap<String, ReferenceOrderedDatum>();
@ -26,29 +35,65 @@ public class RefMetaDataTracker {
*/ */
public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) { public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) {
//logger.debug(String.format("Lookup %s%n", name)); //logger.debug(String.format("Lookup %s%n", name));
if ( map.containsKey(name) ) final String luName = canonicalName(name);
return map.get(name); if ( map.containsKey(luName) )
return map.get(luName);
else else
return defaultValue; return defaultValue;
} }
/**
* @see this.lookup
* @param name
* @param defaultValue
* @return
*/
public Object lookup(final String name, Object defaultValue) { public Object lookup(final String name, Object defaultValue) {
if ( map.containsKey(name) ) final String luName = canonicalName(name);
return map.get(name); if ( map.containsKey(luName) )
return map.get(luName);
else else
return defaultValue; return defaultValue;
} }
public Object hasROD(final String name) { /**
return map.containsKey(name); * Returns the canonical name of the rod name
* @param name
* @return
*/
private final String canonicalName(final String name)
{
return name.toLowerCase();
} }
/**
* Is there a binding at this site to a ROD with name?
*
* @param name
* @return
*/
public Object hasROD(final String name) {
return map.containsKey(canonicalName(name));
}
/**
* Get all of the RODs at the current site
*
* @return
*/
public Collection<ReferenceOrderedDatum> getAllRods() { public Collection<ReferenceOrderedDatum> getAllRods() {
return map.values(); return map.values();
} }
/**
* Binds the reference ordered datum ROD to name at this site. Should be used only but the traversal
* system to provide access to RODs in a structured way to the walkers.
*
* @param name
* @param rod
*/
public void bind(final String name, ReferenceOrderedDatum rod) { public void bind(final String name, ReferenceOrderedDatum rod) {
//logger.debug(String.format("Binding %s to %s%n", name, rod)); //logger.debug(String.format("Binding %s to %s%n", name, rod));
map.put(name, rod); map.put(canonicalName(name), rod);
} }
} }

View File

@ -4,9 +4,7 @@ import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.util.Iterator; import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
@ -14,6 +12,7 @@ import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.xReadLines; import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.apache.log4j.Logger;
/** /**
* Class for representing arbitrary reference ordered data sets * Class for representing arbitrary reference ordered data sets
@ -28,6 +27,98 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
private File file = null; private File file = null;
private Class<ROD> type = null; // runtime type information for object construction private Class<ROD> type = null; // runtime type information for object construction
// ----------------------------------------------------------------------
//
// Static ROD type management
//
// ----------------------------------------------------------------------
public static class RODBinding {
public final String name;
public final Class<? extends ReferenceOrderedDatum> type;
public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
this.name = name;
this.type = type;
}
}
public static HashMap<String, RODBinding> Types = new HashMap<String, RODBinding>();
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
System.out.printf("* Adding rod class %s%n", name);
Types.put(name.toLowerCase(), new RODBinding(name, rodType));
}
static {
// All known ROD types
addModule("GFF", rodGFF.class);
addModule("dbSNP", rodDbSNP.class);
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
addModule("SAMPileup", rodSAMPileup.class);
}
/**
* Parse the ROD bindings. These are of the form of a single list of strings, each triplet of the
* form <name> <type> <file>. After this function, the List of RODs contains new RODs bound to each of
* name, of type, ready to read from the file. This function does check for the strings to be well formed
* and such.
*
* @param logger
* @param bindings
* @param rods
*/
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
{
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindings));
if ( bindings.size() % 3 != 0 )
Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name> <type> <file> but got %s", Utils.join(" ", bindings)));
// Loop over triplets
for ( int i = 0; i < bindings.size(); i += 3) {
final String name = bindings.get(i);
final String typeName = bindings.get(i+1);
final String fileName = bindings.get(i+2);
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
// check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
rods.add(rod);
}
}
/**
* Helpful function that parses a single triplet of <name> <type> <file> and returns the corresponding ROD with
* <name>, of type <type> that reads its input from <file>.
*
* @param logger
* @param trackName
* @param typeName
* @param fileName
* @return
*/
private static ReferenceOrderedData<?> parse1Binding( Logger logger, final String trackName, final String typeName, final String fileName )
{
// Gracefully fail if we don't have the type
if ( ReferenceOrderedData.Types.get(typeName.toLowerCase()) == null )
Utils.scareUser(String.format("Unknown ROD type: %s", typeName));
// Lookup the type
Class rodClass = ReferenceOrderedData.Types.get(typeName.toLowerCase()).type;
// Create the ROD
ReferenceOrderedData<?> rod = new ReferenceOrderedData<ReferenceOrderedDatum>(trackName.toLowerCase(), new File(fileName), rodClass );
logger.info(String.format("Created binding from %s to %s of type %s", trackName.toLowerCase(), fileName, rodClass));
return rod;
}
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public ReferenceOrderedData(final String name, File file, Class<ROD> type ) { public ReferenceOrderedData(final String name, File file, Class<ROD> type ) {
this.file = file; this.file = file;
this.type = type; this.type = type;

View File

@ -24,18 +24,6 @@ public class PrepareROD extends CommandLineProgram {
@Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null; @Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null;
@Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null; @Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null;
public static HashMap<String, Class<? extends ReferenceOrderedDatum>> Types = new HashMap<String,Class<? extends ReferenceOrderedDatum>>();
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
System.out.printf("* Adding rod class %s%n", name);
Types.put(name.toLowerCase(), rodType);
}
static {
addModule("GFF", rodGFF.class);
addModule("dbSNP", rodDbSNP.class);
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
}
/** Required main method implementation. */ /** Required main method implementation. */
public static void main(String[] argv) { public static void main(String[] argv) {
System.exit(new PrepareROD().instanceMain(argv)); System.exit(new PrepareROD().instanceMain(argv));
@ -47,7 +35,7 @@ public class PrepareROD extends CommandLineProgram {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile); GenomeLoc.setupRefContigOrdering(refFile);
Class<? extends ReferenceOrderedDatum> rodClass = Types.get(ROD_TYPE.toLowerCase()); Class<? extends ReferenceOrderedDatum> rodClass = ReferenceOrderedData.Types.get(ROD_TYPE.toLowerCase()).type;
ReferenceOrderedData<? extends ReferenceOrderedDatum> rod = new ReferenceOrderedData("ROD", new File(ROD_FILE), rodClass ); ReferenceOrderedData<? extends ReferenceOrderedDatum> rod = new ReferenceOrderedData("ROD", new File(ROD_FILE), rodClass );
try { try {

View File

@ -99,9 +99,9 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return false; return false;
} else { } else {
contigInfo = seqDict; contigInfo = seqDict;
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->")); logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences() ) { for (SAMSequenceRecord contig : seqDict.getSequences() ) {
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength())); logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
} }
} }
@ -225,7 +225,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
Collections.sort(locs); Collections.sort(locs);
//logger.info(String.format("Going to process %d locations", locs.length)); //logger.info(String.format("Going to process %d locations", locs.length));
locs = mergeOverlappingLocations(locs); locs = mergeOverlappingLocations(locs);
logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs))); logger.info("Locations are:" + Utils.join(", ", Functions.map(Operators.toString, locs)));
return locs; return locs;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
@ -235,7 +235,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
} }
public static ArrayList<GenomeLoc> mergeOverlappingLocations(final ArrayList<GenomeLoc> raw) { public static ArrayList<GenomeLoc> mergeOverlappingLocations(final ArrayList<GenomeLoc> raw) {
logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw))); logger.debug(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw)));
if ( raw.size() <= 1 ) if ( raw.size() <= 1 )
return raw; return raw;
else { else {