New ROD accessing system, including a generalized interface for binding ROD on the command line that doesn't require you to chance GenomeAnalysisTK.java

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@355 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-09 22:04:59 +00:00
parent f5cc2d8b0b
commit 17b3d5b554
5 changed files with 185 additions and 69 deletions

View File

@ -105,7 +105,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
*/
private static Logger logger = Logger.getLogger(GenomeAnalysisTK.class);
public static ArrayList<String> ROD_BINDINGS = null;
public static ArrayList<String> ROD_BINDINGS = new ArrayList<String>();
/**
@ -143,9 +143,11 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("daughter", "KID", "Daughter's genotype (SAM pileup)", "DAUGHTER_GENOTYPE_FILE");
// --rodBind <name> <type> <file>
//m_parser.addOptionalArg("rods", "B", "Bind rod with <name> and <type> to <file>", "ROD_BINDINGS");
Option rodBinder = OptionBuilder.withArgName("rodBind")
.hasArgs()
.withDescription( "Bind rod with <name> and <type> to <file>" )
.withDescription( "" )
.create("B");
m_parser.addOptionalArg(rodBinder, "ROD_BINDINGS");
}
@ -187,48 +189,38 @@ public class GenomeAnalysisTK extends CommandLineProgram {
start(Instance, argv);
}
/**
* Convenience function that binds RODs using the old-style command line parser to the new style list for
* a uniform processing.
*
* @param name
* @param type
* @param file
*/
private static void bindConvenienceRods(final String name, final String type, final String file )
{
ROD_BINDINGS.add(name);
ROD_BINDINGS.add(type);
ROD_BINDINGS.add(file);
}
protected int execute() {
final boolean TEST_ROD = false;
List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >();
if ( ROD_BINDINGS != null ) {
System.out.printf("ROD BINDINGS are %s%n", Utils.join(":", ROD_BINDINGS));
}
if ( TEST_ROD ) {
ReferenceOrderedData<rodGFF> gff = new ReferenceOrderedData<rodGFF>("test", new File("trunk/data/gFFTest.gff"), rodGFF.class );
gff.testMe();
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
ReferenceOrderedData<rodDbSNP> dbsnp = new ReferenceOrderedData<rodDbSNP>("dbSNP", new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
} else {
if ( DBSNP_FILE != null ) {
ReferenceOrderedData<rodDbSNP> dbsnp = new ReferenceOrderedData<rodDbSNP>("dbSNP", new File(DBSNP_FILE), rodDbSNP.class );
//dbsnp.testMe();
rods.add(dbsnp); // { gff, dbsnp };
}
if ( HAPMAP_FILE != null ) {
ReferenceOrderedData<HapMapAlleleFrequenciesROD> hapmap = new ReferenceOrderedData<HapMapAlleleFrequenciesROD>("hapmap", new File(HAPMAP_FILE), HapMapAlleleFrequenciesROD.class );
//dbsnp.testMe();
rods.add(hapmap); // { gff, dbsnp };
}
if ( HAPMAP_CHIP_FILE != null ) {
ReferenceOrderedData<rodGFF> hapmapChip = new ReferenceOrderedData<rodGFF>("hapmap-chip", new File(HAPMAP_CHIP_FILE), rodGFF.class );
rods.add(hapmapChip);
}
//TODO: remove when walkers can ask for tracks
if ( MOTHER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("mother", new File(MOTHER_GENOTYPE_FILE), rodSAMPileup.class ) );
if ( FATHER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("father", new File(FATHER_GENOTYPE_FILE), rodSAMPileup.class ) );
if ( DAUGHTER_GENOTYPE_FILE != null )
rods.add( new ReferenceOrderedData<rodSAMPileup>("daughter", new File(DAUGHTER_GENOTYPE_FILE), rodSAMPileup.class ) );
}
//
// please don't use these in the future, use the new syntax
//
if ( DBSNP_FILE != null ) bindConvenienceRods("dbSNP", "dbsnp", DBSNP_FILE);
if ( HAPMAP_FILE != null ) bindConvenienceRods("hapmap", "HapMapAlleleFrequencies", HAPMAP_FILE);
if ( HAPMAP_CHIP_FILE != null ) bindConvenienceRods("hapmap-chip", "GFF", HAPMAP_CHIP_FILE);
//TODO: remove when walkers can ask for tracks
if ( MOTHER_GENOTYPE_FILE != null ) bindConvenienceRods("mother", "SAMPileup", MOTHER_GENOTYPE_FILE);
if ( FATHER_GENOTYPE_FILE != null ) bindConvenienceRods("father", "SAMPileup", FATHER_GENOTYPE_FILE);
if ( DAUGHTER_GENOTYPE_FILE != null ) bindConvenienceRods("daughter", "SAMPileup", DAUGHTER_GENOTYPE_FILE);
ReferenceOrderedData.parseBindings(logger, ROD_BINDINGS, rods);
initializeOutputStreams();
Walker<?,?> my_walker = null;
@ -281,7 +273,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
// Prepare the sort ordering w.r.t. the sequence dictionary
if (REF_FILE_ARG != null) {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile);
//GenomeLoc.setupRefContigOrdering(refFile);
}
// Determine the validation stringency. Default to ValidationStringency.STRICT.

View File

@ -7,11 +7,20 @@ import java.util.List;
import java.util.Collection;
/**
* Created by IntelliJ IDEA.
* This class represents the Reference Metadata available at a particular site in the genome. It can be
* used to conveniently lookup the RODs at this site, as well just getting a list of all of the RODs
*
* The standard interaction model is:
*
* Traversal system arrives at a site, which has a bunch of rods covering it
* Traversal calls tracker.bind(name, rod) for each rod in rods
* Traversal passes tracker to the walker
* walker calls lookup(name, default) to obtain the rod values at this site, or default if none was
* bound at this site.
*
* User: mdepristo
* Date: Apr 3, 2009
* Time: 3:05:23 PM
* To change this template use File | Settings | File Templates.
*/
public class RefMetaDataTracker {
final HashMap<String, ReferenceOrderedDatum> map = new HashMap<String, ReferenceOrderedDatum>();
@ -26,29 +35,65 @@ public class RefMetaDataTracker {
*/
public ReferenceOrderedDatum lookup(final String name, ReferenceOrderedDatum defaultValue) {
//logger.debug(String.format("Lookup %s%n", name));
if ( map.containsKey(name) )
return map.get(name);
final String luName = canonicalName(name);
if ( map.containsKey(luName) )
return map.get(luName);
else
return defaultValue;
}
/**
* @see this.lookup
* @param name
* @param defaultValue
* @return
*/
public Object lookup(final String name, Object defaultValue) {
if ( map.containsKey(name) )
return map.get(name);
final String luName = canonicalName(name);
if ( map.containsKey(luName) )
return map.get(luName);
else
return defaultValue;
}
public Object hasROD(final String name) {
return map.containsKey(name);
/**
* Returns the canonical name of the rod name
* @param name
* @return
*/
private final String canonicalName(final String name)
{
return name.toLowerCase();
}
/**
* Is there a binding at this site to a ROD with name?
*
* @param name
* @return
*/
public Object hasROD(final String name) {
return map.containsKey(canonicalName(name));
}
/**
* Get all of the RODs at the current site
*
* @return
*/
public Collection<ReferenceOrderedDatum> getAllRods() {
return map.values();
}
/**
* Binds the reference ordered datum ROD to name at this site. Should be used only but the traversal
* system to provide access to RODs in a structured way to the walkers.
*
* @param name
* @param rod
*/
public void bind(final String name, ReferenceOrderedDatum rod) {
//logger.debug(String.format("Binding %s to %s%n", name, rod));
map.put(name, rod);
map.put(canonicalName(name), rod);
}
}

View File

@ -4,9 +4,7 @@ import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
@ -14,6 +12,7 @@ import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.Utils;
import org.apache.log4j.Logger;
/**
* Class for representing arbitrary reference ordered data sets
@ -28,6 +27,98 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
private File file = null;
private Class<ROD> type = null; // runtime type information for object construction
// ----------------------------------------------------------------------
//
// Static ROD type management
//
// ----------------------------------------------------------------------
public static class RODBinding {
public final String name;
public final Class<? extends ReferenceOrderedDatum> type;
public RODBinding(final String name, final Class<? extends ReferenceOrderedDatum> type) {
this.name = name;
this.type = type;
}
}
public static HashMap<String, RODBinding> Types = new HashMap<String, RODBinding>();
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
System.out.printf("* Adding rod class %s%n", name);
Types.put(name.toLowerCase(), new RODBinding(name, rodType));
}
static {
// All known ROD types
addModule("GFF", rodGFF.class);
addModule("dbSNP", rodDbSNP.class);
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
addModule("SAMPileup", rodSAMPileup.class);
}
/**
* Parse the ROD bindings. These are of the form of a single list of strings, each triplet of the
* form <name> <type> <file>. After this function, the List of RODs contains new RODs bound to each of
* name, of type, ready to read from the file. This function does check for the strings to be well formed
* and such.
*
* @param logger
* @param bindings
* @param rods
*/
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
{
logger.info("Processing ROD bindings: " + bindings.size() + " -> " + Utils.join(" : ", bindings));
if ( bindings.size() % 3 != 0 )
Utils.scareUser(String.format("Invalid ROD specification: requires triplets of <name> <type> <file> but got %s", Utils.join(" ", bindings)));
// Loop over triplets
for ( int i = 0; i < bindings.size(); i += 3) {
final String name = bindings.get(i);
final String typeName = bindings.get(i+1);
final String fileName = bindings.get(i+2);
ReferenceOrderedData<?> rod = parse1Binding(logger, name, typeName, fileName);
// check that we're not generating duplicate bindings
for ( ReferenceOrderedData rod2 : rods )
if ( rod2.getName().equals(rod.getName()) )
Utils.scareUser(String.format("Found duplicate rod bindings", rod.getName()));
rods.add(rod);
}
}
/**
* Helpful function that parses a single triplet of <name> <type> <file> and returns the corresponding ROD with
* <name>, of type <type> that reads its input from <file>.
*
* @param logger
* @param trackName
* @param typeName
* @param fileName
* @return
*/
private static ReferenceOrderedData<?> parse1Binding( Logger logger, final String trackName, final String typeName, final String fileName )
{
// Gracefully fail if we don't have the type
if ( ReferenceOrderedData.Types.get(typeName.toLowerCase()) == null )
Utils.scareUser(String.format("Unknown ROD type: %s", typeName));
// Lookup the type
Class rodClass = ReferenceOrderedData.Types.get(typeName.toLowerCase()).type;
// Create the ROD
ReferenceOrderedData<?> rod = new ReferenceOrderedData<ReferenceOrderedDatum>(trackName.toLowerCase(), new File(fileName), rodClass );
logger.info(String.format("Created binding from %s to %s of type %s", trackName.toLowerCase(), fileName, rodClass));
return rod;
}
// ----------------------------------------------------------------------
//
// Constructors
//
// ----------------------------------------------------------------------
public ReferenceOrderedData(final String name, File file, Class<ROD> type ) {
this.file = file;
this.type = type;

View File

@ -24,18 +24,6 @@ public class PrepareROD extends CommandLineProgram {
@Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null;
@Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null;
public static HashMap<String, Class<? extends ReferenceOrderedDatum>> Types = new HashMap<String,Class<? extends ReferenceOrderedDatum>>();
public static void addModule(final String name, final Class<? extends ReferenceOrderedDatum> rodType) {
System.out.printf("* Adding rod class %s%n", name);
Types.put(name.toLowerCase(), rodType);
}
static {
addModule("GFF", rodGFF.class);
addModule("dbSNP", rodDbSNP.class);
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
}
/** Required main method implementation. */
public static void main(String[] argv) {
System.exit(new PrepareROD().instanceMain(argv));
@ -47,7 +35,7 @@ public class PrepareROD extends CommandLineProgram {
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
GenomeLoc.setupRefContigOrdering(refFile);
Class<? extends ReferenceOrderedDatum> rodClass = Types.get(ROD_TYPE.toLowerCase());
Class<? extends ReferenceOrderedDatum> rodClass = ReferenceOrderedData.Types.get(ROD_TYPE.toLowerCase()).type;
ReferenceOrderedData<? extends ReferenceOrderedDatum> rod = new ReferenceOrderedData("ROD", new File(ROD_FILE), rodClass );
try {

View File

@ -99,9 +99,9 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
return false;
} else {
contigInfo = seqDict;
logger.info(String.format("Prepared reference sequence contig dictionary%n order ->"));
logger.debug(String.format("Prepared reference sequence contig dictionary"));
for (SAMSequenceRecord contig : seqDict.getSequences() ) {
logger.info(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
logger.debug(String.format(" %s (%d bp)", contig.getSequenceName(), contig.getSequenceLength()));
}
}
@ -225,7 +225,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
Collections.sort(locs);
//logger.info(String.format("Going to process %d locations", locs.length));
locs = mergeOverlappingLocations(locs);
logger.info(" Locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, locs)));
logger.info("Locations are:" + Utils.join(", ", Functions.map(Operators.toString, locs)));
return locs;
} catch (Exception e) {
e.printStackTrace();
@ -235,7 +235,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
}
public static ArrayList<GenomeLoc> mergeOverlappingLocations(final ArrayList<GenomeLoc> raw) {
logger.info(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw)));
logger.debug(" Raw locations are:\n" + Utils.join("\n", Functions.map(Operators.toString, raw)));
if ( raw.size() <= 1 )
return raw;
else {