Initial test of write and read from MongoDB

This commit is contained in:
Joel Thibault 2012-04-09 11:05:26 -04:00
parent d93a413f2e
commit bb8a6e9b0a
3 changed files with 301 additions and 2 deletions

View File

@ -0,0 +1,109 @@
package org.broadinstitute.sting.gatk.walkers;
/**
* Created with IntelliJ IDEA.
* User: thibault
* Date: 3/30/12
* Time: 4:47 PM
* To change this template use File | Settings | File Templates.
*/
import com.mongodb.*;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
/**
* Inserts all of the RODs in the input data set. Data is inserted using VariantContext.toMongoDB().
*/
/**
 * Inserts all of the RODs in the input data set into MongoDB.
 * Each ROD that is a VariantContext is serialized via VariantContext.toMongoDB()
 * (one document per genotype/sample) and inserted into the "vcs" collection.
 */
public class InsertRODsWalker extends RodWalker<Integer, Integer> {

    @Input(fullName="input", shortName = "input", doc="The input ROD which should be inserted into the DB.", required=true)
    public RodBinding<Feature> input;

    @Output
    PrintStream out;

    // Connection parameters for the shared MongoDB instance.
    private final static String MONGO_HOST = "gsa4.broadinstitute.org";
    private final static Integer MONGO_PORT = 43054;
    private final static String MONGO_DB_NAME = "bjorn";
    private final static String MONGO_VC_COLLECTION = "vcs";

    protected Mongo mongo;
    protected DBCollection mongoCollection;

    /**
     * Connects to MongoDB and sets up the indices used by the query paths.
     *
     * Connection failures are rethrown instead of being silently swallowed:
     * a swallowed failure would leave mongoCollection null and surface later
     * as an opaque NullPointerException in map().
     */
    @Override
    public void initialize()
    {
        try {
            mongo = new Mongo(MONGO_HOST, MONGO_PORT);
            DB mongoDb = mongo.getDB(MONGO_DB_NAME);
            mongoCollection = mongoDb.getCollection(MONGO_VC_COLLECTION);

            // set up indices matching the fields queried by SelectVariants
            mongoCollection.ensureIndex("location");
            mongoCollection.ensureIndex("sample");
            mongoCollection.ensureIndex("contig");
            mongoCollection.ensureIndex("start");
            mongoCollection.ensureIndex("stop");

            // set up primary key: one document per (location, sample) pair
            mongoCollection.ensureIndex(new BasicDBObject("location", 1).append("sample", 1), new BasicDBObject("unique", 1));
        }
        catch (MongoException e) {
            throw new RuntimeException("Failed to initialize MongoDB connection to " + MONGO_HOST + ":" + MONGO_PORT, e);
        }
        catch (java.net.UnknownHostException e) {
            throw new RuntimeException("Unknown MongoDB host: " + MONGO_HOST, e);
        }
    }

    /**
     * Initialize the number of loci processed to zero.
     *
     * @return 0
     */
    public Integer reduceInit() { return 0; }

    /**
     * Inserts every VariantContext ROD at this locus into the collection.
     *
     * @param tracker the meta-data tracker
     * @param ref the reference base
     * @param context the context for the given locus
     * @return 1 if the locus was successfully processed, 0 if otherwise
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null )
            return 0;

        for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) {
            if ( feature instanceof VariantContext ) {
                VariantContext vc = (VariantContext) feature;
                // one document per sample/genotype; insert each separately
                for (BasicDBObject vcForMongo : vc.toMongoDB()) {
                    mongoCollection.insert(vcForMongo);
                }
            }
        }

        return 1;
    }

    /**
     * Increment the number of rods processed.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return the new number of rods processed.
     */
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    /**
     * Releases the MongoDB connection at the end of the traversal.
     *
     * @param result the final reduce value (unused)
     */
    public void onTraversalDone(Integer result) {
        // guard against a connection that was never established
        if (mongo != null) {
            mongo.close();
        }
    }
}

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import com.mongodb.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
@ -33,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
@ -181,7 +183,8 @@ import java.util.*;
*
*/
public class SelectVariants extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype
@ -344,6 +347,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private Set<String> IDsToKeep = null;
private final static String MONGO_HOST = "gsa4.broadinstitute.org";
private final static Integer MONGO_PORT = 43054;
private final static String MONGO_DB_NAME = "bjorn";
private final static String MONGO_VC_COLLECTION = "vcs";
protected Mongo mongo;
protected DBCollection mongoCollection;
/**
* Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
*/
@ -443,6 +454,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
throw new UserException.CouldNotReadInputFile(rsIDFile, e);
}
}
try {
mongo = new Mongo(MONGO_HOST, MONGO_PORT);
DB mongoDb = mongo.getDB(MONGO_DB_NAME);
mongoCollection = mongoDb.getCollection(MONGO_VC_COLLECTION);
}
catch (MongoException e) {}
catch (java.net.UnknownHostException e) {}
}
/**
@ -458,7 +478,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( tracker == null )
return 0;
Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
//Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
Collection<VariantContext> vcs = getMongoVariants(context.getLocation());
if ( vcs == null || vcs.size() == 0) {
return 0;
@ -531,6 +552,92 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return 1;
}
/**
 * Reconstructs the VariantContexts stored at a locus from their MongoDB
 * document form (the inverse of VariantContext.toMongoDB()).
 *
 * @param location the locus to query; matched exactly against the stored
 *                 contig/start/stop fields, so events spanning the locus with
 *                 different endpoints are not returned
 * @return one VariantContext per matching (site, sample) document; empty if none match
 */
private Collection<VariantContext> getMongoVariants(GenomeLoc location) {
    String contig = location.getContig();
    long start = location.getStart();
    long stop = location.getStop();

    ArrayList<VariantContext> vcs = new ArrayList<VariantContext>();

    BasicDBObject query = new BasicDBObject();
    query.put("contig", contig);
    query.put("start", start);
    query.put("stop", stop);

    DBCursor cursor = mongoCollection.find(query);
    try {
        while (cursor.hasNext()) {
            DBObject result = cursor.next();

            String source = (String)result.get("source");
            ArrayList<Allele> alleles = parseMongoAlleles((BasicDBObject)result.get("alleles"));
            VariantContextBuilder builder = new VariantContextBuilder(source, contig, start, stop, alleles);

            String id = (String)result.get("id");
            String sample = (String)result.get("sample");
            Double error = (Double)result.get("error");

            Map<String, Object> attributes = parseMongoAttributes((BasicDBList)result.get("attributes"));

            // the writer only stores "filters" when at least one filter is set
            Set<String> filters = new HashSet<String>();
            BasicDBObject filtersInDb = (BasicDBObject)result.get("filters");
            if (filtersInDb != null) {
                for (Object filterInDb : filtersInDb.values()) {
                    filters.add((String)filterInDb);
                }
            }

            BasicDBObject genotypeInDb = (BasicDBObject)result.get("genotype");
            Double genotypeError = (Double)genotypeInDb.get("error");
            ArrayList<Allele> genotypeAlleles = parseMongoAlleles((BasicDBObject)genotypeInDb.get("alleles"));
            Map<String, Object> genotypeAttributes = parseMongoAttributes((BasicDBList)genotypeInDb.get("attributes"));

            Genotype genotype = new Genotype(sample, genotypeAlleles, genotypeError);
            genotype = Genotype.modifyAttributes(genotype, genotypeAttributes);

            builder.id(id);
            builder.log10PError(error);
            builder.genotypes(genotype);
            builder.attributes(attributes);
            builder.filters(filters);

            vcs.add(builder.make());
        }
    }
    finally {
        // DBCursor holds a server-side resource; always release it, even on error
        cursor.close();
    }

    return vcs;
}

/**
 * Converts an index-keyed alleles document back into Allele objects.
 * A "*" in the stored string marks the reference allele and is stripped.
 */
private static ArrayList<Allele> parseMongoAlleles(BasicDBObject allelesInDb) {
    ArrayList<Allele> alleles = new ArrayList<Allele>();
    for (Object alleleInDb : allelesInDb.values()) {
        String rawAllele = (String)alleleInDb;
        boolean isRef = rawAllele.contains("*");
        String allele = rawAllele.replace("*", "");
        alleles.add(Allele.create(allele, isRef));
    }
    return alleles;
}

/**
 * Converts a list of {key, value} documents back into an attribute map.
 */
private static Map<String, Object> parseMongoAttributes(BasicDBList attrsInDb) {
    Map<String, Object> attributes = new TreeMap<String, Object>();
    for (Object attrInDb : attrsInDb) {
        BasicDBObject attrKVP = (BasicDBObject)attrInDb;
        attributes.put((String)attrKVP.get("key"), attrKVP.get("value"));
    }
    return attributes;
}
/**
* Checks if vc has a variant call for (at least one of) the samples.
* @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap)

View File

@ -1,8 +1,11 @@
package org.broadinstitute.sting.utils.variantcontext;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -1218,6 +1221,86 @@ public class VariantContext implements Feature { // to enable tribble integratio
this.getGenotypes());
}
/**
 * Serializes this VariantContext into MongoDB documents, one per genotype/sample.
 *
 * Each document carries the site fields (location, contig, start, stop, id,
 * error, source, type), the site alleles as an index-keyed sub-document,
 * the site attributes as a list of {key, value} documents, a "genotype"
 * sub-document for the sample, and — only when at least one filter is set —
 * a "filters" sub-document. Readers rely on "filters" being absent when empty.
 *
 * @return one BasicDBObject per genotype, ready for collection.insert()
 */
public List<BasicDBObject> toMongoDB() {
    List<BasicDBObject> vcDocs = new ArrayList<BasicDBObject>();

    for (Genotype genotype : this.getGenotypes()) {
        BasicDBObject vcDoc = new BasicDBObject();

        // "contig:start" for single-position events, "contig:start-stop" otherwise
        vcDoc.put("location", contig + ":" + (start - stop == 0 ? start : start + "-" + stop));
        vcDoc.put("contig", contig);
        vcDoc.put("start", start);
        vcDoc.put("stop", stop);

        vcDoc.put("id", this.getID());
        vcDoc.put("error", this.getLog10PError());
        vcDoc.put("sample", genotype.getSampleName());
        vcDoc.put("source", this.getSource());
        vcDoc.put("type", this.getType().toString());

        vcDoc.put("alleles", allelesToMongoDoc(this.getAlleles()));
        vcDoc.put("attributes", attributesToMongoDocs(this.getAttributes()));

        // per-sample genotype sub-document
        BasicDBObject genotypesDoc = new BasicDBObject();
        genotypesDoc.put("alleles", allelesToMongoDoc(genotype.getAlleles()));
        genotypesDoc.put("attributes", attributesToMongoDocs(genotype.getAttributes()));
        genotypesDoc.put("error", genotype.getLog10PError());
        vcDoc.put("genotype", genotypesDoc);

        // only store filters when at least one is present; readers null-check this field
        Set<String> filterSet = this.getFilters();
        if (!filterSet.isEmpty()) {
            int filterIndex = 0;
            BasicDBObject filtersDoc = new BasicDBObject();
            for (String filter : filterSet) {
                filtersDoc.put(String.valueOf(filterIndex), filter);
                filterIndex++;
            }
            vcDoc.put("filters", filtersDoc);
        }

        vcDocs.add(vcDoc);
    }

    return vcDocs;
}

/**
 * Stores a list of alleles as a sub-document keyed by their list index
 * ("0", "1", ...), preserving order for the reader.
 */
private static BasicDBObject allelesToMongoDoc(List<Allele> alleles) {
    BasicDBObject allelesDoc = new BasicDBObject();
    int alleleIndex = 0;
    for (Allele allele : alleles) {
        allelesDoc.put(String.valueOf(alleleIndex), allele.toString());
        alleleIndex++;
    }
    return allelesDoc;
}

/**
 * Stores an attribute map as a list of {key, value} documents, so arbitrary
 * attribute names never become MongoDB field names.
 */
private static List<BasicDBObject> attributesToMongoDocs(Map<String, Object> attributes) {
    List<BasicDBObject> attributesDocs = new ArrayList<BasicDBObject>();
    for (Map.Entry<String, Object> attribute : attributes.entrySet()) {
        BasicDBObject attributesDoc = new BasicDBObject();
        attributesDoc.put("key", attribute.getKey());
        attributesDoc.put("value", attribute.getValue());
        attributesDocs.add(attributesDoc);
    }
    return attributesDocs;
}
// protected basic manipulation routines
private static List<Allele> makeAlleles(Collection<Allele> alleles) {
final List<Allele> alleleList = new ArrayList<Allele>(alleles.size());