From bb8a6e9b0abc6786317d2ff564d1678f6f32fc5f Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 9 Apr 2012 11:05:26 -0400 Subject: [PATCH] Initial test of write and read from MongoDB --- .../sting/gatk/walkers/InsertRODsWalker.java | 109 +++++++++++++++++ .../walkers/variantutils/SelectVariants.java | 111 +++++++++++++++++- .../utils/variantcontext/VariantContext.java | 83 +++++++++++++ 3 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/InsertRODsWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/InsertRODsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/InsertRODsWalker.java new file mode 100644 index 000000000..55790bd3f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/InsertRODsWalker.java @@ -0,0 +1,109 @@ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 3/30/12 + * Time: 4:47 PM + * To change this template use File | Settings | File Templates. + */ + +import com.mongodb.*; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.PrintStream; + +/** + * Inserts all of the RODs in the input data set. Data is inserted using VariantContext.toMongoDB(). + */ +public class InsertRODsWalker extends RodWalker { + @Input(fullName="input", shortName = "input", doc="The input ROD which should be inserted into the DB.", required=true) + public RodBinding input; + + @Output + PrintStream out; + + private final static String MONGO_HOST = "gsa4.broadinstitute.org"; + private final static Integer MONGO_PORT = 43054; + private final static String MONGO_DB_NAME = "bjorn"; + private final static String MONGO_VC_COLLECTION = "vcs"; + + protected Mongo mongo; + protected DBCollection mongoCollection; + + @Override + public void initialize() + { + try { + mongo = new Mongo(MONGO_HOST, MONGO_PORT); + DB mongoDb = mongo.getDB(MONGO_DB_NAME); + mongoCollection = mongoDb.getCollection(MONGO_VC_COLLECTION); + + // set up indices + mongoCollection.ensureIndex("location"); + mongoCollection.ensureIndex("sample"); + mongoCollection.ensureIndex("contig"); + mongoCollection.ensureIndex("start"); + mongoCollection.ensureIndex("stop"); + + // set up primary key + mongoCollection.ensureIndex(new BasicDBObject("location", 1).append("sample", 1), new BasicDBObject("unique", 1)); + + } + catch (MongoException e) {} + catch (java.net.UnknownHostException e) {} + } + + /** + * Initialize the number of loci processed to zero. + * + * @return 0 + */ + public Integer reduceInit() { return 0; } + + /** + * + * @param tracker the meta-data tracker + * @param ref the reference base + * @param context the context for the given locus + * @return 1 if the locus was successfully processed, 0 if otherwise + */ + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + for ( Feature feature : tracker.getValues(Feature.class, context.getLocation()) ) { + if ( feature instanceof VariantContext ) { + VariantContext vc = (VariantContext) feature; + for (BasicDBObject vcForMongo : vc.toMongoDB()) { + mongoCollection.insert(vcForMongo); + } + } + + } + + return 1; + } + + /** + * Increment the number of rods processed. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return the new number of rods processed. + */ + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + mongo.close(); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 42a40cde5..61f5cdb59 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import com.mongodb.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; @@ -33,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -181,7 +183,8 @@ import java.util.*; * */ public class SelectVariants extends RodWalker implements TreeReducible { - @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** * A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype @@ -344,6 +347,14 @@ public class SelectVariants extends RodWalker implements TreeR private Set IDsToKeep = null; + private final static String MONGO_HOST = "gsa4.broadinstitute.org"; + private final static Integer MONGO_PORT = 43054; + private final static String MONGO_DB_NAME = "bjorn"; + private final static String MONGO_VC_COLLECTION = "vcs"; + + protected Mongo mongo; + protected DBCollection mongoCollection; + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ @@ -443,6 +454,15 @@ public class SelectVariants extends RodWalker implements TreeR throw new UserException.CouldNotReadInputFile(rsIDFile, e); } } + + try { + mongo = new Mongo(MONGO_HOST, MONGO_PORT); + DB mongoDb = mongo.getDB(MONGO_DB_NAME); + mongoCollection = mongoDb.getCollection(MONGO_VC_COLLECTION); + } + catch (MongoException e) {} + catch (java.net.UnknownHostException e) {} + } /** @@ -458,7 +478,8 @@ public class SelectVariants extends RodWalker implements TreeR if ( tracker == null ) return 0; - Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + //Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + Collection vcs = getMongoVariants(context.getLocation()); if ( vcs == null || vcs.size() == 0) { return 0; @@ -531,6 +552,92 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private Collection getMongoVariants(GenomeLoc location) { + String contig = location.getContig(); + long start = location.getStart(); + long stop = location.getStop(); + + ArrayList vcs = new ArrayList(); + + BasicDBObject query = new BasicDBObject(); + query.put("contig", contig); + query.put("start", start); + query.put("stop", stop); + + DBCursor cursor = mongoCollection.find(query); + while(cursor.hasNext()) { + DBObject result = cursor.next(); + + String source = (String)result.get("source"); + + ArrayList alleles = new ArrayList(); + BasicDBObject allelesInDb = (BasicDBObject)result.get("alleles"); + for (Object alleleInDb : allelesInDb.values()) { + String rawAllele = (String)alleleInDb; + boolean isRef = rawAllele.contains("*"); + String allele = rawAllele.replace("*", ""); + alleles.add(Allele.create(allele, isRef)); + } + + VariantContextBuilder builder = new VariantContextBuilder(source, contig, start, stop, alleles); + + String id = (String)result.get("id"); + String sample = (String)result.get("sample"); + Double error = (Double)result.get("error"); + + Map attributes = new TreeMap(); + BasicDBList attrsInDb = (BasicDBList)result.get("attributes"); + for (Object attrInDb : attrsInDb) { + BasicDBObject attrKVP = (BasicDBObject)attrInDb; + String key = (String)attrKVP.get("key"); + Object value = attrKVP.get("value"); + attributes.put(key, value); + } + + Set filters = new HashSet(); + BasicDBObject filtersInDb = (BasicDBObject)result.get("filters"); + if (filtersInDb != null) { + for (Object filterInDb : filtersInDb.values()) { + filters.add((String)filterInDb); + } + } + + BasicDBObject genotypeInDb = (BasicDBObject)result.get("genotype"); + Double genotypeError = (Double)genotypeInDb.get("error"); + + ArrayList genotypeAlleles = new ArrayList(); + BasicDBObject genotypeAllelesInDb = (BasicDBObject)genotypeInDb.get("alleles"); + for (Object alleleInDb : genotypeAllelesInDb.values()) { + String rawAllele = (String)alleleInDb; + boolean isRef = rawAllele.contains("*"); + String allele = rawAllele.replace("*", ""); + genotypeAlleles.add(Allele.create(allele, isRef)); + } + + Map genotypeAttributes = new TreeMap(); + BasicDBList genotypeAttrsInDb = (BasicDBList)genotypeInDb.get("attributes"); + for (Object attrInDb : genotypeAttrsInDb) { + BasicDBObject attrKVP = (BasicDBObject)attrInDb; + String key = (String)attrKVP.get("key"); + Object value = attrKVP.get("value"); + genotypeAttributes.put(key, value); + } + + Genotype genotype = new Genotype(sample, genotypeAlleles, genotypeError); + genotype = Genotype.modifyAttributes(genotype, genotypeAttributes); + + builder.id(id); + builder.log10PError(error); + builder.genotypes(genotype); + builder.attributes(attributes); + builder.filters(filters); + + vcs.add(builder.make()); + } + + return vcs; + } + /** * Checks if vc has a variant call for (at least one of) the samples. * @param vc the variant rod VariantContext. Here, the variant is the dataset you're looking for discordances to (e.g. HapMap) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 3faad46e2..155f5445b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,8 +1,11 @@ package org.broadinstitute.sting.utils.variantcontext; +import com.mongodb.BasicDBList; +import com.mongodb.BasicDBObject; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -1218,6 +1221,86 @@ public class VariantContext implements Feature { // to enable tribble integratio this.getGenotypes()); } + public List toMongoDB() { + List vcDocs = new ArrayList(); + for (Genotype genotype : this.getGenotypes()) { + BasicDBObject vcDoc = new BasicDBObject(); + vcDoc.put("location", contig + ":" + (start - stop == 0 ? start : start + "-" + stop)); + vcDoc.put("contig", contig); + vcDoc.put("start", start); + vcDoc.put("stop", stop); + vcDoc.put("id", this.getID()); + vcDoc.put("error", this.getLog10PError()); + vcDoc.put("sample", genotype.getSampleName()); + vcDoc.put("source", this.getSource()); + vcDoc.put("type", this.getType().toString()); + + Integer alleleIndex = 0; + BasicDBObject allelesDoc = new BasicDBObject(); + for (Allele allele : this.getAlleles()) + { + String index = alleleIndex.toString(); + allelesDoc.put(index, allele.toString()); + alleleIndex++; + } + vcDoc.put("alleles", allelesDoc); + + List attributesDocs = new ArrayList(); + for (Map.Entry attribute : this.getAttributes().entrySet() ) + { + String key = attribute.getKey(); + Object value = attribute.getValue(); + BasicDBObject attributesDoc = new BasicDBObject(); + attributesDoc.put("key", key); + attributesDoc.put("value", value); + attributesDocs.add(attributesDoc); + } + vcDoc.put("attributes", attributesDocs); + + BasicDBObject genotypesDoc = new BasicDBObject(); + Integer genotypeAlleleIndex = 0; + BasicDBObject genotypeAllelesDoc = new BasicDBObject(); + for (Allele allele : genotype.getAlleles()) + { + String index = genotypeAlleleIndex.toString(); + genotypeAllelesDoc.put(index, allele.toString()); + genotypeAlleleIndex++; + } + genotypesDoc.put("alleles", genotypeAllelesDoc); + + List genotypesAttributesDocs = new ArrayList(); + for (Map.Entry attribute : genotype.getAttributes().entrySet() ) + { + String key = attribute.getKey(); + Object value = attribute.getValue(); + BasicDBObject genotypesAttributesDoc = new BasicDBObject(); + genotypesAttributesDoc.put("key", key); + genotypesAttributesDoc.put("value", value); + genotypesAttributesDocs.add(genotypesAttributesDoc); + } + genotypesDoc.put("attributes", genotypesAttributesDocs); + genotypesDoc.put("error", genotype.getLog10PError()); + + vcDoc.put("genotype", genotypesDoc); + + Integer filterIndex = 0; + BasicDBObject filtersDoc = new BasicDBObject(); + for (String filter : this.getFilters()) + { + String index = filterIndex.toString(); + filtersDoc.put(index, filter.toString()); + filterIndex++; + } + if (filterIndex > 0) { + vcDoc.put("filters", filtersDoc); + } + + vcDocs.add(vcDoc); + } + + return vcDocs; + } + // protected basic manipulation routines private static List makeAlleles(Collection alleles) { final List alleleList = new ArrayList(alleles.size());