From a51bd575663be17f3b399ea3e94444176c4f3966 Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 10 May 2010 02:18:48 +0000 Subject: [PATCH] First version of the smart batch merging tool. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3333 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContext.java | 41 ++++ .../walkers/genotyper/BatchedCallsMerger.java | 186 ++++++++++++++++++ .../walkers/genotyper/CreateTriggerTrack.java | 66 +++++++ 3 files changed, 293 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/CreateTriggerTrack.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index 598eff4f6..460da1aa0 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; +import org.broad.tribble.vcf.VCFRecord; import java.util.*; @@ -868,6 +869,46 @@ public class VariantContext { // // --------------------------------------------------------------------------------------------------------- + public static VariantContext simpleMerge(Set VCs) { + if ( VCs == null || VCs.size() == 0 ) + return null; + + Iterator iter = VCs.iterator(); + + // establish the baseline info from the first VC + VariantContext first = iter.next(); + String name = first.getName(); + GenomeLoc loc = first.getLocation(); + Set alleles = new HashSet(first.getAlleles()); + Map genotypes = new HashMap(first.getGenotypes()); + double negLog10PError = first.isVariant() ? first.getNegLog10PError() : -1; + Set filters = new HashSet(first.getFilters()); + Map attributes = new HashMap(); + int depth = 0; + if ( first.hasAttribute(VCFRecord.DEPTH_KEY) ) + depth = Integer.valueOf(first.getAttribute(VCFRecord.DEPTH_KEY).toString()); + + // cycle through and add info from the other VCs, making sure the loc/reference matches + while ( iter.hasNext() ) { + VariantContext vc = iter.next(); + if ( !loc.equals(vc.getLocation()) || !first.getReference().equals(vc.getReference()) ) + return null; + + alleles.addAll(vc.getAlleles()); + genotypes.putAll(vc.getGenotypes()); + + negLog10PError = Math.max(negLog10PError, vc.isVariant() ? vc.getNegLog10PError() : -1); + + filters.addAll(vc.getFilters()); + if ( vc.hasAttribute(VCFRecord.DEPTH_KEY) ) + depth += Integer.valueOf(vc.getAttribute(VCFRecord.DEPTH_KEY).toString()); + } + + if ( depth > 0 ) + attributes.put(VCFRecord.DEPTH_KEY, String.valueOf(depth)); + return new VariantContext(name, loc, alleles, genotypes, negLog10PError, filters, attributes); + } + private void determineType() { if ( type == null ) { if ( alleles.size() == 0 ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java new file mode 100755 index 000000000..b125f815a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broad.tribble.vcf.VCFHeaderLine; + +import java.util.*; + +import net.sf.samtools.SAMReadGroupRecord; + + +/** + * A walker that merges multiple batches of calls, by calling into the Genotyper to fill in sites that + * were called in one batch but not another. + */ +@Reference(window=@Window(start=-20,stop=20)) +@By(DataSource.REFERENCE) +@Requires(value={},referenceMetaData=@RMD(name="trigger", type=ReferenceOrderedData.class)) +public class BatchedCallsMerger extends LocusWalker implements TreeReducible { + @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + @Argument(doc = "VCF file to which variants should be written", required = false) + public GenotypeWriter writer = null; + + @Argument(fullName="rod_list", shortName="rods", doc="A comma-separated string describing the rod names representing individual call batches", required=true) + protected String ROD_STRING = null; + private HashSet targetRods; + + // the calculation arguments + private UnifiedGenotyperEngine UG_engine = null; + + // all samples to be used + private Set samples = new HashSet(); + + // mapping from rod name to set of samples coming from it + private Map> rodsToSamples = new HashMap>(); + + // enable deletions in the pileup + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable extended events for indels + public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.INDELS; } + + public void initialize() { + + targetRods = new HashSet(Arrays.asList(ROD_STRING.split(","))); + + Set headerLines = new HashSet(); + + // iterate to get all of the sample names + List dataSources = getToolkit().getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + RMDTrack rod = source.getReferenceOrderedData(); + // if it's one of our target rods + if ( targetRods.contains(rod.getName()) ) { + // read the samples and store them + VCFReader reader = new VCFReader(rod.getFile()); + HashSet mySamples = new HashSet(reader.getHeader().getGenotypeSamples()); + rodsToSamples.put(rod.getName(), mySamples); + samples.addAll(mySamples); + + // while we're here, pull out the header lines + headerLines.addAll(reader.getHeader().getMetaData()); + + reader.close(); + } + } + + // update the engine + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, null, null, null); + UG_engine.samples = samples; + + // initialize the header + GenotypeWriterFactory.writeHeader(writer, GenomeAnalysisEngine.instance.getSAMFileHeader(), samples, headerLines); + } + + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return null; + + // get the calls at this locus + Collection VCs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false); + + Set calls = new HashSet(); + Set seenRods = new HashSet(); + for ( VariantContext vc : VCs ) { + if ( targetRods.contains(vc.getName()) ) { + calls.add(vc); + seenRods.add(vc.getName()); + } + } + + // if there are no calls, ignore this site + if ( seenRods.size() == 0 ) + return null; + + // figure out which samples still need to be called + Set missedSamples = new HashSet(); + for ( String rod : targetRods ) { + if ( !seenRods.contains(rod) ) + missedSamples.addAll(rodsToSamples.get(rod)); + } + + // we are missing samples, call them + if ( missedSamples.size() > 0 ) { + AlignmentContext prunedContext = filterForSamples(context.getBasePileup(), missedSamples); + VariantCallContext vcc = UG_engine.runGenotyper(tracker, ref, prunedContext); + if ( vcc != null && vcc.vc != null ) + calls.add(vcc.vc); + } + + // merge the variant contexts + return VariantContext.simpleMerge(calls); + } + + private AlignmentContext filterForSamples(ReadBackedPileup pileup, Set samples) { + + ArrayList newPileup = new ArrayList(); + + for (PileupElement p : pileup ) { + SAMReadGroupRecord readGroup = p.getRead().getReadGroup(); + if ( readGroup != null && samples.contains(readGroup.getSample()) ) + newPileup.add(p); + } + return new AlignmentContext(pileup.getLocation(), new ReadBackedPileup(pileup.getLocation(), newPileup)); + + } + + public Integer reduceInit() { return 0; } + + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + + public Integer reduce(VariantContext value, Integer sum) { + // can't call the locus because of no coverage or no confidence + if ( value == null ) + return sum; + + try { + writer.addCall(value, new String(value.getReference().getBases())); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); + } + + return sum; + } + + public void onTraversalDone(Integer sum) {} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/CreateTriggerTrack.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/CreateTriggerTrack.java new file mode 100755 index 000000000..ab99bc890 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/CreateTriggerTrack.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; + +import java.util.Collection; +import java.io.PrintWriter; + +/** + * Creates a bed-format trigger-track for the Unified Genotyper based on input variant files. + */ +public class CreateTriggerTrack extends RodWalker { + + private PrintWriter writer; + + public void initialize() { + writer = new PrintWriter(out); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection VCs = tracker.getAllVariantContexts(ref); + if ( VCs.size() > 0 ) + writer.println(context.getLocation().getContig() + "\t" + context.getLocation().getStart() + "\t" + context.getLocation().getStart()); + + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { return 0; } + + public void onTraversalDone(Integer result) { + writer.close(); + } +} \ No newline at end of file