First fully functional (though preliminary) version of walker that takes an input VCF and outputs a Beagle .bgl file that can be used for missing genotype calls/haplotype imputation. For now, only supported input format is likelihood format for unrelated individuals.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3444 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f9efc1248c
commit
3873dccb35
|
|
@ -0,0 +1,152 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input VCF fiel
|
||||
*/
|
||||
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = false)
|
||||
public PrintStream beagleWriter = null;
|
||||
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
|
||||
|
||||
|
||||
public void initialize() {
|
||||
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
for ( final ReferenceOrderedDataSource source : dataSources ) {
|
||||
final RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getType().equals(VCFCodec.class) ) {
|
||||
final VCFReader reader = new VCFReader(rod.getFile());
|
||||
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
|
||||
samples.addAll(vcfSamples);
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if ( beagleWriter != null ) {
|
||||
beagleWriter.print("marker alleleA alleleB");
|
||||
for ( String sample : samples )
|
||||
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
||||
|
||||
beagleWriter.println();
|
||||
}
|
||||
|
||||
}
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker != null ) {
|
||||
EnumSet<VariantContext.Type> vc = EnumSet.of(VariantContext.Type.SNP);
|
||||
GenomeLoc loc = context.getLocation();
|
||||
VariantContext vc_eval;
|
||||
|
||||
|
||||
try {
|
||||
vc_eval = tracker.getVariantContext(ref,"eval", vc, loc, true);
|
||||
} catch (java.util.NoSuchElementException e) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// output marker ID to Beagle input file
|
||||
beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
|
||||
|
||||
for (Allele allele: vc_eval.getAlleles()) {
|
||||
// TODO -- check whether this is really needed by Beagle
|
||||
String bglPrintString;
|
||||
if (allele.isNoCall() || allele.isNull())
|
||||
bglPrintString = "0";
|
||||
else
|
||||
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
|
||||
|
||||
beagleWriter.print(String.format("%s ", bglPrintString));
|
||||
}
|
||||
|
||||
Map<String, Genotype> genotypes = vc_eval.getGenotypes();
|
||||
if ( genotypes == null || genotypes.size() == 0 )
|
||||
return null;
|
||||
for ( String sample : samples ) {
|
||||
// use sample as key into genotypes structure
|
||||
Genotype genotype = genotypes.get(sample);
|
||||
String gls = "0 0 0 ";
|
||||
if (genotype.isCalled()) {
|
||||
String[] glArray = genotype.getAttributeAsString("GL").split(",");
|
||||
|
||||
for (String gl : glArray) {
|
||||
Double d_gl = -Double.valueOf(gl);
|
||||
beagleWriter.print(String.format("%5.2f ",d_gl));
|
||||
}
|
||||
}
|
||||
else
|
||||
beagleWriter.print(gls); // write 0 likelihoods for uncalled genotypes.
|
||||
|
||||
}
|
||||
|
||||
beagleWriter.println();
|
||||
|
||||
}
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0; // Nothing to do here
|
||||
}
|
||||
|
||||
public Integer reduce( Integer value, Integer sum ) {
|
||||
return 0; // Nothing to do here
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer sum ) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue