From fd20f5c2e846dead88e15824255dc6f40cdc464c Mon Sep 17 00:00:00 2001 From: kiran Date: Tue, 8 Sep 2009 06:12:18 +0000 Subject: [PATCH] For a file or files backed by a ROD implementing AllelicVariant, outputs a VCF file summarizing the information. Metadata like Hapmap and dbSNP membership, genotype LOD, read depth, etc, are annotated appropriately. The results output by this program are equivalent to those given by Gelis2PopSNPs.py. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1544 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantstovcf/VariantsToVCF.java | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java new file mode 100755 index 000000000..8cc42dd8b --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java @@ -0,0 +1,212 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; + +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.ListUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.*; +import java.io.File; + +public class VariantsToVCF extends RefWalker { + @Argument(fullName="vcfout", shortName="VO", doc="The output VCF file") public File VCF_OUT; + @Argument(fullName="verbose", shortName="V", doc="Show extended output", required=false) public boolean VERBOSE = false; + @Argument(fullName="suppress_multistate_alleles", shortName="SMA", doc="When multi-sample genotypes imply something other than a biallele state, suppress it.", required=false) public boolean SUPPRESS_MULTISTATE = false; + + private VCFWriter vcfwriter = null; + private VCFHeader vcfheader = null; + private HashSet sampleNames = null; + + public void initialize() { + Map metaData = new HashMap(); + List additionalColumns = new ArrayList(); + sampleNames = new HashSet(); + + Calendar cal = Calendar.getInstance(); + + metaData.put("format", "VCRv3.2"); + metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH))); + metaData.put("source", "VariantsToVCF"); + metaData.put("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath()); + + additionalColumns.add("FORMAT"); + + for (String rodName : this.getToolkit().getArguments().RODBindings) { + //out.println(rodName); + + String[] rodPieces = rodName.split(","); + String sampleName = rodPieces[0]; + + if (sampleName.startsWith("NA")) { + additionalColumns.add(sampleName); + sampleNames.add(sampleName.toUpperCase()); + } + } + + vcfheader = new VCFHeader(metaData, additionalColumns); + vcfwriter = new VCFWriter(vcfheader, VCF_OUT); + } + + public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) { + if (BaseUtils.simpleBaseToBaseIndex(ref) > -1) { return true; } + + for (ReferenceOrderedDatum rod : tracker.getAllRods()) { + if (rod != null && sampleNames.contains(rod.getName().toUpperCase())) { + return true; + } + } + + return false; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int numSNPs = 0; + int numRefs = 0; + int[] alleleNames = { 0, 1, 2, 3 }; + int[] alleleCounts = new int[4]; + double snpQual = 0.0; + List gt = new ArrayList(); + + int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); + + for (String name : vcfheader.getGenotypeSamples()) { + ReferenceOrderedDatum rod = tracker.lookup(name, null); + if (rod != null) { + AllelicVariant av = (AllelicVariant) rod; + String lod = String.format("%d", av.getVariationConfidence() > 99 ? 99 : (int) av.getVariationConfidence()); + int depth = 0; + + if (rod instanceof rodVariants) { + rodVariants rv = (rodVariants) rod; + depth = rv.depth; + } + + Map str = new HashMap(); + if (av.getGenotype().get(0).charAt(0) == av.getGenotype().get(0).charAt(1)) { + str.put("key","1/1:" + lod + (depth > 0 ? ":" + depth : "")); + //str.put("key","1/1"); + } else { + str.put("key","0/1:" + lod + (depth > 0 ? ":" + depth : "")); + //str.put("key","0/1"); + } + + List alleles = av.getGenotype(); + + int allele1 = BaseUtils.simpleBaseToBaseIndex(alleles.get(0).charAt(0)); + if (allele1 >= 0 && allele1 != refbase) { alleleCounts[allele1]++; } + + int allele2 = BaseUtils.simpleBaseToBaseIndex(alleles.get(0).charAt(1)); + if (allele2 >= 0 && allele2 != refbase) { alleleCounts[allele2]++; } + + gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase())); + + numSNPs++; + snpQual += av.getVariationConfidence(); + } else { + Map str = new HashMap(); + str.put("key","0/0"); + + List alleles = new ArrayList(); + alleles.add(ref.getBase() + "" + ref.getBase()); + + gt.add(new VCFGenotypeRecord(name, str, alleles, VCFGenotypeRecord.PHASE.UNPHASED, ref.getBase())); + + numRefs++; + } + } + + if (numSNPs > 0) { + Integer[] perm = Utils.SortPermutation(alleleCounts); + int[] sortedCounts = Utils.PermuteArray(alleleCounts, perm); + int[] sortedNames = Utils.PermuteArray(alleleNames, perm); + + rodDbSNP dbsnp = (rodDbSNP) tracker.lookup("dbsnp", null); + + String infoString = String.format("locus=%s ref=%c allele_count=( %c:%d %c:%d %c:%d %c:%d )", + context.getLocation(), + ref.getBase(), + BaseUtils.baseIndexToSimpleBase(sortedNames[0]), sortedCounts[0], + BaseUtils.baseIndexToSimpleBase(sortedNames[1]), sortedCounts[1], + BaseUtils.baseIndexToSimpleBase(sortedNames[2]), sortedCounts[2], + BaseUtils.baseIndexToSimpleBase(sortedNames[3]), sortedCounts[3] + ); + + if (SUPPRESS_MULTISTATE && sortedCounts[2] > 0) { + out.println("[multistate] " + infoString); + return 0; + } else { + if (VERBOSE) { + out.println("[locus_info] " + infoString); + } + } + + Map map = new HashMap(); + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + map.put(field,String.valueOf(1)); + + if (field == VCFHeader.HEADER_FIELDS.CHROM) { + map.put(field, context.getContig()); + } else if (field == VCFHeader.HEADER_FIELDS.POS) { + map.put(field, String.valueOf(context.getPosition())); + } else if (field == VCFHeader.HEADER_FIELDS.REF) { + map.put(field, String.valueOf(ref.getBases())); + } else if (field == VCFHeader.HEADER_FIELDS.ALT) { + map.put(field, String.valueOf(BaseUtils.baseIndexToSimpleBase(sortedNames[3]))); + } else if (field == VCFHeader.HEADER_FIELDS.ID) { + map.put(field, (dbsnp == null) ? "." : dbsnp.name); + } else if (field == VCFHeader.HEADER_FIELDS.QUAL) { + map.put(field, String.format("%d", snpQual > 99 ? 99 : (int) snpQual)); + } else if (field == VCFHeader.HEADER_FIELDS.FILTER) { + map.put(field, String.valueOf("0")); + } else if (field == VCFHeader.HEADER_FIELDS.INFO) { + String infostr = "."; + ArrayList info = new ArrayList(); + + if (dbsnp != null) { info.add("DB=1"); } + if (dbsnp != null && dbsnp.isHapmap()) { info.add("H2=1"); } + + for (int i = 0; i < info.size(); i++) { + if (i == 0) { infostr = ""; } + + infostr += info.get(i); + + if (i < info.size() - 1) { + infostr += ";"; + } + } + + map.put(field, infostr); + } + } + + vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT:GQ:DP", gt)); + //vcfwriter.addRecord(new VCFRecord(vcfheader, map, "GT", gt)); + + return 1; + } + + return 0; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + public void onTraversalDone(Integer sum) { + out.println("Processed " + sum + " variants."); + + vcfwriter.close(); + } +}