From 97ed9457970b14b071483f4ff66e171ee5a5ada9 Mon Sep 17 00:00:00 2001 From: kiran Date: Sun, 15 Nov 2009 09:27:12 +0000 Subject: [PATCH] Example code for a bug in the VCF implementation. See JIRA entry at http://jira.broadinstitute.org:8008/browse/GSA-225 git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2050 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/vcftools/VCFMixedUp.java | 85 +++++++++++ .../walkers/vcftools/VCFSubsetWalker.java | 133 ++++++++++++++++++ 2 files changed, 218 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java new file mode 100755 index 000000000..813c6da41 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFMixedUp.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; + +import java.io.FileReader; +import java.io.IOException; +import java.io.BufferedReader; +import java.util.ArrayList; + +public class VCFMixedUp extends RefWalker { + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + for (ReferenceOrderedDatum rod : tracker.getAllRods()) { + if (rod instanceof RodVCF) { + for (String rodfile : this.getToolkit().getArguments().RODBindings) { + String[] rodpieces = rodfile.split(","); + + if (rodpieces[2].contains(".vcf")) { + try { + FileReader reader = new FileReader(rodpieces[2]); + BufferedReader breader = new BufferedReader(reader); + + ArrayList header = new ArrayList(); + ArrayList variant = new ArrayList(); + + String line; + while ((line = breader.readLine()) != null) { + String[] pieces = line.split("\\s+"); + + if (line.contains("##")) { + } else if (line.contains("#CHROM")) { + for (int i = 0; i < pieces.length; i++) { + header.add(i, pieces[i]); + } + } else { + for (int i = 0; i < pieces.length; i++) { + variant.add(i, pieces[i]); + } + + for (int i = 0; i < pieces.length; i++) { + if (i < 9 || header.get(i).contains("NA12892") || header.get(i).contains("NA10851")) { + out.printf("%s => %s\n", header.get(i), variant.get(i)); + } + } + } + } + } catch (IOException e) {} + } + } + + RodVCF vcfrod = (RodVCF) rod; + VCFRecord record = vcfrod.mCurrentRecord; + + out.println(); + + for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) { + if (gr.getSampleName().equalsIgnoreCase("NA12892") || gr.getSampleName().equalsIgnoreCase("NA10851")) { + out.printf("%s: %s:%s:%s:%s\n", gr.getSampleName(), + gr.toGenotypeString(record.getAlternateAlleles()), + gr.getFields().get("GQ"), + gr.getFields().get("DP"), + gr.getFields().get("HQ")); + } + } + } + } + + return 0; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java new file mode 100755 index 000000000..fdc728839 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/vcftools/VCFSubsetWalker.java @@ -0,0 +1,133 @@ +package org.broadinstitute.sting.playground.gatk.walkers.vcftools; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RodVCF; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.StingException; + +import java.util.*; +import java.io.File; + +public class VCFSubsetWalker extends RefWalker, VCFWriter> { + @Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false) + private HashSet SAMPLES; + + @Argument(fullName="vcfsubset", shortName="O", doc="File to write VCF subset to", required=false) + private File VPATH; + + @Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false) + private boolean INCLUDE_NON_VARIANTS = false; + + @Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false) + private boolean INCLUDE_FILTERED = false; + + private VCFHeader vheader = null; + private VCFWriter vwriter = null; + + public void initializeWriter() { + Map metaData = new HashMap(); + Set additionalColumns = new HashSet(); + + metaData.put("format", "VCRv3.2"); + metaData.put("source", "VariantsToVCF"); + metaData.put("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath()); + + additionalColumns.add("FORMAT"); + additionalColumns.addAll(SAMPLES); + + vheader = new VCFHeader(metaData, additionalColumns); + if (VPATH != null) { + vwriter = new VCFWriter(vheader, VPATH); + } + } + + public ArrayList map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + ArrayList records = new ArrayList(); + + for (ReferenceOrderedDatum rod : tracker.getAllRods()) { + if (rod instanceof RodVCF) { + RodVCF vcfrod = (RodVCF) rod; + VCFRecord record = vcfrod.mCurrentRecord; + + if (SAMPLES == null) { + SAMPLES = new HashSet(); + SAMPLES.addAll(vcfrod.getHeader().getGenotypeSamples()); + } + + if (VPATH != null && vwriter == null) { + initializeWriter(); + } + + //out.println(record.toStringRepresentation(vcfrod.getHeader())); + + records.add(record); + } + } + + return records; + } + + public VCFWriter reduceInit() { + return vwriter; + } + + private VCFRecord subsetRecord(VCFRecord record) { + ArrayList genotypeRecords = new ArrayList(); + for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) { + //if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) { + if (SAMPLES.contains(gr.getSampleName())) { + //out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles())); + genotypeRecords.add(gr); + } + } + + VCFRecord subset = new VCFRecord(record.getReferenceBase(), + record.getChromosome(), + (int) record.getPosition(), + record.getID(), + record.getAlternateAlleles(), + record.getQual(), + record.getFilterString(), + record.getInfoValues(), + record.getGenotypeFormatString(), + genotypeRecords); + + return subset; + } + + public VCFWriter reduce(ArrayList records, VCFWriter writer) { + for (VCFRecord record : records) { + VCFRecord subset = subsetRecord(record); + + boolean isVariant = false; + for (VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles()) { + if (record.getReferenceBase() != ge.getBases().charAt(0)) { + isVariant = true; + } + } + + //if (isVariant && !subset.isFiltered()) { + if ((isVariant || INCLUDE_NON_VARIANTS) && (!subset.isFiltered() || INCLUDE_FILTERED)) { + if (writer != null) { + writer.addRecord(subset); + } else { + out.println(subset.toStringRepresentation(vheader)); + } + } + } + + return writer; + } + + public void onTraversalDone(VCFWriter writer) { + if (writer != null) { + writer.close(); + } + } +}