Example code for a bug in the VCF implementation. See JIRA entry at http://jira.broadinstitute.org:8008/browse/GSA-225

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2050 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-11-15 09:27:12 +00:00
parent 768f865035
commit 97ed945797
2 changed files with 218 additions and 0 deletions

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import java.io.FileReader;
import java.io.IOException;
import java.io.BufferedReader;
import java.util.ArrayList;
public class VCFMixedUp extends RefWalker<Integer, Integer> {
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
for (ReferenceOrderedDatum rod : tracker.getAllRods()) {
if (rod instanceof RodVCF) {
for (String rodfile : this.getToolkit().getArguments().RODBindings) {
String[] rodpieces = rodfile.split(",");
if (rodpieces[2].contains(".vcf")) {
try {
FileReader reader = new FileReader(rodpieces[2]);
BufferedReader breader = new BufferedReader(reader);
ArrayList<String> header = new ArrayList<String>();
ArrayList<String> variant = new ArrayList<String>();
String line;
while ((line = breader.readLine()) != null) {
String[] pieces = line.split("\\s+");
if (line.contains("##")) {
} else if (line.contains("#CHROM")) {
for (int i = 0; i < pieces.length; i++) {
header.add(i, pieces[i]);
}
} else {
for (int i = 0; i < pieces.length; i++) {
variant.add(i, pieces[i]);
}
for (int i = 0; i < pieces.length; i++) {
if (i < 9 || header.get(i).contains("NA12892") || header.get(i).contains("NA10851")) {
out.printf("%s => %s\n", header.get(i), variant.get(i));
}
}
}
}
} catch (IOException e) {}
}
}
RodVCF vcfrod = (RodVCF) rod;
VCFRecord record = vcfrod.mCurrentRecord;
out.println();
for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) {
if (gr.getSampleName().equalsIgnoreCase("NA12892") || gr.getSampleName().equalsIgnoreCase("NA10851")) {
out.printf("%s: %s:%s:%s:%s\n", gr.getSampleName(),
gr.toGenotypeString(record.getAlternateAlleles()),
gr.getFields().get("GQ"),
gr.getFields().get("DP"),
gr.getFields().get("HQ"));
}
}
}
}
return 0;
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer value, Integer sum) {
return 0;
}
}

View File

@ -0,0 +1,133 @@
package org.broadinstitute.sting.playground.gatk.walkers.vcftools;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
import java.io.File;
public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter> {
@Argument(fullName="sample", shortName="SN", doc="Sample to include (or nothing to specify all samples)", required=false)
private HashSet<String> SAMPLES;
@Argument(fullName="vcfsubset", shortName="O", doc="File to write VCF subset to", required=false)
private File VPATH;
@Argument(fullName="includeNonVariants", shortName="INV", doc="Include non-variant loci", required=false)
private boolean INCLUDE_NON_VARIANTS = false;
@Argument(fullName="includeFiltered", shortName="IF", doc="Include filtered loci", required=false)
private boolean INCLUDE_FILTERED = false;
private VCFHeader vheader = null;
private VCFWriter vwriter = null;
public void initializeWriter() {
Map<String, String> metaData = new HashMap<String, String>();
Set<String> additionalColumns = new HashSet<String>();
metaData.put("format", "VCRv3.2");
metaData.put("source", "VariantsToVCF");
metaData.put("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath());
additionalColumns.add("FORMAT");
additionalColumns.addAll(SAMPLES);
vheader = new VCFHeader(metaData, additionalColumns);
if (VPATH != null) {
vwriter = new VCFWriter(vheader, VPATH);
}
}
public ArrayList<VCFRecord> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
ArrayList<VCFRecord> records = new ArrayList<VCFRecord>();
for (ReferenceOrderedDatum rod : tracker.getAllRods()) {
if (rod instanceof RodVCF) {
RodVCF vcfrod = (RodVCF) rod;
VCFRecord record = vcfrod.mCurrentRecord;
if (SAMPLES == null) {
SAMPLES = new HashSet<String>();
SAMPLES.addAll(vcfrod.getHeader().getGenotypeSamples());
}
if (VPATH != null && vwriter == null) {
initializeWriter();
}
//out.println(record.toStringRepresentation(vcfrod.getHeader()));
records.add(record);
}
}
return records;
}
public VCFWriter reduceInit() {
return vwriter;
}
private VCFRecord subsetRecord(VCFRecord record) {
ArrayList<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
for (VCFGenotypeRecord gr : record.getVCFGenotypeRecords()) {
//if (gr.getSampleName().equalsIgnoreCase(SAMPLE)) {
if (SAMPLES.contains(gr.getSampleName())) {
//out.println(gr.getSampleName() + " " + gr.toGenotypeString(record.getAlternateAlleles()));
genotypeRecords.add(gr);
}
}
VCFRecord subset = new VCFRecord(record.getReferenceBase(),
record.getChromosome(),
(int) record.getPosition(),
record.getID(),
record.getAlternateAlleles(),
record.getQual(),
record.getFilterString(),
record.getInfoValues(),
record.getGenotypeFormatString(),
genotypeRecords);
return subset;
}
public VCFWriter reduce(ArrayList<VCFRecord> records, VCFWriter writer) {
for (VCFRecord record : records) {
VCFRecord subset = subsetRecord(record);
boolean isVariant = false;
for (VCFGenotypeEncoding ge : subset.getVCFGenotypeRecords().get(0).getAlleles()) {
if (record.getReferenceBase() != ge.getBases().charAt(0)) {
isVariant = true;
}
}
//if (isVariant && !subset.isFiltered()) {
if ((isVariant || INCLUDE_NON_VARIANTS) && (!subset.isFiltered() || INCLUDE_FILTERED)) {
if (writer != null) {
writer.addRecord(subset);
} else {
out.println(subset.toStringRepresentation(vheader));
}
}
}
return writer;
}
public void onTraversalDone(VCFWriter writer) {
if (writer != null) {
writer.close();
}
}
}