Updated. Now uses VariantContext and is truly "variants" to vcf (i.e. not just GELI to vcf).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3074 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-25 04:53:31 +00:00
parent 2373a4618f
commit 4398a8b370
2 changed files with 57 additions and 214 deletions

View File

@ -1,202 +1,72 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodGeliText;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
/**
* Converts variants from other file formats (anything that implements the Variation interface) to VCF format.
* Converts variants from other file formats to VCF format.
*/
public class VariantsToVCF extends RefWalker<Integer, Integer> {
@Argument(fullName="vcfout", shortName="VO", doc="The output VCF file") public File VCF_OUT;
@Argument(fullName="verbose", shortName="V", doc="Show extended output", required=false) public boolean VERBOSE = false;
@Argument(fullName="suppress_multistate_alleles", shortName="SMA", doc="When multi-sample genotypes imply something other than a biallele state, suppress it.", required=false) public boolean SUPPRESS_MULTISTATE = false;
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
public class VariantsToVCF extends RodWalker<Integer, Integer> {
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
protected String sampleName = null;
private VCFWriter vcfwriter = null;
private VCFHeader vcfheader = null;
private TreeMap<String, String> sampleNames = null;
private static String format = "GT:GQ:DP";
public void initialize() {
sampleNames = new TreeMap<String, String>();
GATKArgumentCollection args = this.getToolkit().getArguments();
// Don't allow mixed types for now
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL);
for (String rodName : args.RODBindings) {
//out.println(rodName);
String[] rodPieces = rodName.split(",");
String sampleName = rodPieces[0];
if (! ( sampleName.equals("dbSNP") || sampleName.equals("interval") ) ) {
sampleNames.put(sampleName, sampleName);
}
}
vcfheader = getHeader(args, sampleNames.keySet());
vcfwriter = new VCFWriter(VCF_OUT);
vcfwriter.writeHeader(vcfheader);
}
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
// Don't output the data for now because it kills our unit test MD5s and is optional
// TODO - figure out what to do here
//Calendar cal = Calendar.getInstance();
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
metaData.add(new VCFHeaderLine("reference", args.referenceFile.getAbsolutePath()));
Set<String> additionalColumns = new HashSet<String>();
additionalColumns.add("FORMAT");
additionalColumns.addAll(sampleNames);
return new VCFHeader(metaData, additionalColumns);
}
public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
if (BaseUtils.simpleBaseToBaseIndex(ref) > -1) {
return true;
}
for (ReferenceOrderedDatum rod : tracker.getAllRods()) {
if (rod != null && sampleNames.keySet().contains(rod.getName().toUpperCase())) {
return true;
}
}
return false;
}
private String[] ALLOWED_FORMAT_FIELDS = {"GT"};
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
Map<VCFHeader.HEADER_FIELDS, String> map = new HashMap<VCFHeader.HEADER_FIELDS, String>();
VCFRecord rec = generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE);
if (rec != null) {
vcfwriter.addRecord(rec);
return 1;
if ( tracker == null )
return 0;
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
Collection<VariantContext> contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
for ( VariantContext vc : contexts ) {
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
if ( dbsnp != null )
vcf.setID(dbsnp.getRS_ID());
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null )
vcf.getGenotype("variant").setSampleName(sampleName);
writeRecord(vcf);
}
return 0;
return 1;
}
public static VCFRecord generateVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context,
VCFHeader vcfheader, List<VCFGenotypeRecord> gt, Map<VCFHeader.HEADER_FIELDS, String> map,
Map<String, String> sampleNamesToRods, PrintStream out, boolean SUPPRESS_MULTISTATE, boolean VERBOSE) {
int[] alleleCounts = new int[4];
int numSNPs = 0;
int numRefs = 0;
int[] alleleNames = {0, 1, 2, 3};
double snpQual = 0.0;
int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
List<VCFGenotypeEncoding> alts = new ArrayList<VCFGenotypeEncoding>();
for (String name : vcfheader.getGenotypeSamples()) {
ReferenceOrderedDatum rod = tracker.lookup(sampleNamesToRods.get(name), null);
if (rod != null) {
Variation av = (Variation) rod;
String lod = String.format("%d", av.getNegLog10PError() > 99 ? 99 : (int) av.getNegLog10PError());
int depth = 0;
private void writeRecord(VCFRecord rec) {
if ( vcfwriter == null ) {
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
if (rod instanceof RodGeliText) {
RodGeliText rv = (RodGeliText) rod;
depth = rv.depth;
}
if (!(rod instanceof VariantBackedByGenotype))
throw new IllegalArgumentException("The passed in variant type must be backed by genotype data");
Genotype genotype = ((VariantBackedByGenotype) rod).getCalledGenotype();
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
for (char base : genotype.getBases().toCharArray()) {
alleles.add(new VCFGenotypeEncoding(String.valueOf(base)));
if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(new VCFGenotypeEncoding(String.valueOf(base)));
}
int allele1 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(0));
int allele2 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(1));
if (allele1 >= 0 && allele1 != refbase) {
alleleCounts[allele1]++;
}
if (allele2 >= 0 && allele2 != refbase) {
alleleCounts[allele2]++;
}
Map<String, String> str = new HashMap<String, String>();
str.put("GQ", lod);
if (depth > 0) str.put("DP", String.valueOf(depth));
vcfwriter = new VCFWriter(out);
VCFGenotypeRecord rec = new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
for ( Map.Entry<String, String> entry : str.entrySet() )
rec.setField(entry.getKey(), entry.getValue());
gt.add(rec);
TreeSet<String> samples = new TreeSet<String>();
if ( sampleName != null )
samples.add(sampleName);
else
samples.addAll(Arrays.asList(rec.getSampleNames()));
numSNPs++;
snpQual += av.getNegLog10PError();
} else if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1) {
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED));
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
numRefs++;
}
}
if (numSNPs == 0)
return null;
Integer[] perm = Utils.SortPermutation(alleleCounts);
int[] sortedCounts = Utils.PermuteArray(alleleCounts, perm);
int[] sortedNames = Utils.PermuteArray(alleleNames, perm);
rodDbSNP dbsnp = (rodDbSNP) tracker.lookup("dbsnp", null);
String infoString = String.format("locus=%s ref=%c allele_count=( %c:%d %c:%d %c:%d %c:%d )",
context.getLocation(),
ref.getBase(),
BaseUtils.baseIndexToSimpleBase(sortedNames[0]), sortedCounts[0],
BaseUtils.baseIndexToSimpleBase(sortedNames[1]), sortedCounts[1],
BaseUtils.baseIndexToSimpleBase(sortedNames[2]), sortedCounts[2],
BaseUtils.baseIndexToSimpleBase(sortedNames[3]), sortedCounts[3]
);
if (SUPPRESS_MULTISTATE && sortedCounts[2] > 0) {
out.println("[multistate] " + infoString);
return null;
} else {
if (VERBOSE) {
out.println("[locus_info] " + infoString);
}
}
Map<String,String> info = new HashMap<String,String>();
if (dbsnp != null) info.put("DB","1");
if (dbsnp != null && dbsnp.isHapmap()) info.put("H2","1");
return new VCFRecord(Character.toString(ref.getBase()),
context.getContig(),
(int) context.getPosition(),
(dbsnp == null) ? "." : dbsnp.getRS_ID(),
alts,
snpQual > 99 ? 99 : (int) snpQual,
"0",
info,
format,
gt);
vcfwriter.addRecord(rec);
}
public Integer reduceInit() {
@ -208,8 +78,6 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
}
public void onTraversalDone(Integer sum) {
out.println("Processed " + sum + " variants.");
vcfwriter.close();
}
}

View File

@ -5,7 +5,6 @@ import org.junit.Test;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
/**
@ -18,62 +17,38 @@ import java.io.File;
public class VariantsToVCFIntegrationTest extends WalkerTest {
//@Test
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("a94c15f2e8905fd3e98301375cf0f42a");
md5.add("8f32f0efed5d0233cf9292198f4f01d8");
/**
* the above MD5 was calculated from running the following command:
*
* java -jar ./dist/GenomeAnalysisTK.jar \
* -R /broad/1KG/reference/human_b36_both.fasta \
* -T VariantEval \
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
* -L 1:10,000,000-11,000,000 \
* --outerr myVariantEval \
* --supressDateInformation \
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
*
*/
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
" -T VariantsToVCF" +
" -L 1:10,000,000-11,000,000" +
" --vcfout %s",
" -sample NA123AB" +
" -o %s",
1, // just one output file
md5);
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
executeTest("testVariantsToVCFUsingGeliInput #1", spec).getFirst();
}
//@Test
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("6b18f33e25edbd2154c17a949656644b");
md5.add("40cc4d04d9a50043ce1322ea2650c453");
/**
* the above MD5 was calculated from running the following command:
*
* java -jar ./dist/GenomeAnalysisTK.jar \
* -R /broad/1KG/reference/human_b36_both.fasta \
* -T VariantEval \
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
* -L 1:10,000,000-11,000,000 \
* --outerr myVariantEval \
* --supressDateInformation \
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
*
*/
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
" -T VariantsToVCF" +
" -L 1:10,000,000-11,000,000" +
" --vcfout %s",
" -sample NA123AB" +
" -o %s",
1, // just one output file
md5);
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
}
}