Updated. Now uses VariantContext and is truly "variants" to vcf (i.e. not just GELI to vcf).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3074 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2373a4618f
commit
4398a8b370
|
|
@ -1,202 +1,72 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RodGeliText;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts variants from other file formats (anything that implements the Variation interface) to VCF format.
|
* Converts variants from other file formats to VCF format.
|
||||||
*/
|
*/
|
||||||
public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
|
||||||
@Argument(fullName="vcfout", shortName="VO", doc="The output VCF file") public File VCF_OUT;
|
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||||
@Argument(fullName="verbose", shortName="V", doc="Show extended output", required=false) public boolean VERBOSE = false;
|
|
||||||
@Argument(fullName="suppress_multistate_alleles", shortName="SMA", doc="When multi-sample genotypes imply something other than a biallele state, suppress it.", required=false) public boolean SUPPRESS_MULTISTATE = false;
|
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
|
||||||
|
protected String sampleName = null;
|
||||||
|
|
||||||
private VCFWriter vcfwriter = null;
|
private VCFWriter vcfwriter = null;
|
||||||
private VCFHeader vcfheader = null;
|
|
||||||
private TreeMap<String, String> sampleNames = null;
|
|
||||||
private static String format = "GT:GQ:DP";
|
|
||||||
|
|
||||||
public void initialize() {
|
// Don't allow mixed types for now
|
||||||
sampleNames = new TreeMap<String, String>();
|
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL);
|
||||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
|
||||||
|
|
||||||
for (String rodName : args.RODBindings) {
|
private String[] ALLOWED_FORMAT_FIELDS = {"GT"};
|
||||||
//out.println(rodName);
|
|
||||||
String[] rodPieces = rodName.split(",");
|
|
||||||
String sampleName = rodPieces[0];
|
|
||||||
|
|
||||||
if (! ( sampleName.equals("dbSNP") || sampleName.equals("interval") ) ) {
|
|
||||||
sampleNames.put(sampleName, sampleName);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
vcfheader = getHeader(args, sampleNames.keySet());
|
|
||||||
vcfwriter = new VCFWriter(VCF_OUT);
|
|
||||||
vcfwriter.writeHeader(vcfheader);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
|
|
||||||
|
|
||||||
// Don't output the data for now because it kills our unit test MD5s and is optional
|
|
||||||
// TODO - figure out what to do here
|
|
||||||
//Calendar cal = Calendar.getInstance();
|
|
||||||
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
|
|
||||||
|
|
||||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
|
||||||
metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
|
||||||
metaData.add(new VCFHeaderLine("reference", args.referenceFile.getAbsolutePath()));
|
|
||||||
|
|
||||||
Set<String> additionalColumns = new HashSet<String>();
|
|
||||||
additionalColumns.add("FORMAT");
|
|
||||||
additionalColumns.addAll(sampleNames);
|
|
||||||
|
|
||||||
return new VCFHeader(metaData, additionalColumns);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
|
||||||
if (BaseUtils.simpleBaseToBaseIndex(ref) > -1) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (ReferenceOrderedDatum rod : tracker.getAllRods()) {
|
|
||||||
if (rod != null && sampleNames.keySet().contains(rod.getName().toUpperCase())) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
|
if ( tracker == null )
|
||||||
Map<VCFHeader.HEADER_FIELDS, String> map = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
return 0;
|
||||||
VCFRecord rec = generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE);
|
|
||||||
if (rec != null) {
|
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||||
vcfwriter.addRecord(rec);
|
|
||||||
|
Collection<VariantContext> contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
|
||||||
|
for ( VariantContext vc : contexts ) {
|
||||||
|
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
|
||||||
|
if ( dbsnp != null )
|
||||||
|
vcf.setID(dbsnp.getRS_ID());
|
||||||
|
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null )
|
||||||
|
vcf.getGenotype("variant").setSampleName(sampleName);
|
||||||
|
writeRecord(vcf);
|
||||||
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
return 0;
|
|
||||||
|
private void writeRecord(VCFRecord rec) {
|
||||||
|
if ( vcfwriter == null ) {
|
||||||
|
// setup the header fields
|
||||||
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||||
|
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
|
||||||
|
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||||
|
|
||||||
|
vcfwriter = new VCFWriter(out);
|
||||||
|
|
||||||
|
TreeSet<String> samples = new TreeSet<String>();
|
||||||
|
if ( sampleName != null )
|
||||||
|
samples.add(sampleName);
|
||||||
|
else
|
||||||
|
samples.addAll(Arrays.asList(rec.getSampleNames()));
|
||||||
|
|
||||||
|
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
vcfwriter.addRecord(rec);
|
||||||
public static VCFRecord generateVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context,
|
|
||||||
VCFHeader vcfheader, List<VCFGenotypeRecord> gt, Map<VCFHeader.HEADER_FIELDS, String> map,
|
|
||||||
Map<String, String> sampleNamesToRods, PrintStream out, boolean SUPPRESS_MULTISTATE, boolean VERBOSE) {
|
|
||||||
int[] alleleCounts = new int[4];
|
|
||||||
int numSNPs = 0;
|
|
||||||
int numRefs = 0;
|
|
||||||
int[] alleleNames = {0, 1, 2, 3};
|
|
||||||
double snpQual = 0.0;
|
|
||||||
int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
|
||||||
List<VCFGenotypeEncoding> alts = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
for (String name : vcfheader.getGenotypeSamples()) {
|
|
||||||
ReferenceOrderedDatum rod = tracker.lookup(sampleNamesToRods.get(name), null);
|
|
||||||
if (rod != null) {
|
|
||||||
Variation av = (Variation) rod;
|
|
||||||
String lod = String.format("%d", av.getNegLog10PError() > 99 ? 99 : (int) av.getNegLog10PError());
|
|
||||||
int depth = 0;
|
|
||||||
|
|
||||||
if (rod instanceof RodGeliText) {
|
|
||||||
RodGeliText rv = (RodGeliText) rod;
|
|
||||||
depth = rv.depth;
|
|
||||||
}
|
|
||||||
if (!(rod instanceof VariantBackedByGenotype))
|
|
||||||
throw new IllegalArgumentException("The passed in variant type must be backed by genotype data");
|
|
||||||
Genotype genotype = ((VariantBackedByGenotype) rod).getCalledGenotype();
|
|
||||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
for (char base : genotype.getBases().toCharArray()) {
|
|
||||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(base)));
|
|
||||||
if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(new VCFGenotypeEncoding(String.valueOf(base)));
|
|
||||||
}
|
|
||||||
int allele1 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(0));
|
|
||||||
int allele2 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(1));
|
|
||||||
if (allele1 >= 0 && allele1 != refbase) {
|
|
||||||
alleleCounts[allele1]++;
|
|
||||||
}
|
|
||||||
if (allele2 >= 0 && allele2 != refbase) {
|
|
||||||
alleleCounts[allele2]++;
|
|
||||||
}
|
|
||||||
Map<String, String> str = new HashMap<String, String>();
|
|
||||||
str.put("GQ", lod);
|
|
||||||
if (depth > 0) str.put("DP", String.valueOf(depth));
|
|
||||||
|
|
||||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
|
|
||||||
for ( Map.Entry<String, String> entry : str.entrySet() )
|
|
||||||
rec.setField(entry.getKey(), entry.getValue());
|
|
||||||
gt.add(rec);
|
|
||||||
|
|
||||||
numSNPs++;
|
|
||||||
snpQual += av.getNegLog10PError();
|
|
||||||
} else if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1) {
|
|
||||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
|
||||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
|
|
||||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
|
|
||||||
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED));
|
|
||||||
|
|
||||||
numRefs++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (numSNPs == 0)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
|
|
||||||
Integer[] perm = Utils.SortPermutation(alleleCounts);
|
|
||||||
int[] sortedCounts = Utils.PermuteArray(alleleCounts, perm);
|
|
||||||
int[] sortedNames = Utils.PermuteArray(alleleNames, perm);
|
|
||||||
|
|
||||||
rodDbSNP dbsnp = (rodDbSNP) tracker.lookup("dbsnp", null);
|
|
||||||
|
|
||||||
String infoString = String.format("locus=%s ref=%c allele_count=( %c:%d %c:%d %c:%d %c:%d )",
|
|
||||||
context.getLocation(),
|
|
||||||
ref.getBase(),
|
|
||||||
BaseUtils.baseIndexToSimpleBase(sortedNames[0]), sortedCounts[0],
|
|
||||||
BaseUtils.baseIndexToSimpleBase(sortedNames[1]), sortedCounts[1],
|
|
||||||
BaseUtils.baseIndexToSimpleBase(sortedNames[2]), sortedCounts[2],
|
|
||||||
BaseUtils.baseIndexToSimpleBase(sortedNames[3]), sortedCounts[3]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (SUPPRESS_MULTISTATE && sortedCounts[2] > 0) {
|
|
||||||
out.println("[multistate] " + infoString);
|
|
||||||
return null;
|
|
||||||
} else {
|
|
||||||
if (VERBOSE) {
|
|
||||||
out.println("[locus_info] " + infoString);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Map<String,String> info = new HashMap<String,String>();
|
|
||||||
if (dbsnp != null) info.put("DB","1");
|
|
||||||
if (dbsnp != null && dbsnp.isHapmap()) info.put("H2","1");
|
|
||||||
|
|
||||||
return new VCFRecord(Character.toString(ref.getBase()),
|
|
||||||
context.getContig(),
|
|
||||||
(int) context.getPosition(),
|
|
||||||
(dbsnp == null) ? "." : dbsnp.getRS_ID(),
|
|
||||||
alts,
|
|
||||||
snpQual > 99 ? 99 : (int) snpQual,
|
|
||||||
"0",
|
|
||||||
info,
|
|
||||||
format,
|
|
||||||
gt);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -208,8 +78,6 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone(Integer sum) {
|
public void onTraversalDone(Integer sum) {
|
||||||
out.println("Processed " + sum + " variants.");
|
|
||||||
|
|
||||||
vcfwriter.close();
|
vcfwriter.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,6 @@ import org.junit.Test;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.io.File;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -18,62 +17,38 @@ import java.io.File;
|
||||||
public class VariantsToVCFIntegrationTest extends WalkerTest {
|
public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
|
|
||||||
//@Test
|
@Test
|
||||||
public void testVariantsToVCFUsingGeliInput() {
|
public void testVariantsToVCFUsingGeliInput() {
|
||||||
List<String> md5 = new ArrayList<String>();
|
List<String> md5 = new ArrayList<String>();
|
||||||
md5.add("a94c15f2e8905fd3e98301375cf0f42a");
|
md5.add("8f32f0efed5d0233cf9292198f4f01d8");
|
||||||
|
|
||||||
/**
|
|
||||||
* the above MD5 was calculated from running the following command:
|
|
||||||
*
|
|
||||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
|
||||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
|
||||||
* -T VariantEval \
|
|
||||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
|
||||||
* -L 1:10,000,000-11,000,000 \
|
|
||||||
* --outerr myVariantEval \
|
|
||||||
* --supressDateInformation \
|
|
||||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||||
" -T VariantsToVCF" +
|
" -T VariantsToVCF" +
|
||||||
" -L 1:10,000,000-11,000,000" +
|
" -L 1:10,000,000-11,000,000" +
|
||||||
" --vcfout %s",
|
" -sample NA123AB" +
|
||||||
|
" -o %s",
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
md5);
|
md5);
|
||||||
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
|
executeTest("testVariantsToVCFUsingGeliInput #1", spec).getFirst();
|
||||||
}
|
}
|
||||||
|
|
||||||
//@Test
|
@Test
|
||||||
public void testGenotypesToVCFUsingGeliInput() {
|
public void testGenotypesToVCFUsingGeliInput() {
|
||||||
List<String> md5 = new ArrayList<String>();
|
List<String> md5 = new ArrayList<String>();
|
||||||
md5.add("6b18f33e25edbd2154c17a949656644b");
|
md5.add("40cc4d04d9a50043ce1322ea2650c453");
|
||||||
|
|
||||||
/**
|
|
||||||
* the above MD5 was calculated from running the following command:
|
|
||||||
*
|
|
||||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
|
||||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
|
||||||
* -T VariantEval \
|
|
||||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
|
||||||
* -L 1:10,000,000-11,000,000 \
|
|
||||||
* --outerr myVariantEval \
|
|
||||||
* --supressDateInformation \
|
|
||||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||||
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
||||||
" -T VariantsToVCF" +
|
" -T VariantsToVCF" +
|
||||||
" -L 1:10,000,000-11,000,000" +
|
" -L 1:10,000,000-11,000,000" +
|
||||||
" --vcfout %s",
|
" -sample NA123AB" +
|
||||||
|
" -o %s",
|
||||||
1, // just one output file
|
1, // just one output file
|
||||||
md5);
|
md5);
|
||||||
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
|
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue