VariantContext to VCF converter. BeagleROD, and phasing of VCF calls. Integration tests galore :-)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2814 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-02-09 19:02:25 +00:00
parent 369cc50802
commit 934d4b93a2
9 changed files with 318 additions and 7 deletions

View File

@ -108,6 +108,7 @@ public class Genotype {
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
*/
public boolean isNoCall() {
// a genotype is a "no call" exactly when its type is NO_CALL (e.g. './.' in VCF)
return getType() == Type.NO_CALL;
}
// true when this genotype was actually called, i.e. the logical complement of isNoCall()
public boolean isCalled() { return ! isNoCall(); }
public void validate() {
// todo -- add validation checking here

View File

@ -1,8 +1,6 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
@ -182,6 +180,16 @@ public class VariantContextAdaptors {
return null; // can't handle anything else
}
/**
 * Creates a VCFHeader suitable for writing out records derived from vc.
 *
 * @param hInfo meta-data lines for the header; may be null, in which case an
 *              empty set of header lines is used
 * @param vc    the context whose sample names populate the header's sample columns
 * @return a new VCFHeader over the samples of vc
 */
public static VCFHeader createVCFHeader(Set<VCFHeaderLine> hInfo, VariantContext vc) {
    // declare against the Set interface; the LinkedHashSet implementation is
    // what guarantees the sample columns keep the sorted-by-name genotype order
    Set<String> names = new LinkedHashSet<String>();
    for ( Genotype g : vc.getGenotypesSortedByName() ) {
        names.add(g.getSampleName());
    }

    return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
}
public static VCFRecord toVCF(VariantContext vc) {
// deal with the reference
char referenceBase = 'N'; // by default we'll use N
@ -189,6 +197,10 @@ public class VariantContextAdaptors {
referenceBase = (char)vc.getReference().getBases()[0];
}
if ( ! vc.isSNP() )
// todo -- update the code so it works correctly with indels
throw new StingException("VariantContext -> VCF converter currently doesn't support indels; complain to the GATK team");
String contig = vc.getLocation().getContig();
long position = vc.getLocation().getStart();
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : ".";

View File

@ -5,10 +5,16 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumSet;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.io.File;
/**
* Test routine for new VariantContext object
@ -26,6 +32,17 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false)
boolean printContexts = false;
@Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false)
String outputVCF = null;
private VCFWriter writer = null;
private boolean wroteHeader = false;
// create the VCF writer only when the user requested VCF output via --outputVCF
public void initialize() {
    if ( outputVCF == null )
        return;
    writer = new VCFWriter(new File(outputVCF));
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( ref == null )
return 0;
@ -34,6 +51,15 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
int n = 0;
for (VariantContext vc : tracker.getAllVariantContexts(allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) {
if ( writer != null && n == 0 ) {
if ( ! wroteHeader ) {
writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc));
wroteHeader = true;
}
writer.addRecord(VariantContextAdaptors.toVCF(vc));
}
n++;
if ( printContexts ) out.printf(" %s%n", vc);
}

View File

@ -98,7 +98,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
Genotype dadG = vc.getGenotype(trio.dad);
Genotype childG = vc.getGenotype(trio.child);
if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
if ( includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG) ) {
// all genotypes are good, so let's see if child is a violation
if ( isViolation(vc, momG, dadG, childG) ) {
@ -152,6 +152,10 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
}
}
// a genotype participates in the trio analysis only when its confidence
// exceeds the quality threshold and it was actually called
private boolean includeGenotype(Genotype g) {
    final boolean confidentEnough = g.getNegLog10PError() > getQThreshold();
    return confidentEnough && g.isCalled();
}
// the two categories of Mendelian violation tracked by this evaluator
// NOTE(review): names suggest a missed call vs. a spurious call -- confirm against usage
private enum ViolationType {
UNDER_CALL, OVER_CALL
}

View File

@ -196,6 +196,9 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
//System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases());
if ( ref == null )
return 0;
Map<String, VariantContext> comps = getCompVariantContexts(tracker, context);
// to enable walking over pairs where eval or comps have no elements

View File

@ -0,0 +1,129 @@
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
/**
 * Phases the genotypes of a dad/mom/child trio in a VCF track using the
 * haplotype ordering produced by Beagle (bound as the "beagle" ROD), writing
 * the resulting (possibly phased) records back out as VCF.
 */
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class), @RMD(name="beagle",type=BeagleROD.class)})
public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
    @Argument(shortName="trio", doc="If provided, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false)
    protected String TRIO_STRUCTURE;

    @Argument(shortName="eth", fullName="excludeTripleHets", doc="If provided, sites that are triple het calls will not be phased, regardless of Beagle's value", required=false)
    protected boolean dontPhaseTripleHets = false;

    private MendelianViolationEvaluator.TrioStructure trio = null;
    private VCFWriter writer;
    private boolean headerWritten = false;

    private final static String TRACK_NAME = "variants";
    private final static String BEAGLE_NAME = "beagle";

    public void initialize() {
        trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE);
        writer = new VCFWriter(out);
    }

    /**
     * Fetches the variant context at this locus and, when Beagle data is
     * present, rewrites its trio genotypes as phased.
     *
     * @return the (possibly phased) VariantContext, or null when there is no
     *         variant at this locus
     */
    public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        VariantContext vc = null;
        if ( ref != null ) {
            vc = tracker.getVariantContext(TRACK_NAME, null, context.getLocation(), false);
            BeagleROD beagle = (BeagleROD)tracker.lookup(BEAGLE_NAME, null);

            if ( vc != null ) {
                // lazily emit the header, copied from the input VCF ROD,
                // the first time we actually see a record
                if ( ! headerWritten ) {
                    RodVCF vcfrod = (RodVCF)tracker.lookup(TRACK_NAME, null);
                    writer.writeHeader(vcfrod.getHeader());
                    headerWritten = true;
                }

                vc = maybePhaseVC(vc, beagle);
            }
        }

        return vc;
    }

    /**
     * Returns a phased copy of unphased according to the Beagle haplotype
     * ordering, or the original context unchanged when there is no Beagle
     * record at this site (or triple hets are excluded and all three samples
     * are het).
     */
    private VariantContext maybePhaseVC(VariantContext unphased, BeagleROD beagle) {
        if ( beagle == null ) {
            return unphased;
        }

        Map<String, List<String>> bglData = beagle.getGenotypes();
        List<String> momBgl = bglData.get(trio.mom);
        List<String> dadBgl = bglData.get(trio.dad);

        Genotype unphasedMom = unphased.getGenotype(trio.mom);
        Genotype unphasedDad = unphased.getGenotype(trio.dad);
        Genotype unphasedKid = unphased.getGenotype(trio.child);

        if ( dontPhaseTripleHets && unphasedMom.isHet() && unphasedDad.isHet() && unphasedKid.isHet() )
            return unphased;

        // the first Beagle allele of each parent is treated as the transmitted
        // one: the child's phased genotype is built from the two transmitted alleles
        Allele momTrans = unphased.getAllele(momBgl.get(0));
        Allele momUntrans = unphased.getAllele(momBgl.get(1));
        Allele dadTrans = unphased.getAllele(dadBgl.get(0));
        Allele dadUntrans = unphased.getAllele(dadBgl.get(1));

        Genotype momG = phaseGenotype(unphasedMom, Arrays.asList(momTrans, momUntrans));
        Genotype dadG = phaseGenotype(unphasedDad, Arrays.asList(dadTrans, dadUntrans));
        Genotype kidG = phaseGenotype(unphasedKid, Arrays.asList(momTrans, dadTrans));

        return new VariantContext(unphased.getName(), unphased.getLocation(), unphased.getAlleles(),
                Arrays.asList(momG, dadG, kidG), unphased.getNegLog10PError(), unphased.getFilters(), unphased.getAttributes());
    }

    // copies base with the given allele ordering, marking the genotype as phased
    private Genotype phaseGenotype(Genotype base, List<Allele> alleles) {
        return new Genotype(base.getSampleName(), alleles, base.getNegLog10PError(), base.getFilters(), base.getAttributes(), true);
    }

    public Long reduceInit() {
        return 0L;
    }

    /**
     * Streams each non-null context to the VCF writer and counts it.
     *
     * Note: the old reduce(VariantContext, Integer) overload was removed -- a
     * RodWalker&lt;VariantContext, Long&gt; invokes reduce with a Long accumulator,
     * so the Integer variant was dead code.
     */
    public Long reduce(VariantContext vc, Long prevReduce) {
        if ( vc == null )
            return prevReduce;

        writer.addRecord(VariantContextAdaptors.toVCF(vc));
        return prevReduce + 1;
    }

    // takes Long (the walker's reduce type); an Integer overload would never be invoked
    public void onTraversalDone(Long result) {
        // nothing to report; records were streamed out during reduce
    }
}

View File

@ -0,0 +1,114 @@
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
import java.util.EnumSet;
import java.util.Arrays;
/**
 * Converts a VCF of trio genotypes into Beagle input format: an
 * "I id ..." pedigree header followed by one "M &lt;marker&gt; ..." line per
 * biallelic, unfiltered SNP that passes the trio quality checks.
 */
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class)})
public class VCFToBeagleWalker extends RodWalker<Integer, VCFToBeagleWalker.Result> {
    @Argument(shortName="trio", doc="If provided, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false)
    protected String TRIO_STRUCTURE;

    private MendelianViolationEvaluator.TrioStructure trio = null;

    /** Running totals for the traversal: loci visited vs. sites converted. */
    public static class Result {
        int nVariants, nConverted;
    }

    public void initialize() {
        if ( TRIO_STRUCTURE != null ) {
            trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE);
            // pedigree header: each sample id is listed twice, one per haplotype column
            out.printf("I id %s%n", Utils.join(" ", Arrays.asList(trio.mom, trio.mom, trio.dad, trio.dad, trio.child, trio.child)));
        }
    }

    /**
     * Emits one Beagle "M" line for the SNP at this locus when it is
     * biallelic, unfiltered, and its trio genotypes pass the quality checks.
     *
     * @return 1 when a site was converted, 0 otherwise
     * @throws StingException when the record does not carry exactly three genotypes
     * @throws IllegalArgumentException when no trio structure was supplied
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( ref == null )
            return 0;

        EnumSet<VariantContext.Type> allowedTypes = EnumSet.of(VariantContext.Type.SNP);
        VariantContext vc = tracker.getVariantContext("variants", allowedTypes, context.getLocation(), false);

        if ( vc == null || ! vc.isBiallelic() || ! vc.isNotFiltered() )
            return 0;

        if ( trio == null )
            throw new IllegalArgumentException("VCFToBeagle currently only supports conversion of trios. Complain to mark");

        if ( ! vc.hasGenotypes() || vc.getGenotypes().size() != 3 )
            throw new StingException("Conversion exception: Trio conversion requires exactly three genotypes at every locus: " + vc);

        if ( genotypesAreGood(vc) ) {
            if ( ! genotypesAreGoodForTrios(vc, trio) ) {
                logger.debug("VC excluded due to poor trio genotyping " + vc);
            } else {
                Genotype mom = vc.getGenotype(trio.mom);
                Genotype dad = vc.getGenotype(trio.dad);
                Genotype child = vc.getGenotype(trio.child);

                // beagle format looks like:
                //
                // I id 1001 1001 1002 1002 1003 1003
                // A diabetes 1 1 2 2 2 2
                // M rs2289311 A G G G A G
                String loc = "c" + vc.getLocation().getContig() + "_p" + vc.getLocation().getStart();
                out.printf("M %s %s %s %s%n", loc, genotype2BeagleString(mom), genotype2BeagleString(dad), genotype2BeagleString(child));
                return 1;
            }
        }

        return 0;
    }

    // renders a genotype as its two alleles separated by a space
    private String genotype2BeagleString(Genotype g) {
        return allele2BeagleString(g.getAllele(0)) + " " + allele2BeagleString(g.getAllele(1));
    }

    private String allele2BeagleString(Allele a) {
        return new String(a.getBases());
    }

    // true when no individual genotype at this site is filtered out
    private static boolean genotypesAreGood(VariantContext vc) {
        for ( Genotype g : vc.getGenotypes().values() ) {
            if ( g.isFiltered() )
                return false;
        }

        return true;
    }

    // true when the trio genotypes do not constitute a Mendelian violation
    private static boolean genotypesAreGoodForTrios(VariantContext vc, MendelianViolationEvaluator.TrioStructure trio) {
        return ! MendelianViolationEvaluator.isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child));
    }

    public Result reduceInit() {
        return new Result();
    }

    /** Counts every locus visited and accumulates the converted sites. */
    public Result reduce(Integer point, Result sum) {
        sum.nVariants++;
        sum.nConverted += point;
        return sum;
    }

    public void onTraversalDone(Result result) {
        logger.info(String.format("Saw %d raw SNPs", result.nVariants));
        // guard against division by zero (NaN) when no loci were seen
        double pctConverted = result.nVariants == 0 ? 0.0 : (100.0 * result.nConverted) / result.nVariants;
        logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, pctConverted));
    }
}

View File

@ -568,14 +568,24 @@ public class Utils {
}
/**
 * Returns a new list containing the elements of c in ascending natural order.
 *
 * @param c the collection to sort; it is not modified
 * @return a freshly allocated list holding c's elements, sorted ascending
 */
public static <T extends Comparable<T>> List<T> sorted(Collection<T> c) {
return sorted(c, false);
}
/**
 * Copies c into a new list and sorts it by natural ordering.
 *
 * @param c       the collection to sort; it is not modified
 * @param reverse when true, the result is returned in descending order
 * @return a freshly allocated list holding c's elements in sorted order
 */
public static <T extends Comparable<T>> List<T> sorted(Collection<T> c, boolean reverse) {
    List<T> copy = new ArrayList<T>(c);
    Collections.sort(copy);
    if ( reverse ) {
        Collections.reverse(copy);
    }
    return copy;
}
/**
 * Returns the values of map c ordered by the natural ordering of their keys.
 *
 * @param c the map whose values to collect
 * @return the values of c, sorted by key
 */
public static <T extends Comparable<T>, V> List<V> sorted(Map<T,V> c) {
return sorted(c, false);
}
public static <T extends Comparable<T>, V> List<V> sorted(Map<T,V> c, boolean reverse) {
List<T> t = new ArrayList<T>(c.keySet());
Collections.sort(t);
if ( reverse ) Collections.reverse(t);
List<V> l = new ArrayList<V>();
for ( T k : t ) {

View File

@ -10,11 +10,13 @@ import java.util.List;
import java.io.File;
public class VariantContextIntegrationTest extends WalkerTest {
private static String root = "-T TestVariantContext" +
" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
private static String cmdRoot = "-T TestVariantContext" +
" -R " + oneKGLocation + "reference/human_b36_both.fasta";
private static String root = cmdRoot +
" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
" -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf";
static HashMap<String, String> expectations = new HashMap<String, String>();
static {
expectations.put("-L 1:1-10000 --printPerLocus", "eb5802e7e615fa79b788a9447b7e824c");
@ -42,6 +44,16 @@ public class VariantContextIntegrationTest extends WalkerTest {
}
}
@Test
public void testToVCF() {
// converts the first 1Mb of chr1's VCF records and checks the md5s of both outputs
WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -B vcf,VCF," + validationDataLocation + "/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s",
2, // two output files: the per-locus output (-o) and the converted VCF (--outputVCF)
Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "461960b26ee1f8998ccc47da9bd3913c"));
executeTest("testToVCF", spec);
}
@Test
public void testLargeScaleConversion() {
// this really just tests that we are seeing the same number of objects over all of chr1