VariantContext to VCF converter. BeagleROD, and phasing of VCF calls. Integration tests galore :-)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2814 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
369cc50802
commit
934d4b93a2
|
|
@ -108,6 +108,7 @@ public class Genotype {
|
|||
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
|
||||
*/
|
||||
public boolean isNoCall() { return getType() == Type.NO_CALL; }
|
||||
public boolean isCalled() { return getType() != Type.NO_CALL; }
|
||||
|
||||
public void validate() {
|
||||
// todo -- add validation checking here
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
|
|
@ -182,6 +180,16 @@ public class VariantContextAdaptors {
|
|||
return null; // can't handle anything else
|
||||
}
|
||||
|
||||
|
||||
public static VCFHeader createVCFHeader(Set<VCFHeaderLine> hInfo, VariantContext vc) {
|
||||
HashSet<String> names = new LinkedHashSet<String>();
|
||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
names.add(g.getSampleName());
|
||||
}
|
||||
|
||||
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
|
||||
}
|
||||
|
||||
public static VCFRecord toVCF(VariantContext vc) {
|
||||
// deal with the reference
|
||||
char referenceBase = 'N'; // by default we'll use N
|
||||
|
|
@ -189,6 +197,10 @@ public class VariantContextAdaptors {
|
|||
referenceBase = (char)vc.getReference().getBases()[0];
|
||||
}
|
||||
|
||||
if ( ! vc.isSNP() )
|
||||
// todo -- update the code so it works correctly with indels
|
||||
throw new StingException("VariantContext -> VCF converter currently doesn't support indels; complain to the GATK team");
|
||||
|
||||
String contig = vc.getLocation().getContig();
|
||||
long position = vc.getLocation().getStart();
|
||||
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : ".";
|
||||
|
|
|
|||
|
|
@ -5,10 +5,16 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Test routine for new VariantContext object
|
||||
|
|
@ -26,6 +32,17 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false)
|
||||
boolean printContexts = false;
|
||||
|
||||
@Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false)
|
||||
String outputVCF = null;
|
||||
|
||||
private VCFWriter writer = null;
|
||||
private boolean wroteHeader = false;
|
||||
|
||||
public void initialize() {
|
||||
if ( outputVCF != null )
|
||||
writer = new VCFWriter(new File(outputVCF));
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( ref == null )
|
||||
return 0;
|
||||
|
|
@ -34,6 +51,15 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
|
|||
|
||||
int n = 0;
|
||||
for (VariantContext vc : tracker.getAllVariantContexts(allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) {
|
||||
if ( writer != null && n == 0 ) {
|
||||
if ( ! wroteHeader ) {
|
||||
writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc));
|
||||
wroteHeader = true;
|
||||
}
|
||||
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc));
|
||||
}
|
||||
|
||||
n++;
|
||||
if ( printContexts ) out.printf(" %s%n", vc);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
|
|||
Genotype dadG = vc.getGenotype(trio.dad);
|
||||
Genotype childG = vc.getGenotype(trio.child);
|
||||
|
||||
if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
|
||||
if ( includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG) ) {
|
||||
// all genotypes are good, so let's see if child is a violation
|
||||
|
||||
if ( isViolation(vc, momG, dadG, childG) ) {
|
||||
|
|
@ -152,6 +152,10 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
|
|||
}
|
||||
}
|
||||
|
||||
private boolean includeGenotype(Genotype g) {
|
||||
return g.getNegLog10PError() > getQThreshold() && g.isCalled();
|
||||
}
|
||||
|
||||
private enum ViolationType {
|
||||
UNDER_CALL, OVER_CALL
|
||||
}
|
||||
|
|
|
|||
|
|
@ -196,6 +196,9 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
|
|||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
//System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases());
|
||||
|
||||
if ( ref == null )
|
||||
return 0;
|
||||
|
||||
Map<String, VariantContext> comps = getCompVariantContexts(tracker, context);
|
||||
|
||||
// to enable walking over pairs where eval or comps have no elements
|
||||
|
|
|
|||
|
|
@ -0,0 +1,129 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Test routine for new VariantContext object
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class), @RMD(name="beagle",type=BeagleROD.class)})
|
||||
public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
|
||||
@Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false)
|
||||
protected String TRIO_STRUCTURE;
|
||||
|
||||
@Argument(shortName="eth", fullName="excludeTripleHets", doc="If provide, sites that are triple hets calls will not be phased, regardless of Beagle's value", required=false)
|
||||
protected boolean dontPhaseTripleHets = false;
|
||||
|
||||
private MendelianViolationEvaluator.TrioStructure trio = null;
|
||||
|
||||
private VCFWriter writer;
|
||||
private boolean headerWritten = false;
|
||||
private final static String TRACK_NAME = "variants";
|
||||
private final static String BEAGLE_NAME = "beagle";
|
||||
|
||||
public void initialize() {
|
||||
trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE);
|
||||
writer = new VCFWriter(out);
|
||||
}
|
||||
|
||||
public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
VariantContext vc = null;
|
||||
|
||||
if ( ref != null ) {
|
||||
vc = tracker.getVariantContext(TRACK_NAME, null, context.getLocation(), false);
|
||||
BeagleROD beagle = (BeagleROD)tracker.lookup(BEAGLE_NAME, null);
|
||||
|
||||
if ( vc != null ) {
|
||||
if ( ! headerWritten ) {
|
||||
RodVCF vcfrod = (RodVCF)tracker.lookup(TRACK_NAME, null);
|
||||
writer.writeHeader(vcfrod.getHeader());
|
||||
headerWritten = true;
|
||||
}
|
||||
|
||||
//System.out.printf("VCF: %s%n", tracker.lookup(TRACK_NAME, null));
|
||||
vc = maybePhaseVC(vc, beagle);
|
||||
}
|
||||
}
|
||||
|
||||
return vc;
|
||||
}
|
||||
|
||||
private VariantContext maybePhaseVC(VariantContext unphased, BeagleROD beagle) {
|
||||
if ( beagle == null ) {
|
||||
return unphased;
|
||||
} else {
|
||||
Map<String, List<String>> bglData = beagle.getGenotypes();
|
||||
List<String> momBgl = bglData.get(trio.mom);
|
||||
List<String> dadBgl = bglData.get(trio.dad);
|
||||
|
||||
Genotype unphasedMom = unphased.getGenotype(trio.mom);
|
||||
Genotype unphasedDad = unphased.getGenotype(trio.dad);
|
||||
Genotype unphasedKid = unphased.getGenotype(trio.child);
|
||||
|
||||
if ( dontPhaseTripleHets && unphasedMom.isHet() && unphasedDad.isHet() && unphasedKid.isHet() )
|
||||
return unphased;
|
||||
else {
|
||||
Allele momTrans = unphased.getAllele(momBgl.get(0));
|
||||
Allele momUntrans = unphased.getAllele(momBgl.get(1));
|
||||
Allele dadTrans = unphased.getAllele(dadBgl.get(0));
|
||||
Allele dadUntrans = unphased.getAllele(dadBgl.get(1));
|
||||
|
||||
Genotype momG = phaseGenotype(unphasedMom, Arrays.asList(momTrans, momUntrans));
|
||||
Genotype dadG = phaseGenotype(unphasedDad, Arrays.asList(dadTrans, dadUntrans));
|
||||
Genotype kidG = phaseGenotype(unphasedKid, Arrays.asList(momTrans, dadTrans));
|
||||
|
||||
return new VariantContext(unphased.getName(), unphased.getLocation(), unphased.getAlleles(),
|
||||
Arrays.asList(momG, dadG, kidG), unphased.getNegLog10PError(), unphased.getFilters(), unphased.getAttributes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Genotype phaseGenotype(Genotype base, List<Allele> alleles) {
|
||||
return new Genotype(base.getSampleName(), alleles, base.getNegLog10PError(), base.getFilters(), base.getAttributes(), true);
|
||||
}
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
public Integer reduce(VariantContext point, Integer sum) {
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
//logger.info(String.format("Saw %d raw SNPs", result.nVariants));
|
||||
//logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants));
|
||||
}
|
||||
|
||||
public Long reduce(VariantContext vc, Long prevReduce) {
|
||||
if ( vc == null ) {
|
||||
return prevReduce;
|
||||
} else {
|
||||
writer.addRecord(VariantContextAdaptors.toVCF(vc));
|
||||
prevReduce++;
|
||||
}
|
||||
|
||||
return prevReduce;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.vcftools;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Test routine for new VariantContext object
|
||||
*/
|
||||
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class)})
|
||||
public class VCFToBeagleWalker extends RodWalker<Integer, VCFToBeagleWalker.Result> {
|
||||
@Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false)
|
||||
protected String TRIO_STRUCTURE;
|
||||
|
||||
private MendelianViolationEvaluator.TrioStructure trio = null;
|
||||
|
||||
public class Result {
|
||||
int nVariants, nConverted;
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
if ( TRIO_STRUCTURE != null ) {
|
||||
trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE);
|
||||
out.printf("I id %s%n", Utils.join(" ", Arrays.asList(trio.mom, trio.mom, trio.dad, trio.dad, trio.child, trio.child)));
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( ref != null ) {
|
||||
EnumSet<VariantContext.Type> allowedTypes = EnumSet.of(VariantContext.Type.SNP);
|
||||
|
||||
VariantContext vc = tracker.getVariantContext("variants", allowedTypes, context.getLocation(), false);
|
||||
|
||||
if ( vc != null && vc.isBiallelic() && vc.isNotFiltered() ) {
|
||||
if ( trio != null ) { // we are emitting a trio file
|
||||
if ( ! vc.hasGenotypes() || vc.getGenotypes().size() != 3 )
|
||||
throw new StingException("Convertion exception: Trio conversion requires exactly three genotypes at every locus: " + vc);
|
||||
|
||||
if ( genotypesAreGood(vc) ) {
|
||||
if ( ! genotypesAreGoodForTrios(vc, trio) ) {
|
||||
logger.debug("VC excluded due to poor trio genotyping " + vc);
|
||||
} else {
|
||||
Genotype mom = vc.getGenotype(trio.mom);
|
||||
Genotype dad = vc.getGenotype(trio.dad);
|
||||
Genotype child = vc.getGenotype(trio.child);
|
||||
|
||||
// beagle format looks like:
|
||||
//
|
||||
// I id 1001 1001 1002 1002 1003 1003
|
||||
// A diabetes 1 1 2 2 2 2
|
||||
// M rs2289311 A G G G A G
|
||||
String loc = "c" + vc.getLocation().getContig() + "_p" + vc.getLocation().getStart();
|
||||
out.printf("M %s %s %s %s%n", loc, genotype2BeagleString(mom), genotype2BeagleString(dad), genotype2BeagleString(child));
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("VCFToBeagle currently only supports conversion of trios. Complain to mark");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private String genotype2BeagleString(Genotype g) {
|
||||
return allele2BeagleString(g.getAllele(0)) + " " + allele2BeagleString(g.getAllele(1));
|
||||
}
|
||||
|
||||
private String allele2BeagleString(Allele a) {
|
||||
return new String(a.getBases());
|
||||
}
|
||||
|
||||
private static boolean genotypesAreGood(VariantContext vc) {
|
||||
for ( Genotype g : vc.getGenotypes().values() ) {
|
||||
if ( g.isFiltered() )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean genotypesAreGoodForTrios(VariantContext vc, MendelianViolationEvaluator.TrioStructure trio) {
|
||||
return ! MendelianViolationEvaluator.isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child));
|
||||
}
|
||||
|
||||
public Result reduceInit() {
|
||||
return new Result();
|
||||
}
|
||||
|
||||
public Result reduce(Integer point, Result sum) {
|
||||
sum.nVariants++;
|
||||
sum.nConverted += point;
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Result result) {
|
||||
logger.info(String.format("Saw %d raw SNPs", result.nVariants));
|
||||
logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants));
|
||||
}
|
||||
}
|
||||
|
|
@ -568,14 +568,24 @@ public class Utils {
|
|||
}
|
||||
|
||||
public static <T extends Comparable<T>> List<T> sorted(Collection<T> c) {
|
||||
return sorted(c, false);
|
||||
}
|
||||
|
||||
public static <T extends Comparable<T>> List<T> sorted(Collection<T> c, boolean reverse) {
|
||||
List<T> l = new ArrayList<T>(c);
|
||||
Collections.sort(l);
|
||||
if ( reverse ) Collections.reverse(l);
|
||||
return l;
|
||||
}
|
||||
|
||||
public static <T extends Comparable<T>, V> List<V> sorted(Map<T,V> c) {
|
||||
return sorted(c, false);
|
||||
}
|
||||
|
||||
public static <T extends Comparable<T>, V> List<V> sorted(Map<T,V> c, boolean reverse) {
|
||||
List<T> t = new ArrayList<T>(c.keySet());
|
||||
Collections.sort(t);
|
||||
if ( reverse ) Collections.reverse(t);
|
||||
|
||||
List<V> l = new ArrayList<V>();
|
||||
for ( T k : t ) {
|
||||
|
|
|
|||
|
|
@ -10,11 +10,13 @@ import java.util.List;
|
|||
import java.io.File;
|
||||
|
||||
public class VariantContextIntegrationTest extends WalkerTest {
|
||||
private static String root = "-T TestVariantContext" +
|
||||
" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
|
||||
private static String cmdRoot = "-T TestVariantContext" +
|
||||
" -R " + oneKGLocation + "reference/human_b36_both.fasta";
|
||||
|
||||
private static String root = cmdRoot +
|
||||
" -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
|
||||
" -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf";
|
||||
|
||||
static HashMap<String, String> expectations = new HashMap<String, String>();
|
||||
static {
|
||||
expectations.put("-L 1:1-10000 --printPerLocus", "eb5802e7e615fa79b788a9447b7e824c");
|
||||
|
|
@ -42,6 +44,16 @@ public class VariantContextIntegrationTest extends WalkerTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testToVCF() {
|
||||
// this really just tests that we are seeing the same number of objects over all of chr1
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -B vcf,VCF," + validationDataLocation + "/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s",
|
||||
2, // just one output file
|
||||
Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "461960b26ee1f8998ccc47da9bd3913c"));
|
||||
executeTest("testToVCF", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeScaleConversion() {
|
||||
// this really just tests that we are seeing the same number of objects over all of chr1
|
||||
|
|
|
|||
Loading…
Reference in New Issue