From 94f892ad42ebbe5e553268e20f43959860be6832 Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 9 Feb 2010 01:22:05 +0000 Subject: [PATCH] VCF->beagle and VCF phasing using beagle input. Appears to work fairly well. VariantContexts now support phased genotypes. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2812 348d0f76-0448-11de-a6fe-93d51630548a --- .../contexts/variantcontext/Genotype.java | 24 ++++-- .../InferredGeneticContext.java | 1 + .../variantcontext/MutableGenotype.java | 4 +- .../variantcontext/VariantContext.java | 9 ++- .../gatk/refdata/ReferenceOrderedData.java | 2 +- .../gatk/refdata/VariantContextAdaptors.java | 80 ++++++++++++++++++- .../MendelianViolationEvaluator.java | 58 ++++++++------ .../sting/utils/genotype/vcf/VCFRecord.java | 18 ++--- .../VariantContextIntegrationTest.java | 10 +-- 9 files changed, 156 insertions(+), 50 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java index b9bc95298..c6c2e7405 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java @@ -13,22 +13,23 @@ public class Genotype { protected InferredGeneticContext commonInfo; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; protected List alleles = new ArrayList(); + private boolean genotypesArePhased = false; - public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes) { + public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean genotypesArePhased) { this.alleles = Collections.unmodifiableList(alleles); commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); + this.genotypesArePhased = genotypesArePhased; validate(); } public Genotype(String sampleName, List alleles, double negLog10PError) { - this(sampleName, alleles, negLog10PError, null, null); + this(sampleName, alleles, negLog10PError, null, null, false); } public Genotype(String sampleName, List alleles) { - this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null); + this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false); } - /** * @return the alleles for this genotype */ @@ -63,6 +64,7 @@ public class Genotype { return al; } + public boolean genotypesArePhased() { return genotypesArePhased; } /** * @return the ploidy of this genotype @@ -123,12 +125,19 @@ public class Genotype { throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); } + private String haplotypesString() { + if ( genotypesArePhased() ) + return Utils.join("|", getAlleles()); + else + return Utils.join("/", Utils.sorted(getAlleles())); + } + public String toString() { - return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), getAlleles(), getType(), 10 * getNegLog10PError(), Utils.sortedString(getAttributes())); + return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), haplotypesString(), getType(), getPhredScaledQual(), Utils.sortedString(getAttributes())); } public String toBriefString() { - return String.format("%s:Q%.2f", getAlleles(), 10 * getNegLog10PError()); + return String.format("%s:Q%.2f", haplotypesString(), getPhredScaledQual()); } public boolean sameGenotype(Genotype other) { @@ -148,7 +157,7 @@ public class Genotype { return false; } } else { - List otherAlleles = other.getAlleles(); + List otherAlleles = new ArrayList(other.getAlleles()); for ( Allele myAllele : getAlleles() ) { Allele alleleToRemove = null; for ( Allele otherAllele : otherAlleles ) { @@ -179,6 +188,7 @@ public class Genotype { public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java index 402fabacf..957b70978 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/InferredGeneticContext.java @@ -107,6 +107,7 @@ final class InferredGeneticContext { * @return the -1 * log10-based error estimate */ public double getNegLog10PError() { return negLog10PError; } + public double getPhredScaledQual() { return getNegLog10PError() * 10; } public void setNegLog10PError(double negLog10PError) { if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError); diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java index 252c40566..fe6e778b5 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/MutableGenotype.java @@ -9,7 +9,7 @@ import java.util.*; */ public class MutableGenotype extends Genotype { public MutableGenotype(Genotype parent) { - super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes()); + super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased()); } // todo -- add rest of genotype constructors here @@ -18,7 +18,7 @@ public class MutableGenotype extends Genotype { } public Genotype unmodifiableGenotype() { - return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes()); + return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), genotypesArePhased()); } diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java index 3203a68ce..f1c874116 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContext.java @@ -457,6 +457,7 @@ public class VariantContext { public boolean isNotFiltered() { return commonInfo.isNotFiltered(); } public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); } public double getNegLog10PError() { return commonInfo.getNegLog10PError(); } + public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); } public Map getAttributes() { return commonInfo.getAttributes(); } public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); } @@ -601,6 +602,8 @@ public class VariantContext { */ public Map getGenotypes() { return genotypes; } + public List getGenotypesSortedByName() { return Utils.sorted(genotypes); } + /** * Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map * for consistency with the multi-get function. @@ -652,6 +655,10 @@ public class VariantContext { return getGenotypes().containsKey(sample); } + public Genotype getGenotype(int ith) { + return getGenotypesSortedByName().get(ith); + } + /** * Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS @@ -833,7 +840,7 @@ public class VariantContext { public String toString() { return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s", getName(), getLocation(), this.getType(), - Utils.sorted(this.getAlleles()), Utils.sortedString(this.getAttributes()), Utils.sorted(this.getGenotypes())); + Utils.sorted(this.getAlleles()), Utils.sortedString(this.getAttributes()), this.getGenotypesSortedByName()); } // protected basic manipulation routines diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 8c06969ce..8ca0b0c21 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -79,7 +79,7 @@ public class ReferenceOrderedData implements addModule("VCF", RodVCF.class); addModule("PicardDbSNP", rodPicardDbSNP.class); addModule("HapmapVCF", HapmapVCFROD.class); - + addModule("Beagle", BeagleROD.class); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index ede1f4740..c9d26ad04 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; @@ -104,7 +105,7 @@ public class VariantContextAdaptors { // -------------------------------------------------------------------------------------------------------------- // - // VCF to VariantContext + // VCF to VariantContext and back again // // -------------------------------------------------------------------------------------------------------------- @@ -170,7 +171,7 @@ public class VariantContextAdaptors { if ( vcfG.isFiltered() ) // setup the FL genotype filter fields genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";"))); - Genotype g = new Genotype(vcfG.getSampleName(), genotypeAlleles, pError, genotypeFilters, fields); + Genotype g = new Genotype(vcfG.getSampleName(), genotypeAlleles, pError, genotypeFilters, fields, vcfG.getPhaseType() == VCFGenotypeRecord.PHASE.PHASED); genotypes.put(g.getSampleName(), g); } @@ -180,4 +181,79 @@ public class VariantContextAdaptors { } else return null; // can't handle anything else } + + public static VCFRecord toVCF(VariantContext vc) { + // deal with the reference + char referenceBase = 'N'; // by default we'll use N + if ( vc.getReference().length() == 1 ) { + referenceBase = (char)vc.getReference().getBases()[0]; + } + + String contig = vc.getLocation().getContig(); + long position = vc.getLocation().getStart(); + String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : "."; + double qual = vc.getPhredScaledQual(); + String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS; + + Map alleleMap = new HashMap(); + alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup + List vcfAltAlleles = new ArrayList(); + for ( Allele a : vc.getAlleles() ) { + VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases())); + if ( a.isNonReference() ) { + vcfAltAlleles.add(encoding); + } + alleleMap.put(a, encoding); + } + + List vcfGenotypeAttributeKeys = new ArrayList(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY)); + List vcGenotypeKeys = calcVCFGenotypeKeys(vc); + if ( vc.hasGenotypes() ) vcfGenotypeAttributeKeys.addAll(vcGenotypeKeys); + String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys); + + List genotypeObjects = new ArrayList(vc.getGenotypes().size()); + for ( Genotype g : vc.getGenotypesSortedByName() ) { + List encodings = new ArrayList(g.getPloidy()); + for ( Allele a : g.getAlleles() ) { + encodings.add(alleleMap.get(a)); + } + + VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED; + VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing); + + if ( ! g.isNoCall() ) { + for ( String key : vcGenotypeKeys ) { + String val = key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ? String.format("%.2f", g.getPhredScaledQual()) : g.getAttribute(key).toString(); + vcfG.setField(key, val); + } + } + + genotypeObjects.add(vcfG); + } + + // info fields + Map infoFields = new HashMap(); + for ( Map.Entry elt : vc.getAttributes().entrySet() ) { + String key = elt.getKey(); + String val = elt.getValue().toString(); + if ( ! key.equals("ID") ) { + infoFields.put(key, val); + } + } + + return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects); + } + + private static List calcVCFGenotypeKeys(VariantContext vc) { + Set keys = new HashSet(); + + for ( Genotype g : vc.getGenotypes().values() ) { + for ( String key : g.getAttributes().keySet() ) { + keys.add(key); + } + } + + keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY); + return Utils.sorted(new ArrayList(keys)); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java index 32d3432e9..a1b35c90d 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java @@ -41,26 +41,36 @@ import java.util.regex.Matcher; */ public class MendelianViolationEvaluator extends VariantEvaluator { long nVariants, nViolations, nOverCalls, nUnderCalls; - String mom, dad, child; + TrioStructure trio; VariantEval2Walker parent; private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + public static class TrioStructure { + public String mom, dad, child; + } + + public static TrioStructure parseTrioDescription(String family) { + Matcher m = FAMILY_PATTERN.matcher(family); + if ( m.matches() ) { + TrioStructure trio = new TrioStructure(); + //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); + trio.mom = m.group(1); + trio.dad = m.group(2); + trio.child = m.group(3); + return trio; + } else { + throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); + } + } + public MendelianViolationEvaluator(VariantEval2Walker parent) { this.parent = parent; if ( enabled() ) { - Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE); - if ( m.matches() ) { - //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); - mom = m.group(1); - dad = m.group(2); - child = m.group(3); - parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child)); - - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child"); - } + trio = parseTrioDescription(parent.FAMILY_STRUCTURE); + parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", + parent.FAMILY_STRUCTURE, trio.mom, trio.dad, trio.child)); } } @@ -84,9 +94,9 @@ public class MendelianViolationEvaluator extends VariantEvaluator { if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci nVariants++; - Genotype momG = vc.getGenotype(mom); - Genotype dadG = vc.getGenotype(dad); - Genotype childG = vc.getGenotype(child); + Genotype momG = vc.getGenotype(trio.mom); + Genotype dadG = vc.getGenotype(trio.dad); + Genotype childG = vc.getGenotype(trio.child); if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) { // all genotypes are good, so let's see if child is a violation @@ -146,15 +156,17 @@ public class MendelianViolationEvaluator extends VariantEvaluator { UNDER_CALL, OVER_CALL } - public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) { - VariantContext momVC = vc.subContextFromGenotypes(momG); - VariantContext dadVC = vc.subContextFromGenotypes(dadG); + public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) { + //VariantContext momVC = vc.subContextFromGenotypes(momG); + //VariantContext dadVC = vc.subContextFromGenotypes(dadG); int i = 0; - for ( Allele momAllele : momVC.getAlleles() ) { - for ( Allele dadAllele : dadVC.getAlleles() ) { - Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if ( childG.sameGenotype(possibleChild, false) ) { - return false; + for ( Allele momAllele : momG.getAlleles() ) { + for ( Allele dadAllele : dadG.getAlleles() ) { + if ( momAllele.isCalled() && dadAllele.isCalled() ) { + Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); + if ( childG.sameGenotype(possibleChild, false) ) { + return false; + } } } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index 3060d959f..73dbcc31c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -112,7 +112,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ public VCFRecord(char referenceBase, String contig, - int position, + long position, String ID, List altBases, double qual, @@ -139,7 +139,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { */ private void extractFields(Map columnValues) { String chrom = null; - int position = -1; + long position = -1; for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) { switch (val) { @@ -467,7 +467,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { mReferenceBase = referenceBase; } - public void setLocation(String chrom, int position) { + public void setLocation(String chrom, long position) { if ( chrom == null ) throw new IllegalArgumentException("Chromosomes cannot be missing"); if ( position < 0 ) @@ -598,12 +598,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype { builder.append(createInfoString()); if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) { - try { - addGenotypeData(builder, header); - } catch (Exception e) { - if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT ) - throw new RuntimeException(e.getMessage()); - } +// try { + addGenotypeData(builder, header); +// } catch (Exception e) { +// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT ) +// throw new RuntimeException(e); +// } } return builder.toString(); diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java index 5437494b8..0bc6c97eb 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java @@ -17,11 +17,11 @@ public class VariantContextIntegrationTest extends WalkerTest { static HashMap expectations = new HashMap(); static { - expectations.put("-L 1:1-10000 --printPerLocus", "39c035ae756eb176a7baffcd0f0fe0af"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "33ab797ac3de900e75fc0d9b01efe482"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "38619d704068072a4ccfd354652957a2"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "bf64ab634126382813a6a6b29a5f47d8"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "be087f53429974b4e505cd59f9363bfe"); + expectations.put("-L 1:1-10000 --printPerLocus", "eb5802e7e615fa79b788a9447b7e824c"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "f4580866bcff10e0e742f422c45695d3"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "85c949c985d8aa340030362ea1fc13d2"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "a45430cfed2f574fb69e79d74ed41017"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "105547f432744e0776bd58e753cfa859"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "d758adbab9011e42c77d502fe4d62c27"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "933ec8327192c5ed58a1952c73fb4f73"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "7d5d0283d92220ee78db7465d675b37f");