VCF->beagle and VCF phasing using beagle input. Appears to work fairly well. VariantContexts now support phased genotypes.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2812 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-02-09 01:22:05 +00:00
parent 457568485a
commit 94f892ad42
9 changed files with 156 additions and 50 deletions

View File

@ -13,22 +13,23 @@ public class Genotype {
protected InferredGeneticContext commonInfo;
public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
protected List<Allele> alleles = new ArrayList<Allele>();
private boolean genotypesArePhased = false;
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
this.genotypesArePhased = genotypesArePhased;
validate();
}
public Genotype(String sampleName, List<Allele> alleles, double negLog10PError) {
this(sampleName, alleles, negLog10PError, null, null);
this(sampleName, alleles, negLog10PError, null, null, false);
}
public Genotype(String sampleName, List<Allele> alleles) {
this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null);
this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false);
}
/**
* @return the alleles for this genotype
*/
@ -63,6 +64,7 @@ public class Genotype {
return al;
}
public boolean genotypesArePhased() { return genotypesArePhased; }
/**
* @return the ploidy of this genotype
@ -123,12 +125,19 @@ public class Genotype {
throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
}
private String haplotypesString() {
if ( genotypesArePhased() )
return Utils.join("|", getAlleles());
else
return Utils.join("/", Utils.sorted(getAlleles()));
}
public String toString() {
return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), getAlleles(), getType(), 10 * getNegLog10PError(), Utils.sortedString(getAttributes()));
return String.format("[GT: %s %s %s Q%.2f %s]", getSampleName(), haplotypesString(), getType(), getPhredScaledQual(), Utils.sortedString(getAttributes()));
}
public String toBriefString() {
return String.format("%s:Q%.2f", getAlleles(), 10 * getNegLog10PError());
return String.format("%s:Q%.2f", haplotypesString(), getPhredScaledQual());
}
public boolean sameGenotype(Genotype other) {
@ -148,7 +157,7 @@ public class Genotype {
return false;
}
} else {
List<Allele> otherAlleles = other.getAlleles();
List<Allele> otherAlleles = new ArrayList<Allele>(other.getAlleles());
for ( Allele myAllele : getAlleles() ) {
Allele alleleToRemove = null;
for ( Allele otherAllele : otherAlleles ) {
@ -179,6 +188,7 @@ public class Genotype {
public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); }
public double getNegLog10PError() { return commonInfo.getNegLog10PError(); }
public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); }
public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }

View File

@ -107,6 +107,7 @@ final class InferredGeneticContext {
* @return the -1 * log10-based error estimate
*/
public double getNegLog10PError() { return negLog10PError; }
public double getPhredScaledQual() { return getNegLog10PError() * 10; }
public void setNegLog10PError(double negLog10PError) {
if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError);

View File

@ -9,7 +9,7 @@ import java.util.*;
*/
public class MutableGenotype extends Genotype {
public MutableGenotype(Genotype parent) {
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes());
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.genotypesArePhased());
}
// todo -- add rest of genotype constructors here
@ -18,7 +18,7 @@ public class MutableGenotype extends Genotype {
}
public Genotype unmodifiableGenotype() {
return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes());
return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), genotypesArePhased());
}

View File

@ -457,6 +457,7 @@ public class VariantContext {
public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); }
public double getNegLog10PError() { return commonInfo.getNegLog10PError(); }
public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); }
public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }
@ -601,6 +602,8 @@ public class VariantContext {
*/
public Map<String, Genotype> getGenotypes() { return genotypes; }
public List<Genotype> getGenotypesSortedByName() { return Utils.sorted(genotypes); }
/**
* Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map
* for consistency with the multi-get function.
@ -652,6 +655,10 @@ public class VariantContext {
return getGenotypes().containsKey(sample);
}
public Genotype getGenotype(int ith) {
return getGenotypesSortedByName().get(ith);
}
/**
* Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS
@ -833,7 +840,7 @@ public class VariantContext {
public String toString() {
return String.format("[VC %s @ %s of type=%s alleles=%s attr=%s GT=%s",
getName(), getLocation(), this.getType(),
Utils.sorted(this.getAlleles()), Utils.sortedString(this.getAttributes()), Utils.sorted(this.getGenotypes()));
Utils.sorted(this.getAlleles()), Utils.sortedString(this.getAttributes()), this.getGenotypesSortedByName());
}
// protected basic manipulation routines

View File

@ -79,7 +79,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
addModule("VCF", RodVCF.class);
addModule("PicardDbSNP", rodPicardDbSNP.class);
addModule("HapmapVCF", HapmapVCFROD.class);
addModule("Beagle", BeagleROD.class);
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
@ -104,7 +105,7 @@ public class VariantContextAdaptors {
// --------------------------------------------------------------------------------------------------------------
//
// VCF to VariantContext
// VCF to VariantContext and back again
//
// --------------------------------------------------------------------------------------------------------------
@ -170,7 +171,7 @@ public class VariantContextAdaptors {
if ( vcfG.isFiltered() ) // setup the FL genotype filter fields
genotypeFilters.addAll(Arrays.asList(vcfG.getFields().get(VCFGenotypeRecord.GENOTYPE_FILTER_KEY).split(";")));
Genotype g = new Genotype(vcfG.getSampleName(), genotypeAlleles, pError, genotypeFilters, fields);
Genotype g = new Genotype(vcfG.getSampleName(), genotypeAlleles, pError, genotypeFilters, fields, vcfG.getPhaseType() == VCFGenotypeRecord.PHASE.PHASED);
genotypes.put(g.getSampleName(), g);
}
@ -180,4 +181,79 @@ public class VariantContextAdaptors {
} else
return null; // can't handle anything else
}
public static VCFRecord toVCF(VariantContext vc) {
// deal with the reference
char referenceBase = 'N'; // by default we'll use N
if ( vc.getReference().length() == 1 ) {
referenceBase = (char)vc.getReference().getBases()[0];
}
String contig = vc.getLocation().getContig();
long position = vc.getLocation().getStart();
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : ".";
double qual = vc.getPhredScaledQual();
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS;
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
for ( Allele a : vc.getAlleles() ) {
VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases()));
if ( a.isNonReference() ) {
vcfAltAlleles.add(encoding);
}
alleleMap.put(a, encoding);
}
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY));
List<String> vcGenotypeKeys = calcVCFGenotypeKeys(vc);
if ( vc.hasGenotypes() ) vcfGenotypeAttributeKeys.addAll(vcGenotypeKeys);
String genotypeFormatString = Utils.join(VCFRecord.GENOTYPE_FIELD_SEPERATOR, vcfGenotypeAttributeKeys);
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a));
}
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing);
if ( ! g.isNoCall() ) {
for ( String key : vcGenotypeKeys ) {
String val = key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ? String.format("%.2f", g.getPhredScaledQual()) : g.getAttribute(key).toString();
vcfG.setField(key, val);
}
}
genotypeObjects.add(vcfG);
}
// info fields
Map<String, String> infoFields = new HashMap<String, String>();
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
String key = elt.getKey();
String val = elt.getValue().toString();
if ( ! key.equals("ID") ) {
infoFields.put(key, val);
}
}
return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
}
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
Set<String> keys = new HashSet<String>();
for ( Genotype g : vc.getGenotypes().values() ) {
for ( String key : g.getAttributes().keySet() ) {
keys.add(key);
}
}
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
return Utils.sorted(new ArrayList<String>(keys));
}
}

View File

@ -41,26 +41,36 @@ import java.util.regex.Matcher;
*/
public class MendelianViolationEvaluator extends VariantEvaluator {
long nVariants, nViolations, nOverCalls, nUnderCalls;
String mom, dad, child;
TrioStructure trio;
VariantEval2Walker parent;
private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
public static class TrioStructure {
public String mom, dad, child;
}
public static TrioStructure parseTrioDescription(String family) {
Matcher m = FAMILY_PATTERN.matcher(family);
if ( m.matches() ) {
TrioStructure trio = new TrioStructure();
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
trio.mom = m.group(1);
trio.dad = m.group(2);
trio.child = m.group(3);
return trio;
} else {
throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child");
}
}
public MendelianViolationEvaluator(VariantEval2Walker parent) {
this.parent = parent;
if ( enabled() ) {
Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE);
if ( m.matches() ) {
//System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
mom = m.group(1);
dad = m.group(2);
child = m.group(3);
parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child));
} else {
throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child");
}
trio = parseTrioDescription(parent.FAMILY_STRUCTURE);
parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s",
parent.FAMILY_STRUCTURE, trio.mom, trio.dad, trio.child));
}
}
@ -84,9 +94,9 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci
nVariants++;
Genotype momG = vc.getGenotype(mom);
Genotype dadG = vc.getGenotype(dad);
Genotype childG = vc.getGenotype(child);
Genotype momG = vc.getGenotype(trio.mom);
Genotype dadG = vc.getGenotype(trio.dad);
Genotype childG = vc.getGenotype(trio.child);
if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
// all genotypes are good, so let's see if child is a violation
@ -146,15 +156,17 @@ public class MendelianViolationEvaluator extends VariantEvaluator {
UNDER_CALL, OVER_CALL
}
public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
VariantContext momVC = vc.subContextFromGenotypes(momG);
VariantContext dadVC = vc.subContextFromGenotypes(dadG);
public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
//VariantContext momVC = vc.subContextFromGenotypes(momG);
//VariantContext dadVC = vc.subContextFromGenotypes(dadG);
int i = 0;
for ( Allele momAllele : momVC.getAlleles() ) {
for ( Allele dadAllele : dadVC.getAlleles() ) {
Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele));
if ( childG.sameGenotype(possibleChild, false) ) {
return false;
for ( Allele momAllele : momG.getAlleles() ) {
for ( Allele dadAllele : dadG.getAlleles() ) {
if ( momAllele.isCalled() && dadAllele.isCalled() ) {
Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele));
if ( childG.sameGenotype(possibleChild, false) ) {
return false;
}
}
}
}

View File

@ -112,7 +112,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
public VCFRecord(char referenceBase,
String contig,
int position,
long position,
String ID,
List<VCFGenotypeEncoding> altBases,
double qual,
@ -139,7 +139,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
*/
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
String chrom = null;
int position = -1;
long position = -1;
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
switch (val) {
@ -467,7 +467,7 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
mReferenceBase = referenceBase;
}
public void setLocation(String chrom, int position) {
public void setLocation(String chrom, long position) {
if ( chrom == null )
throw new IllegalArgumentException("Chromosomes cannot be missing");
if ( position < 0 )
@ -598,12 +598,12 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(createInfoString());
if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) {
try {
addGenotypeData(builder, header);
} catch (Exception e) {
if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
throw new RuntimeException(e.getMessage());
}
// try {
addGenotypeData(builder, header);
// } catch (Exception e) {
// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
// throw new RuntimeException(e);
// }
}
return builder.toString();

View File

@ -17,11 +17,11 @@ public class VariantContextIntegrationTest extends WalkerTest {
static HashMap<String, String> expectations = new HashMap<String, String>();
static {
expectations.put("-L 1:1-10000 --printPerLocus", "39c035ae756eb176a7baffcd0f0fe0af");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "33ab797ac3de900e75fc0d9b01efe482");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "38619d704068072a4ccfd354652957a2");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "bf64ab634126382813a6a6b29a5f47d8");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "be087f53429974b4e505cd59f9363bfe");
expectations.put("-L 1:1-10000 --printPerLocus", "eb5802e7e615fa79b788a9447b7e824c");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "f4580866bcff10e0e742f422c45695d3");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "85c949c985d8aa340030362ea1fc13d2");
expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "a45430cfed2f574fb69e79d74ed41017");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "105547f432744e0776bd58e753cfa859");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "d758adbab9011e42c77d502fe4d62c27");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "933ec8327192c5ed58a1952c73fb4f73");
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "7d5d0283d92220ee78db7465d675b37f");