Initial attempt at identifying potentially interesting variants in a Mendelian disease context when the called genotypes are uncertain.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5473 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b2b8a4f19f
commit
55897631ad
|
|
@ -0,0 +1,307 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Given a VCF file and one or more scenarios for affected individuals, calculates the probability that a given site's genotypes
|
||||
* are consistent with the expected pattern for a given disease model.
|
||||
*/
|
||||
@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class))
|
||||
public class AffectedConsistencyWalker extends RodWalker<Integer, Integer> {
|
||||
public enum DiseaseModel { DOMINANT, RECESSIVE }
|
||||
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(fullName="affected", shortName="A", doc="A scenario file (or files) for affected individuals. Scenarios are specified with an identifier and a comma-separated list of samples (e.g. Pedigree_1 sample1,sample2,sample3). Each line is another scenario.", required=true)
|
||||
public String[] AFFECTED_SAMPLE_SCENARIOS;
|
||||
|
||||
@Argument(fullName="diseaseModel", shortName="DM", doc="The disease model (DOMINANT or RECESSIVE)", required=true)
|
||||
public DiseaseModel DISEASE_MODEL;
|
||||
|
||||
@Argument(fullName="verbose", shortName="V", doc="If specified, enable verbose mode with a lot of output useful for debugging", required=false)
|
||||
public PrintStream VERBOSE_WRITER = null;
|
||||
|
||||
public Map<String, Set<String>> sampleScenarios;
|
||||
public GATKReport consistencyReport;
|
||||
public Set<String> availableSamples;
|
||||
|
||||
private Map<String, Set<String>> loadAffectedSampleScenarios() {
|
||||
// Load all the specified sample scenarios specified in one or more files
|
||||
ArrayList<String> scenarioStrings = new ArrayList<String>();
|
||||
|
||||
for (String affectedSampleScenario : AFFECTED_SAMPLE_SCENARIOS) {
|
||||
File affectedSampleScenarioFile = new File(affectedSampleScenario);
|
||||
|
||||
try {
|
||||
XReadLines lineReader = new XReadLines(affectedSampleScenarioFile);
|
||||
|
||||
for (String line : lineReader) {
|
||||
// Ignore commented-out lines
|
||||
if (!line.contains("#")) {
|
||||
scenarioStrings.add(line);
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException(String.format("The scenario file '%s' was not found", affectedSampleScenarioFile.getAbsolutePath()));
|
||||
}
|
||||
}
|
||||
|
||||
// Parse all the sample scenario strings (comma- or white-space-separated sample lists)
|
||||
Map<String, Set<String>> scenarios = new HashMap<String, Set<String>>();
|
||||
|
||||
for (String scenarioString : scenarioStrings) {
|
||||
String[] pieces = scenarioString.split("[\\s]+");
|
||||
|
||||
if (pieces.length != 2) {
|
||||
throw new UserException(
|
||||
String.format("The scenario line '%s' could not be understood. Please make sure that your " +
|
||||
"scenario file has only two columns: the first being an arbitrary scenario id " +
|
||||
"(e.g. 'Pedigree_1') and the second being a comma-separated list of samples " +
|
||||
"(e.g. 'sample1,sample2,sample3')",
|
||||
scenarioString
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
String scenarioId = pieces[0];
|
||||
|
||||
String[] sampleNames = pieces[1].split(",");
|
||||
|
||||
Set<String> samples = new HashSet<String>();
|
||||
for (String sample : sampleNames) {
|
||||
if (!availableSamples.contains(sample)) {
|
||||
throw new UserException(
|
||||
String.format("The sample '%s' was not found in the ROD bound as the 'variant' track " +
|
||||
"(i.e. the file that was supplied via '-B:variant,VCF /path/to/my.vcf'). " +
|
||||
"Please make sure all samples specified for processing are present in " +
|
||||
"your VCF.",
|
||||
sample)
|
||||
);
|
||||
} else {
|
||||
samples.add(sample);
|
||||
}
|
||||
}
|
||||
|
||||
scenarios.put(scenarioId, samples);
|
||||
}
|
||||
|
||||
if (scenarios.size() == 0) {
|
||||
throw new UserException("There were no scenarios specified. Please specify at least one set of affected samples.");
|
||||
}
|
||||
|
||||
return scenarios;
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
// Figure out what samples I can possibly have (from the bound VCF file)
|
||||
ArrayList<String> rodNames = new ArrayList<String>();
|
||||
rodNames.add("variant");
|
||||
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames);
|
||||
availableSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
|
||||
|
||||
// Load the scenarios to consider
|
||||
sampleScenarios = loadAffectedSampleScenarios();
|
||||
|
||||
// Prepare the output report
|
||||
consistencyReport = new GATKReport();
|
||||
consistencyReport.addTable("AffectedConsistency", "Table of results indicating if the observed genotypes matched the expected genotypes");
|
||||
|
||||
GATKReportTable table = consistencyReport.getTable("AffectedConsistency");
|
||||
table.addPrimaryKey("locus_and_scenario", false);
|
||||
table.addColumn("chr", "unknown");
|
||||
table.addColumn("start", 0);
|
||||
table.addColumn("scenario", "unknown");
|
||||
table.addColumn("P_of_C_given_DM_is_true", 0.0);
|
||||
table.addColumn("P_of_C_given_DM_is_false", 0.0);
|
||||
table.addColumn("OR_DM_is_true_vs_DM_is_false", 0.0);
|
||||
|
||||
for ( String sample : availableSamples ) {
|
||||
table.addColumn(sample, "unknown");
|
||||
}
|
||||
|
||||
if (VERBOSE_WRITER != null) {
|
||||
VERBOSE_WRITER.println("This is a test of the verbose writer");
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
private VariantContext getExpectedGenotypeConfiguration(Set<String> affectedSamples, VariantContext obs) {
|
||||
List<Allele> homRefAlleles = new ArrayList<Allele>();
|
||||
homRefAlleles.add(obs.getReference());
|
||||
homRefAlleles.add(obs.getReference());
|
||||
|
||||
List<Allele> hetAlleles = new ArrayList<Allele>();
|
||||
hetAlleles.add(obs.getReference());
|
||||
hetAlleles.add(obs.getAlternateAllele(0));
|
||||
|
||||
List<Allele> homVarAlleles = new ArrayList<Allele>();
|
||||
homVarAlleles.add(obs.getAlternateAllele(0));
|
||||
homVarAlleles.add(obs.getAlternateAllele(0));
|
||||
|
||||
Collection<Genotype> expectedGenotypes = new ArrayList<Genotype>();
|
||||
for ( String sample : obs.getSampleNames() ) {
|
||||
Genotype expectedGenotype = new Genotype(sample, homRefAlleles);
|
||||
if (affectedSamples.contains(sample)) {
|
||||
expectedGenotype = (DISEASE_MODEL == DiseaseModel.DOMINANT) ? new Genotype(sample, hetAlleles) : new Genotype(sample, homVarAlleles);
|
||||
}
|
||||
|
||||
expectedGenotypes.add(expectedGenotype);
|
||||
}
|
||||
|
||||
return new VariantContext("expected", obs.getChr(), obs.getStart(), obs.getEnd(), obs.getAlleles(), expectedGenotypes);
|
||||
}
|
||||
|
||||
private double getLogLikelihoodOfDiseaseModelHypothesis(VariantContext obs, VariantContext exp, boolean diseaseModelIsSupported) {
|
||||
return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, 0, 0.0);
|
||||
}
|
||||
|
||||
private double getLogLikelihoodOfDiseaseModelHypothesis(VariantContext obs, VariantContext exp, boolean diseaseModelIsSupported, int sampleIndex, double logLikelihoodSoFar) {
|
||||
if (sampleIndex < exp.getNSamples()) {
|
||||
Genotype expGenotype = exp.getGenotype(sampleIndex);
|
||||
Genotype obsGenotype = obs.getGenotype(sampleIndex);
|
||||
|
||||
if (obsGenotype.hasLikelihoods()) {
|
||||
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(obsGenotype.getLikelihoods().getAsVector());
|
||||
boolean[] expectedGenotypes = { expGenotype.isHomRef(), expGenotype.isHet(), expGenotype.isHomVar() };
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (expectedGenotypes[i] == diseaseModelIsSupported) {
|
||||
return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, sampleIndex + 1, logLikelihoodSoFar + Math.log10(normalizedLikelihoods[i]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, diseaseModelIsSupported, sampleIndex + 1, logLikelihoodSoFar);
|
||||
}
|
||||
}
|
||||
|
||||
return logLikelihoodSoFar;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if (tracker != null) {
|
||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null, ref.getLocus(), true, true);
|
||||
|
||||
if (vcs.size() == 1) {
|
||||
VariantContext obs = vcs.iterator().next();
|
||||
|
||||
for (String scenarioId : sampleScenarios.keySet()) {
|
||||
Set<String> affectedSamples = sampleScenarios.get(scenarioId);
|
||||
|
||||
VariantContext exp = getExpectedGenotypeConfiguration(affectedSamples, obs);
|
||||
|
||||
/*
|
||||
GATKReport report = new GATKReport();
|
||||
|
||||
String reportName = String.format("GenotypeTable_%s_%s_%s", scenarioId, ref.getLocus().getContig(), ref.getLocus().getStart());
|
||||
String reportDesc = String.format("Info for scenario %s at locus %s", scenarioId, ref.getLocus());
|
||||
report.addTable(reportName, reportDesc);
|
||||
|
||||
GATKReportTable table = report.getTable(reportName);
|
||||
|
||||
table.addPrimaryKey("sample_pk", false);
|
||||
table.addColumn("table", "unknown");
|
||||
table.addColumn("sample", "unknown");
|
||||
table.addColumn("affected", false);
|
||||
table.addColumn("homref_prob", "unknown");
|
||||
table.addColumn("het_prob", "unknown");
|
||||
table.addColumn("homvar_prob", "unknown");
|
||||
table.addColumn("observed_genotype", "unknown");
|
||||
table.addColumn("expected_genotype", "unknown");
|
||||
|
||||
for (String sample : obs.getSampleNames()) {
|
||||
double[] normalizedLikelihoods = {0.0, 0.0, 0.0};
|
||||
if (obs.getGenotype(sample).hasLikelihoods()) {
|
||||
normalizedLikelihoods = MathUtils.normalizeFromLog10(obs.getGenotype(sample).getLikelihoods().getAsVector());
|
||||
}
|
||||
|
||||
table.set(sample, "table", reportName);
|
||||
table.set(sample, "sample", sample);
|
||||
table.set(sample, "affected", affectedSamples.contains(sample));
|
||||
table.set(sample, "homref_prob", normalizedLikelihoods[0]);
|
||||
table.set(sample, "het_prob", normalizedLikelihoods[1]);
|
||||
table.set(sample, "homvar_prob", normalizedLikelihoods[2]);
|
||||
table.set(sample, "observed_genotype", obs.getGenotype(sample).getGenotypeString());
|
||||
table.set(sample, "expected_genotype", exp.getGenotype(sample).getGenotypeString());
|
||||
}
|
||||
|
||||
report.print(out);
|
||||
*/
|
||||
|
||||
double logLikelihoodThatDiseaseModelIsSupported = getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, true);
|
||||
double logLikelihoodThatDiseaseModelIsNotSupported = getLogLikelihoodOfDiseaseModelHypothesis(obs, exp, false);
|
||||
double logOddsRatioThatDiseaseModelIsSupported = logLikelihoodThatDiseaseModelIsSupported / logLikelihoodThatDiseaseModelIsNotSupported;
|
||||
|
||||
String key = String.format("%s_%s_%s", ref.getLocus().getContig(), ref.getLocus().getStart(), scenarioId);
|
||||
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "scenario", scenarioId);
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "chr", ref.getLocus().getContig());
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "start", ref.getLocus().getStart());
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "P_of_C_given_DM_is_true", logLikelihoodThatDiseaseModelIsSupported);
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "P_of_C_given_DM_is_false", logLikelihoodThatDiseaseModelIsNotSupported);
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, "OR_DM_is_true_vs_DM_is_false", logOddsRatioThatDiseaseModelIsSupported);
|
||||
|
||||
for ( String sample : availableSamples ) {
|
||||
String obsAndExpectedGenotypes = String.format("%s;%s", obs.getGenotype(sample).getGenotypeString(), exp.getGenotype(sample).getGenotypeString());
|
||||
consistencyReport.getTable("AffectedConsistency").set(key, sample, obsAndExpectedGenotypes);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
consistencyReport.print(out);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue