better handling of edge cases (zero coverage, reference mistakes, etc.)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@747 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-05-18 18:04:37 +00:00
parent 7c615c8fb0
commit 3441795d9c
3 changed files with 63 additions and 14 deletions

View File

@ -98,6 +98,8 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
{ {
if (ref == 'N') { return null; }
// 1. seperate each context. // 1. seperate each context.
LocusContext[] contexts = filterLocusContext(context, sample_names, 0); LocusContext[] contexts = filterLocusContext(context, sample_names, 0);
@ -114,7 +116,31 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
double[] trajectory = new double[MAX_ITERATIONS + 1]; trajectory[0] = EM_alt_freq; double[] trajectory = new double[MAX_ITERATIONS + 1]; trajectory[0] = EM_alt_freq;
double[] likelihood_trajectory = new double[MAX_ITERATIONS + 1]; likelihood_trajectory[0] = 0.0; double[] likelihood_trajectory = new double[MAX_ITERATIONS + 1]; likelihood_trajectory[0] = 0.0;
boolean is_a_snp = false; boolean is_a_snp = false;
// Pick the alternate base
char alt = 'N'; char alt = 'N';
{
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases();
int A = 0;
int C = 0;
int G = 0;
int T = 0;
int max_count = -1;
for (int i = 0; i < bases.length(); i++)
{
char b = bases.charAt(i);
if (b == ref) { continue; }
switch (b)
{
case 'A' : A += 1; if (A > max_count) { max_count = A; alt = 'A'; } break;
case 'C' : C += 1; if (C > max_count) { max_count = C; alt = 'C'; } break;
case 'G' : G += 1; if (G > max_count) { max_count = G; alt = 'G'; } break;
case 'T' : T += 1; if (T > max_count) { max_count = T; alt = 'T'; } break;
}
}
}
for (int iterations = 0; iterations < MAX_ITERATIONS; iterations++) for (int iterations = 0; iterations < MAX_ITERATIONS; iterations++)
{ {
// 6. Re-call from shallow coverage using the estimated frequency as a prior, // 6. Re-call from shallow coverage using the estimated frequency as a prior,
@ -128,12 +154,10 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
for (int i = 0; i < sample_names.size(); i++) for (int i = 0; i < sample_names.size(); i++)
{ {
callers.get(i).setAlleleFrequencyPrior(EM_alt_freq); callers.get(i).setAlleleFrequencyPrior(EM_alt_freq, alt);
calls[i] = callers.get(i).map(tracker, ref, contexts[i]); calls[i] = callers.get(i).map(tracker, ref, contexts[i]);
String genotype = calls[i].genotype(); String genotype = calls[i].genotype();
if (calls[i].alt != 'N') { alt = calls[i].alt; }
likelihood += calls[i].pBest; likelihood += calls[i].pBest;
if (! FRACTIONAL_COUNTS) if (! FRACTIONAL_COUNTS)
@ -157,7 +181,7 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
} }
} }
EM_alt_freq = EM_sum / EM_N; EM_alt_freq = Math.min(EM_sum / EM_N, 0.99999);
trajectory[iterations+1] = EM_alt_freq; trajectory[iterations+1] = EM_alt_freq;
likelihood_trajectory[iterations+1] = likelihood; likelihood_trajectory[iterations+1] = likelihood;

View File

@ -278,6 +278,12 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
ReadBackedPileup pileup = new ReadBackedPileup(ref, context); ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases(); String bases = pileup.getBases();
if (bases.length() == 0)
{
GenotypeLikelihoods G = new GenotypeLikelihoods();
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, sample_name);
}
List<SAMRecord> reads = context.getReads(); List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets(); List<Integer> offsets = context.getOffsets();
ref = Character.toUpperCase(ref); ref = Character.toUpperCase(ref);
@ -305,7 +311,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]); G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
} }
G.ApplyPrior(ref, this.allele_frequency_prior); G.ApplyPrior(ref, this.alt_allele, this.allele_frequency_prior);
if (fourBaseMode) { if (fourBaseMode) {
G.applyFourBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup()); G.applyFourBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup());
@ -333,9 +339,11 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
} }
double allele_frequency_prior = -1; double allele_frequency_prior = -1;
public void setAlleleFrequencyPrior(double freq) char alt_allele;
public void setAlleleFrequencyPrior(double freq, char alt)
{ {
this.allele_frequency_prior = freq; this.allele_frequency_prior = freq;
this.alt_allele = alt;
} }
// Given result of map function // Given result of map function

View File

@ -137,18 +137,24 @@ public class GenotypeLikelihoods {
return s; return s;
} }
public void ApplyPrior(char ref, double p_alt) public void ApplyPrior(char ref, char alt, double p_alt)
{ {
for (int i = 0; i < genotypes.length; i++) { for (int i = 0; i < genotypes.length; i++)
{
if ((p_alt == -1) || (p_alt <= 1e-6)) if ((p_alt == -1) || (p_alt <= 1e-6))
{ {
if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref))
{
// hom-ref // hom-ref
likelihoods[i] += Math.log10(1.0 - 1e-3); likelihoods[i] += Math.log10(1.0 - 1e-3);
} else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { }
else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref))
{
// hom-nonref // hom-nonref
likelihoods[i] += Math.log10(1e-5); likelihoods[i] += Math.log10(1e-5);
} else { }
else
{
// het // het
likelihoods[i] += Math.log10(1e-3); likelihoods[i] += Math.log10(1e-3);
} }
@ -156,16 +162,27 @@ public class GenotypeLikelihoods {
} }
else else
{ {
if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref))
{
// hom-ref // hom-ref
likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt);
} else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { }
else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt))
{
// hom-nonref // hom-nonref
likelihoods[i] += 2.0 * Math.log10(p_alt); likelihoods[i] += 2.0 * Math.log10(p_alt);
} else { }
else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) ||
((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt)))
{
// het // het
likelihoods[i] += Math.log10((1.0-p_alt) * p_alt * 2.0); likelihoods[i] += Math.log10((1.0-p_alt) * p_alt * 2.0);
} }
else
{
// something else (noise!)
likelihoods[i] += Math.log10(1e-5);
}
if (Double.isInfinite(likelihoods[i])) { likelihoods[i] = -1000; } if (Double.isInfinite(likelihoods[i])) { likelihoods[i] = -1000; }
} }