From 65a788f18a3f292ac2054407d78c783445ccd1f8 Mon Sep 17 00:00:00 2001 From: jmaguire Date: Mon, 29 Jun 2009 16:32:12 +0000 Subject: [PATCH] Added a ROD (SangerSNP) for parsing the Sanger's chr20 pilot1 SNP calls. Some doodling around with indel calling in an EM context. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1116 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/PooledEMSNPROD.java | 4 +- .../gatk/refdata/ReferenceOrderedData.java | 1 + .../sting/gatk/refdata/SangerSNPROD.java | 41 ++++++++++ .../gatk/walkers/MultiSampleCaller.java | 74 ++++++++++++++----- .../playground/utils/GenotypeLikelihoods.java | 7 ++ .../playground/utils/IndelLikelihood.java | 19 ++++- 6 files changed, 120 insertions(+), 26 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java index 33a2f9ffc..7718e5c31 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PooledEMSNPROD.java @@ -40,7 +40,7 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes { public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); } public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); } public boolean isGenotype() { return false; } - public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; } + public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")); } public double getConsensusConfidence() { return -1; } public List getGenotype() throws IllegalStateException { throw new IllegalStateException(); } public int getPloidy() throws IllegalStateException { return 2; } @@ -52,4 +52,4 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes { public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); } public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); } public List getGenotypes() { return null; } -} \ No newline at end of file +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 6f1c77c43..3f36a12ab 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -67,6 +67,7 @@ public class ReferenceOrderedData implements addModule("Table", TabularROD.class); addModule("PooledEM", PooledEMSNPROD.class); addModule("1KGSNPs", KGenomesSNPROD.class); + addModule("SangerSNP", SangerSNPROD.class); addModule("Intervals", IntervalRod.class); addModule("Variants", rodVariants.class); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java new file mode 100755 index 000000000..d276bdc4c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SangerSNPROD.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.*; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +public class SangerSNPROD extends TabularROD implements SNPCallFromGenotypes { + public SangerSNPROD(final String name) { + super(name); + } + + public GenomeLoc getLocation() { + loc = GenomeLocParser.createGenomeLoc(this.get("0"), Long.parseLong(this.get("1"))); + return loc; + } + public String getRefBasesFWD() { return this.get("2"); } + public char getRefSnpFWD() throws IllegalStateException { return getRefBasesFWD().charAt(0); } + public String getAltBasesFWD() { return this.get("3"); } + public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); } + public boolean isReference() { return getVariationConfidence() < 0.01; } + public boolean isSNP() { return ! isReference(); } + public boolean isInsertion() { return false; } + public boolean isDeletion() { return false; } + public boolean isIndel() { return false; } + public double getMAF() { return -1; } + public double getHeterozygosity() { return -1; } + public boolean isGenotype() { return false; } + public double getVariationConfidence() { return -1; } + public double getConsensusConfidence() { return -1; } + public List getGenotype() throws IllegalStateException { throw new IllegalStateException(); } + public int getPloidy() throws IllegalStateException { return 2; } + public boolean isBiallelic() { return true; } + + // SNPCallFromGenotypes interface + public int nIndividuals() { return -1; } + public int nHomRefGenotypes() { return -1; } + public int nHetGenotypes() { return -1; } + public int nHomVarGenotypes() { return -1; } + public List getGenotypes() { return null; } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java index 7dbd77dbf..11410ce78 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java @@ -28,6 +28,7 @@ public class MultiSampleCaller extends LocusWalker @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT; @Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT; @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; + @Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false; // Private state. List sample_names; @@ -111,7 +112,7 @@ public class MultiSampleCaller extends LocusWalker char ref; - GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods) + GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods, double indel_alt_freq) { ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String bases = pileup.getBases(); @@ -125,22 +126,6 @@ public class MultiSampleCaller extends LocusWalker List reads = context.getReads(); List offsets = context.getOffsets(); ref = Character.toUpperCase(ref); - - /* - // Handle indels. - if (call_indels) - { - String[] indels = BasicPileup.indelPileup(reads, offsets); - IndelCall indel_call = GenotypeLikelihoods.callIndel(indels); - if (indel_call != null) - { - if (! indel_call.type.equals("ref")) - { - System.out.printf("INDEL %s %s\n", context.getLocation(), indel_call); - } - } - } - */ // Handle single-base polymorphisms. GenotypeLikelihoods G = new GenotypeLikelihoods(); @@ -152,6 +137,21 @@ public class MultiSampleCaller extends LocusWalker } G.ApplyPrior(ref, allele_likelihoods); + // Handle indels + if (CALL_INDELS) + { + String[] indels = BasicPileup.indelPileup(reads, offsets); + IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq); + if (indel_call.getType() != null) + { + G.addIndelLikelihood(indel_call); + } + else + { + G.addIndelLikelihood(null); + } + } + /* // Handle 2nd-best base calls. if (fourBaseMode && pileup.getBases().length() < 750) @@ -163,7 +163,6 @@ public class MultiSampleCaller extends LocusWalker return G; } - // thoughly check this function double[] CountFreqs(GenotypeLikelihoods[] genotype_likelihoods) { double[] allele_likelihoods = new double[4]; @@ -199,6 +198,35 @@ public class MultiSampleCaller extends LocusWalker return allele_likelihoods; } + double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods) + { + HashMap indel_allele_likelihoods = new HashMap(); + + double pRef = 0; + double pAlt = 0; + + for (int j = 0; j < sample_names.size(); j++) + { + double personal_pRef = 0; + double personal_pAlt = 0; + + IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood(); + personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet()); + personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet()); + + personal_pRef = personal_pRef / (personal_pAlt + personal_pRef); + personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef); + + pRef += personal_pRef; + pAlt += personal_pAlt; + } + + pRef = pRef / (pRef + pAlt); + pAlt = pAlt / (pRef + pAlt); + + return pAlt; + } + // Potential precision error here. double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods) { @@ -223,7 +251,7 @@ public class MultiSampleCaller extends LocusWalker GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()]; for (int j = 0; j < sample_names.size(); j++) { - G[j] = Genotype(contexts[j], allele_likelihoods); + G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6); } return Compute_pD(G); } @@ -271,16 +299,22 @@ public class MultiSampleCaller extends LocusWalker if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999) else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999) } + double indel_alt_freq = 1e-4; GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()]; for (int i = 0; i < MAX_ITERATIONS; i++) { for (int j = 0; j < sample_names.size(); j++) { - G[j] = Genotype(contexts[j], allele_likelihoods); + G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq); } allele_likelihoods = CountFreqs(G); + + if (CALL_INDELS) + { + indel_alt_freq = CountIndelFreq(G); + } } return new EM_Result(G, allele_likelihoods); diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java index b23aaa255..a1022b0e1 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java @@ -155,6 +155,8 @@ public class GenotypeLikelihoods { public void add(char ref, char read, byte qual) { + if (qual <= 0) { qual = 1; } + if (coverage == 0) { for (int i = 0; i < likelihoods.length; i++) @@ -398,4 +400,9 @@ public class GenotypeLikelihoods { AFE.genotypeLikelihoods = this; return AFE; } + + private IndelLikelihood indel_likelihood; + public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } + public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; } + } diff --git a/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java b/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java index b037e00e0..72b559b01 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java +++ b/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java @@ -8,11 +8,17 @@ public class IndelLikelihood { private double p; private double lod; + private double pRef; + private double pHet; + private double pHom; + private String alt; + public IndelLikelihood(String type, String[] alleles, double p, double lod) { initialize(type, alleles, p, lod); } - public IndelLikelihood(String[] indels) { + public IndelLikelihood(String[] indels, double indel_alt_freq) + { HashMap indel_allele_counts = new HashMap(); for (int i = 0; i < indels.length; i++) { @@ -43,9 +49,9 @@ public class IndelLikelihood { //System.out.printf("\n"); double eps = 1e-3; - double pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + Math.log10(0.999); - double pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10(1e-3); - double pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + Math.log10(1e-5); + pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq); + pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq); + pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq); double lodRef = pRef - Math.max(pHet, pHom); double lodHet = pHet - pRef; @@ -91,6 +97,11 @@ public class IndelLikelihood { this.lod = lod; } + public String getAlt() { return alt; } + public double pRef() { return pRef; } + public double pHet() { return pHet; } + public double pHom() { return pHom; } + public String getType() { return type; } public String[] getAlleles() { return alleles; } public double getPosteriorProbability() { return p; }