Added a ROD (SangerSNPROD, registered under the module name "SangerSNP") for parsing Sanger's chr20 pilot1 SNP calls.
Some doodling around with indel calling in an EM context. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1116 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ceeeec13b8
commit
65a788f18a
|
|
@ -40,7 +40,7 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
||||||
public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); }
|
public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); }
|
||||||
public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); }
|
public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); }
|
||||||
public boolean isGenotype() { return false; }
|
public boolean isGenotype() { return false; }
|
||||||
public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; }
|
public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")); }
|
||||||
public double getConsensusConfidence() { return -1; }
|
public double getConsensusConfidence() { return -1; }
|
||||||
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
||||||
public int getPloidy() throws IllegalStateException { return 2; }
|
public int getPloidy() throws IllegalStateException { return 2; }
|
||||||
|
|
|
||||||
|
|
@ -67,6 +67,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
addModule("Table", TabularROD.class);
|
addModule("Table", TabularROD.class);
|
||||||
addModule("PooledEM", PooledEMSNPROD.class);
|
addModule("PooledEM", PooledEMSNPROD.class);
|
||||||
addModule("1KGSNPs", KGenomesSNPROD.class);
|
addModule("1KGSNPs", KGenomesSNPROD.class);
|
||||||
|
addModule("SangerSNP", SangerSNPROD.class);
|
||||||
addModule("Intervals", IntervalRod.class);
|
addModule("Intervals", IntervalRod.class);
|
||||||
addModule("Variants", rodVariants.class);
|
addModule("Variants", rodVariants.class);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
|
public class SangerSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
||||||
|
public SangerSNPROD(final String name) {
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenomeLoc getLocation() {
|
||||||
|
loc = GenomeLocParser.createGenomeLoc(this.get("0"), Long.parseLong(this.get("1")));
|
||||||
|
return loc;
|
||||||
|
}
|
||||||
|
public String getRefBasesFWD() { return this.get("2"); }
|
||||||
|
public char getRefSnpFWD() throws IllegalStateException { return getRefBasesFWD().charAt(0); }
|
||||||
|
public String getAltBasesFWD() { return this.get("3"); }
|
||||||
|
public char getAltSnpFWD() throws IllegalStateException { return getAltBasesFWD().charAt(0); }
|
||||||
|
public boolean isReference() { return getVariationConfidence() < 0.01; }
|
||||||
|
public boolean isSNP() { return ! isReference(); }
|
||||||
|
public boolean isInsertion() { return false; }
|
||||||
|
public boolean isDeletion() { return false; }
|
||||||
|
public boolean isIndel() { return false; }
|
||||||
|
public double getMAF() { return -1; }
|
||||||
|
public double getHeterozygosity() { return -1; }
|
||||||
|
public boolean isGenotype() { return false; }
|
||||||
|
public double getVariationConfidence() { return -1; }
|
||||||
|
public double getConsensusConfidence() { return -1; }
|
||||||
|
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
||||||
|
public int getPloidy() throws IllegalStateException { return 2; }
|
||||||
|
public boolean isBiallelic() { return true; }
|
||||||
|
|
||||||
|
// SNPCallFromGenotypes interface
|
||||||
|
public int nIndividuals() { return -1; }
|
||||||
|
public int nHomRefGenotypes() { return -1; }
|
||||||
|
public int nHetGenotypes() { return -1; }
|
||||||
|
public int nHomVarGenotypes() { return -1; }
|
||||||
|
public List<Genotype> getGenotypes() { return null; }
|
||||||
|
}
|
||||||
|
|
@ -28,6 +28,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
@Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT;
|
@Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT;
|
||||||
@Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT;
|
@Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT;
|
||||||
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
|
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
|
||||||
|
@Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false;
|
||||||
|
|
||||||
// Private state.
|
// Private state.
|
||||||
List<String> sample_names;
|
List<String> sample_names;
|
||||||
|
|
@ -111,7 +112,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
|
|
||||||
char ref;
|
char ref;
|
||||||
|
|
||||||
GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods)
|
GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods, double indel_alt_freq)
|
||||||
{
|
{
|
||||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||||
String bases = pileup.getBases();
|
String bases = pileup.getBases();
|
||||||
|
|
@ -126,22 +127,6 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
List<Integer> offsets = context.getOffsets();
|
List<Integer> offsets = context.getOffsets();
|
||||||
ref = Character.toUpperCase(ref);
|
ref = Character.toUpperCase(ref);
|
||||||
|
|
||||||
/*
|
|
||||||
// Handle indels.
|
|
||||||
if (call_indels)
|
|
||||||
{
|
|
||||||
String[] indels = BasicPileup.indelPileup(reads, offsets);
|
|
||||||
IndelCall indel_call = GenotypeLikelihoods.callIndel(indels);
|
|
||||||
if (indel_call != null)
|
|
||||||
{
|
|
||||||
if (! indel_call.type.equals("ref"))
|
|
||||||
{
|
|
||||||
System.out.printf("INDEL %s %s\n", context.getLocation(), indel_call);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Handle single-base polymorphisms.
|
// Handle single-base polymorphisms.
|
||||||
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
||||||
for ( int i = 0; i < reads.size(); i++ )
|
for ( int i = 0; i < reads.size(); i++ )
|
||||||
|
|
@ -152,6 +137,21 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
}
|
}
|
||||||
G.ApplyPrior(ref, allele_likelihoods);
|
G.ApplyPrior(ref, allele_likelihoods);
|
||||||
|
|
||||||
|
// Handle indels
|
||||||
|
if (CALL_INDELS)
|
||||||
|
{
|
||||||
|
String[] indels = BasicPileup.indelPileup(reads, offsets);
|
||||||
|
IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq);
|
||||||
|
if (indel_call.getType() != null)
|
||||||
|
{
|
||||||
|
G.addIndelLikelihood(indel_call);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
G.addIndelLikelihood(null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
// Handle 2nd-best base calls.
|
// Handle 2nd-best base calls.
|
||||||
if (fourBaseMode && pileup.getBases().length() < 750)
|
if (fourBaseMode && pileup.getBases().length() < 750)
|
||||||
|
|
@ -163,7 +163,6 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
return G;
|
return G;
|
||||||
}
|
}
|
||||||
|
|
||||||
// thoughly check this function
|
|
||||||
double[] CountFreqs(GenotypeLikelihoods[] genotype_likelihoods)
|
double[] CountFreqs(GenotypeLikelihoods[] genotype_likelihoods)
|
||||||
{
|
{
|
||||||
double[] allele_likelihoods = new double[4];
|
double[] allele_likelihoods = new double[4];
|
||||||
|
|
@ -199,6 +198,35 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
return allele_likelihoods;
|
return allele_likelihoods;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
|
||||||
|
{
|
||||||
|
HashMap<String, Double> indel_allele_likelihoods = new HashMap<String, Double>();
|
||||||
|
|
||||||
|
double pRef = 0;
|
||||||
|
double pAlt = 0;
|
||||||
|
|
||||||
|
for (int j = 0; j < sample_names.size(); j++)
|
||||||
|
{
|
||||||
|
double personal_pRef = 0;
|
||||||
|
double personal_pAlt = 0;
|
||||||
|
|
||||||
|
IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood();
|
||||||
|
personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet());
|
||||||
|
personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet());
|
||||||
|
|
||||||
|
personal_pRef = personal_pRef / (personal_pAlt + personal_pRef);
|
||||||
|
personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef);
|
||||||
|
|
||||||
|
pRef += personal_pRef;
|
||||||
|
pAlt += personal_pAlt;
|
||||||
|
}
|
||||||
|
|
||||||
|
pRef = pRef / (pRef + pAlt);
|
||||||
|
pAlt = pAlt / (pRef + pAlt);
|
||||||
|
|
||||||
|
return pAlt;
|
||||||
|
}
|
||||||
|
|
||||||
// Potential precision error here.
|
// Potential precision error here.
|
||||||
double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods)
|
double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods)
|
||||||
{
|
{
|
||||||
|
|
@ -223,7 +251,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
||||||
for (int j = 0; j < sample_names.size(); j++)
|
for (int j = 0; j < sample_names.size(); j++)
|
||||||
{
|
{
|
||||||
G[j] = Genotype(contexts[j], allele_likelihoods);
|
G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6);
|
||||||
}
|
}
|
||||||
return Compute_pD(G);
|
return Compute_pD(G);
|
||||||
}
|
}
|
||||||
|
|
@ -271,16 +299,22 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
||||||
if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999)
|
if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999)
|
||||||
else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999)
|
else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999)
|
||||||
}
|
}
|
||||||
|
double indel_alt_freq = 1e-4;
|
||||||
|
|
||||||
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
||||||
for (int i = 0; i < MAX_ITERATIONS; i++)
|
for (int i = 0; i < MAX_ITERATIONS; i++)
|
||||||
{
|
{
|
||||||
for (int j = 0; j < sample_names.size(); j++)
|
for (int j = 0; j < sample_names.size(); j++)
|
||||||
{
|
{
|
||||||
G[j] = Genotype(contexts[j], allele_likelihoods);
|
G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq);
|
||||||
}
|
}
|
||||||
|
|
||||||
allele_likelihoods = CountFreqs(G);
|
allele_likelihoods = CountFreqs(G);
|
||||||
|
|
||||||
|
if (CALL_INDELS)
|
||||||
|
{
|
||||||
|
indel_alt_freq = CountIndelFreq(G);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new EM_Result(G, allele_likelihoods);
|
return new EM_Result(G, allele_likelihoods);
|
||||||
|
|
|
||||||
|
|
@ -155,6 +155,8 @@ public class GenotypeLikelihoods {
|
||||||
|
|
||||||
public void add(char ref, char read, byte qual)
|
public void add(char ref, char read, byte qual)
|
||||||
{
|
{
|
||||||
|
if (qual <= 0) { qual = 1; }
|
||||||
|
|
||||||
if (coverage == 0)
|
if (coverage == 0)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < likelihoods.length; i++)
|
for (int i = 0; i < likelihoods.length; i++)
|
||||||
|
|
@ -398,4 +400,9 @@ public class GenotypeLikelihoods {
|
||||||
AFE.genotypeLikelihoods = this;
|
AFE.genotypeLikelihoods = this;
|
||||||
return AFE;
|
return AFE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private IndelLikelihood indel_likelihood;
|
||||||
|
/**
 * Stores the given indel likelihood on this object, replacing whatever was stored before.
 * Despite the "add" name this is a plain assignment, not an accumulation; null is stored as-is.
 *
 * @param indel_likelihood the value to store (may be null)
 */
public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; }
|
||||||
|
/**
 * Returns the indel likelihood last stored via addIndelLikelihood.
 *
 * @return the stored value; null if none was stored or if null was stored explicitly
 */
public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,17 @@ public class IndelLikelihood {
|
||||||
private double p;
|
private double p;
|
||||||
private double lod;
|
private double lod;
|
||||||
|
|
||||||
|
private double pRef;
|
||||||
|
private double pHet;
|
||||||
|
private double pHom;
|
||||||
|
private String alt;
|
||||||
|
|
||||||
public IndelLikelihood(String type, String[] alleles, double p, double lod) {
|
public IndelLikelihood(String type, String[] alleles, double p, double lod) {
|
||||||
initialize(type, alleles, p, lod);
|
initialize(type, alleles, p, lod);
|
||||||
}
|
}
|
||||||
|
|
||||||
public IndelLikelihood(String[] indels) {
|
public IndelLikelihood(String[] indels, double indel_alt_freq)
|
||||||
|
{
|
||||||
HashMap<String,Integer> indel_allele_counts = new HashMap<String,Integer>();
|
HashMap<String,Integer> indel_allele_counts = new HashMap<String,Integer>();
|
||||||
|
|
||||||
for (int i = 0; i < indels.length; i++) {
|
for (int i = 0; i < indels.length; i++) {
|
||||||
|
|
@ -43,9 +49,9 @@ public class IndelLikelihood {
|
||||||
//System.out.printf("\n");
|
//System.out.printf("\n");
|
||||||
|
|
||||||
double eps = 1e-3;
|
double eps = 1e-3;
|
||||||
double pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + Math.log10(0.999);
|
pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq);
|
||||||
double pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10(1e-3);
|
pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq);
|
||||||
double pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + Math.log10(1e-5);
|
pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq);
|
||||||
|
|
||||||
double lodRef = pRef - Math.max(pHet, pHom);
|
double lodRef = pRef - Math.max(pHet, pHom);
|
||||||
double lodHet = pHet - pRef;
|
double lodHet = pHet - pRef;
|
||||||
|
|
@ -91,6 +97,11 @@ public class IndelLikelihood {
|
||||||
this.lod = lod;
|
this.lod = lod;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the alt field.
 * NOTE(review): no assignment to alt is visible in this part of the file, so this may
 * return null -- confirm where it is populated.
 */
public String getAlt() { return alt; }
|
||||||
|
/**
 * Returns the pRef field. The (String[] indels, double indel_alt_freq) constructor assigns it
 * a sum of log10 terms, so it is a log10-scale value, not a probability in [0, 1].
 * NOTE(review): presumably the homozygous-reference genotype score -- confirm.
 */
public double pRef() { return pRef; }
|
||||||
|
/**
 * Returns the pHet field. The (String[] indels, double indel_alt_freq) constructor assigns it
 * a sum of log10 terms, so it is a log10-scale value, not a probability in [0, 1].
 * NOTE(review): presumably the heterozygous genotype score -- confirm.
 */
public double pHet() { return pHet; }
|
||||||
|
/**
 * Returns the pHom field. The (String[] indels, double indel_alt_freq) constructor assigns it
 * a sum of log10 terms, so it is a log10-scale value, not a probability in [0, 1].
 * NOTE(review): presumably the homozygous-alternate genotype score -- confirm.
 */
public double pHom() { return pHom; }
|
||||||
|
|
||||||
public String getType() { return type; }
|
public String getType() { return type; }
|
||||||
public String[] getAlleles() { return alleles; }
|
public String[] getAlleles() { return alleles; }
|
||||||
public double getPosteriorProbability() { return p; }
|
public double getPosteriorProbability() { return p; }
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue