Added a ROD (SangerSNP) for parsing Sanger's chr20 pilot-1 SNP calls.
Some doodling around with indel calling in an EM context. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1116 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ceeeec13b8
commit
65a788f18a
|
|
@ -40,7 +40,7 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
|||
public double getMAF() { return Double.parseDouble(this.get("EM_alt_freq")); }
|
||||
public double getHeterozygosity() { return 2 * getMAF() * (1 - getMAF()); }
|
||||
public boolean isGenotype() { return false; }
|
||||
public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")) * 10; }
|
||||
public double getVariationConfidence() { return Double.parseDouble(this.get("discovery_lod")); }
|
||||
public double getConsensusConfidence() { return -1; }
|
||||
public List<String> getGenotype() throws IllegalStateException { throw new IllegalStateException(); }
|
||||
public int getPloidy() throws IllegalStateException { return 2; }
|
||||
|
|
@ -52,4 +52,4 @@ public class PooledEMSNPROD extends TabularROD implements SNPCallFromGenotypes {
|
|||
public int nHetGenotypes() { return Integer.parseInt(this.get("n_het")); }
|
||||
public int nHomVarGenotypes() { return Integer.parseInt(this.get("n_hom")); }
|
||||
public List<Genotype> getGenotypes() { return null; }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
addModule("Table", TabularROD.class);
|
||||
addModule("PooledEM", PooledEMSNPROD.class);
|
||||
addModule("1KGSNPs", KGenomesSNPROD.class);
|
||||
addModule("SangerSNP", SangerSNPROD.class);
|
||||
addModule("Intervals", IntervalRod.class);
|
||||
addModule("Variants", rodVariants.class);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;

import java.util.*;

import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;

/**
 * Reference-ordered datum for Sanger chr20 pilot-1 SNP calls, parsed from a
 * tabular file. Column layout: 0 = contig name, 1 = position,
 * 2 = reference allele, 3 = alternate allele.
 *
 * Site-level only: frequency, confidence, and per-individual genotype counts
 * are reported as sentinel values (-1), and getGenotype() always throws.
 */
public class SangerSNPROD extends TabularROD implements SNPCallFromGenotypes {

    public SangerSNPROD(final String name) {
        super(name);
    }

    /** Builds this record's genomic position from columns 0 (contig) and 1 (position). */
    public GenomeLoc getLocation() {
        // NOTE(review): 'loc' is not declared in this class — presumably a field
        // inherited from TabularROD; confirm.
        loc = GenomeLocParser.createGenomeLoc(this.get("0"), Long.parseLong(this.get("1")));
        return loc;
    }

    // ---- allele accessors (forward strand) ----

    public String getRefBasesFWD() {
        return this.get("2");
    }

    public char getRefSnpFWD() throws IllegalStateException {
        return getRefBasesFWD().charAt(0);
    }

    public String getAltBasesFWD() {
        return this.get("3");
    }

    public char getAltSnpFWD() throws IllegalStateException {
        return getAltBasesFWD().charAt(0);
    }

    // ---- variant classification ----

    // NOTE(review): getVariationConfidence() below is hard-coded to -1, so
    // isReference() is always true and isSNP() is always false — confirm this
    // placeholder behavior is intended.
    public boolean isReference() {
        return getVariationConfidence() < 0.01;
    }

    public boolean isSNP() {
        return !isReference();
    }

    public boolean isInsertion() {
        return false;
    }

    public boolean isDeletion() {
        return false;
    }

    public boolean isIndel() {
        return false;
    }

    // ---- site-level statistics (not provided by this ROD; -1 sentinels) ----

    public double getMAF() {
        return -1;
    }

    public double getHeterozygosity() {
        return -1;
    }

    public boolean isGenotype() {
        return false;
    }

    public double getVariationConfidence() {
        return -1;
    }

    public double getConsensusConfidence() {
        return -1;
    }

    /** Genotypes are not carried by this ROD; always throws. */
    public List<String> getGenotype() throws IllegalStateException {
        throw new IllegalStateException();
    }

    public int getPloidy() throws IllegalStateException {
        return 2;
    }

    public boolean isBiallelic() {
        return true;
    }

    // ---- SNPCallFromGenotypes interface (genotype data unavailable) ----

    public int nIndividuals() {
        return -1;
    }

    public int nHomRefGenotypes() {
        return -1;
    }

    public int nHetGenotypes() {
        return -1;
    }

    public int nHomVarGenotypes() {
        return -1;
    }

    public List<Genotype> getGenotypes() {
        return null;
    }
}
|
||||
|
|
@ -28,6 +28,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
@Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT;
|
||||
@Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT;
|
||||
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
|
||||
@Argument(fullName="call_indels", shortName="call_indels", required=false, doc="call indels?") public boolean CALL_INDELS = false;
|
||||
|
||||
// Private state.
|
||||
List<String> sample_names;
|
||||
|
|
@ -111,7 +112,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
|
||||
char ref;
|
||||
|
||||
GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods)
|
||||
GenotypeLikelihoods Genotype(LocusContext context, double[] allele_likelihoods, double indel_alt_freq)
|
||||
{
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
String bases = pileup.getBases();
|
||||
|
|
@ -125,22 +126,6 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
ref = Character.toUpperCase(ref);
|
||||
|
||||
/*
|
||||
// Handle indels.
|
||||
if (call_indels)
|
||||
{
|
||||
String[] indels = BasicPileup.indelPileup(reads, offsets);
|
||||
IndelCall indel_call = GenotypeLikelihoods.callIndel(indels);
|
||||
if (indel_call != null)
|
||||
{
|
||||
if (! indel_call.type.equals("ref"))
|
||||
{
|
||||
System.out.printf("INDEL %s %s\n", context.getLocation(), indel_call);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Handle single-base polymorphisms.
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
||||
|
|
@ -152,6 +137,21 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
}
|
||||
G.ApplyPrior(ref, allele_likelihoods);
|
||||
|
||||
// Handle indels
|
||||
if (CALL_INDELS)
|
||||
{
|
||||
String[] indels = BasicPileup.indelPileup(reads, offsets);
|
||||
IndelLikelihood indel_call = new IndelLikelihood(indels, indel_alt_freq);
|
||||
if (indel_call.getType() != null)
|
||||
{
|
||||
G.addIndelLikelihood(indel_call);
|
||||
}
|
||||
else
|
||||
{
|
||||
G.addIndelLikelihood(null);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
// Handle 2nd-best base calls.
|
||||
if (fourBaseMode && pileup.getBases().length() < 750)
|
||||
|
|
@ -163,7 +163,6 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
return G;
|
||||
}
|
||||
|
||||
// thoroughly check this function
|
||||
double[] CountFreqs(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
{
|
||||
double[] allele_likelihoods = new double[4];
|
||||
|
|
@ -199,6 +198,35 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
return allele_likelihoods;
|
||||
}
|
||||
|
||||
double CountIndelFreq(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
{
|
||||
HashMap<String, Double> indel_allele_likelihoods = new HashMap<String, Double>();
|
||||
|
||||
double pRef = 0;
|
||||
double pAlt = 0;
|
||||
|
||||
for (int j = 0; j < sample_names.size(); j++)
|
||||
{
|
||||
double personal_pRef = 0;
|
||||
double personal_pAlt = 0;
|
||||
|
||||
IndelLikelihood indel_likelihood = genotype_likelihoods[j].getIndelLikelihood();
|
||||
personal_pRef += 2*Math.pow(10, indel_likelihood.pRef()) + Math.pow(10, indel_likelihood.pHet());
|
||||
personal_pAlt += 2*Math.pow(10, indel_likelihood.pHom()) + Math.pow(10, indel_likelihood.pHet());
|
||||
|
||||
personal_pRef = personal_pRef / (personal_pAlt + personal_pRef);
|
||||
personal_pAlt = personal_pAlt / (personal_pAlt + personal_pRef);
|
||||
|
||||
pRef += personal_pRef;
|
||||
pAlt += personal_pAlt;
|
||||
}
|
||||
|
||||
pRef = pRef / (pRef + pAlt);
|
||||
pAlt = pAlt / (pRef + pAlt);
|
||||
|
||||
return pAlt;
|
||||
}
|
||||
|
||||
// Potential precision error here.
|
||||
double Compute_pD(GenotypeLikelihoods[] genotype_likelihoods)
|
||||
{
|
||||
|
|
@ -223,7 +251,7 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
||||
for (int j = 0; j < sample_names.size(); j++)
|
||||
{
|
||||
G[j] = Genotype(contexts[j], allele_likelihoods);
|
||||
G[j] = Genotype(contexts[j], allele_likelihoods, 1e-6);
|
||||
}
|
||||
return Compute_pD(G);
|
||||
}
|
||||
|
|
@ -271,16 +299,22 @@ public class MultiSampleCaller extends LocusWalker<String,String>
|
|||
if (i == BaseUtils.simpleBaseToBaseIndex(ref)) { allele_likelihoods[i] = 0.9994999; } //sqrt(0.999)
|
||||
else { allele_likelihoods[i] = 0.0005002502; } // 0.001 / (2 * sqrt(0.999)
|
||||
}
|
||||
double indel_alt_freq = 1e-4;
|
||||
|
||||
GenotypeLikelihoods[] G = new GenotypeLikelihoods[sample_names.size()];
|
||||
for (int i = 0; i < MAX_ITERATIONS; i++)
|
||||
{
|
||||
for (int j = 0; j < sample_names.size(); j++)
|
||||
{
|
||||
G[j] = Genotype(contexts[j], allele_likelihoods);
|
||||
G[j] = Genotype(contexts[j], allele_likelihoods, indel_alt_freq);
|
||||
}
|
||||
|
||||
allele_likelihoods = CountFreqs(G);
|
||||
|
||||
if (CALL_INDELS)
|
||||
{
|
||||
indel_alt_freq = CountIndelFreq(G);
|
||||
}
|
||||
}
|
||||
|
||||
return new EM_Result(G, allele_likelihoods);
|
||||
|
|
|
|||
|
|
@ -155,6 +155,8 @@ public class GenotypeLikelihoods {
|
|||
|
||||
public void add(char ref, char read, byte qual)
|
||||
{
|
||||
if (qual <= 0) { qual = 1; }
|
||||
|
||||
if (coverage == 0)
|
||||
{
|
||||
for (int i = 0; i < likelihoods.length; i++)
|
||||
|
|
@ -398,4 +400,9 @@ public class GenotypeLikelihoods {
|
|||
AFE.genotypeLikelihoods = this;
|
||||
return AFE;
|
||||
}
|
||||
|
||||
// Per-site indel likelihood attached by the caller (null when indel calling
// is disabled or no indel call was made at this site).
private IndelLikelihood indel_likelihood;
|
||||
// Stores the indel likelihood for this site; a null argument is accepted and
// simply records "no indel call".
public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; }
|
||||
// Returns the indel likelihood previously set via addIndelLikelihood (may be null).
public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; }
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,11 +8,17 @@ public class IndelLikelihood {
|
|||
private double p;
|
||||
private double lod;
|
||||
|
||||
private double pRef;
|
||||
private double pHet;
|
||||
private double pHom;
|
||||
private String alt;
|
||||
|
||||
public IndelLikelihood(String type, String[] alleles, double p, double lod) {
|
||||
initialize(type, alleles, p, lod);
|
||||
}
|
||||
|
||||
public IndelLikelihood(String[] indels) {
|
||||
public IndelLikelihood(String[] indels, double indel_alt_freq)
|
||||
{
|
||||
HashMap<String,Integer> indel_allele_counts = new HashMap<String,Integer>();
|
||||
|
||||
for (int i = 0; i < indels.length; i++) {
|
||||
|
|
@ -43,9 +49,9 @@ public class IndelLikelihood {
|
|||
//System.out.printf("\n");
|
||||
|
||||
double eps = 1e-3;
|
||||
double pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + Math.log10(0.999);
|
||||
double pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10(1e-3);
|
||||
double pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + Math.log10(1e-5);
|
||||
pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + 2*Math.log10(1-indel_alt_freq);
|
||||
pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10((1-indel_alt_freq)*indel_alt_freq);
|
||||
pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + 2*Math.log10(indel_alt_freq);
|
||||
|
||||
double lodRef = pRef - Math.max(pHet, pHom);
|
||||
double lodHet = pHet - pRef;
|
||||
|
|
@ -91,6 +97,11 @@ public class IndelLikelihood {
|
|||
this.lod = lod;
|
||||
}
|
||||
|
||||
public String getAlt() { return alt; }
|
||||
public double pRef() { return pRef; }
|
||||
public double pHet() { return pHet; }
|
||||
public double pHom() { return pHom; }
|
||||
|
||||
public String getType() { return type; }
|
||||
public String[] getAlleles() { return alleles; }
|
||||
public double getPosteriorProbability() { return p; }
|
||||
|
|
|
|||
Loading…
Reference in New Issue