Restructured AlleleFrequencyMetricsWalker to correctly report Hapmap concordance numbers for genotyping and added reporting for Hapmap reference/variant calling. Also, tiny bugfix in interval code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@181 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9e2a373184
commit
30babbf5b9
|
|
@ -597,7 +597,7 @@ public class TraversalEngine {
|
|||
|
||||
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
||||
// location
|
||||
if ( ( interval == null && inLocations(locus.getLocation()) ) || interval.overlapsP(locus.getLocation()) ) {
|
||||
if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
|
||||
|
||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||
|
||||
|
|
|
|||
|
|
@ -26,8 +26,10 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
long num_loci_total=0;
|
||||
long num_loci_confident=0;
|
||||
double LOD_cutoff = 5;
|
||||
long hapmap_tp=0;
|
||||
long hapmap_fp=0;
|
||||
long hapmap_genotype_correct = 0;
|
||||
long hapmap_genotype_incorrect = 0;
|
||||
long hapmap_refvar_correct = 0;
|
||||
long hapmap_refvar_incorrect = 0;
|
||||
|
||||
AlleleFrequencyWalker caller;
|
||||
|
||||
|
|
@ -37,70 +39,139 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
|
||||
num_loci_total += 1;
|
||||
|
||||
boolean is_dbSNP_SNP = false;
|
||||
boolean has_hapmap_chip_genotype = false;
|
||||
rodGFF hapmap_chip_genotype = null;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : rodData )
|
||||
{
|
||||
if ( datum != null )
|
||||
{
|
||||
if ( datum instanceof rodDbSNP )
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
|
||||
if ( datum instanceof rodGFF )
|
||||
{
|
||||
has_hapmap_chip_genotype = true;
|
||||
hapmap_chip_genotype = (rodGFF)datum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
||||
|
||||
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff)
|
||||
if (alleleFreq.qstar > 0.0 && alleleFreq.LOD >= LOD_cutoff)
|
||||
{
|
||||
// Confident variant.
|
||||
|
||||
num_variants += 1;
|
||||
|
||||
boolean is_dbSNP_SNP = false;
|
||||
boolean hapmap_hit = false;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : rodData )
|
||||
{
|
||||
if ( datum != null )
|
||||
{
|
||||
if ( datum instanceof rodDbSNP )
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
|
||||
if ( datum instanceof rodGFF )
|
||||
{
|
||||
rodGFF hapmap = (rodGFF) datum;
|
||||
String hapmap_genotype = hapmap.getFeature();
|
||||
String called_genotype = alleleFreq.asString();
|
||||
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
|
||||
// There is a hapmap site here, is it correct?
|
||||
if (hapmap_genotype.equals(called_genotype))
|
||||
{
|
||||
hapmap_tp++;
|
||||
} else {
|
||||
hapmap_fp++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_dbSNP_SNP)
|
||||
{
|
||||
dbsnp_hits += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert qstar and LOD to give best qstar and a positive LOD for that qstar
|
||||
double qstar;
|
||||
double LOD;
|
||||
if (alleleFreq.LOD < 0) {
|
||||
qstar = 0.0;
|
||||
LOD = -1 * alleleFreq.LOD;
|
||||
}else{
|
||||
qstar = alleleFreq.qstar;
|
||||
LOD = alleleFreq.LOD;
|
||||
}
|
||||
|
||||
//long confident_calls_at_hapmap_chip_sites = 0;
|
||||
if (LOD >= LOD_cutoff && has_hapmap_chip_genotype) {
|
||||
//confident_calls_at_hapmap_chip_sites++;
|
||||
|
||||
// convert hapmap call to mixture of ref/nonref
|
||||
String hapmap_genotype = hapmap_chip_genotype.getFeature();
|
||||
long refs=0, alts=0;
|
||||
double hapmap_q;
|
||||
|
||||
for (char c : hapmap_genotype.toCharArray()) {
|
||||
if (c == alleleFreq.ref) { refs++; }
|
||||
if (c == alleleFreq.alt) { alts++; }
|
||||
}
|
||||
|
||||
if (refs+alts > 0) {
|
||||
hapmap_q = (float)alts / (refs+alts);
|
||||
}else{
|
||||
hapmap_q = -1;
|
||||
}
|
||||
|
||||
// Hapmap debug info
|
||||
System.out.format("HAPMAP %.2f %.2f %.2f ", hapmap_q, qstar, LOD);
|
||||
String called_genotype = alleleFreq.asString();
|
||||
System.out.format("%s %s %c %c %.2f", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt, alleleFreq.LOD);
|
||||
|
||||
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
|
||||
if (hapmap_q != -1 && hapmap_q == qstar) {
|
||||
hapmap_genotype_correct++;
|
||||
System.out.print("\n");
|
||||
}else{
|
||||
hapmap_genotype_incorrect++;
|
||||
System.out.printf(" INCORRECT GENOTYPE Bases: %s\n", AlleleFrequencyWalker.getBases(context));
|
||||
AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
|
||||
}
|
||||
|
||||
// Now calculate ref / var performance - did we correctly classify the site as
|
||||
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
|
||||
boolean hapmap_var = hapmap_q != 0.0;
|
||||
boolean called_var = qstar != 0.0;
|
||||
if (hapmap_var != called_var) {
|
||||
hapmap_refvar_incorrect++;
|
||||
}else{
|
||||
hapmap_refvar_correct++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*String hapmap_genotype = hapmap_chip_genotype.getFeature();
|
||||
String called_genotype = alleleFreq.asString();
|
||||
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.LOD);
|
||||
// There is a hapmap site here, is it correct?
|
||||
if (hapmap_genotype.equals(called_genotype))
|
||||
{
|
||||
hapmap_tp++;
|
||||
} else {
|
||||
hapmap_fp++;
|
||||
} */
|
||||
|
||||
return alleleFreq;
|
||||
}
|
||||
|
||||
public void printMetrics()
|
||||
public void printMetrics()
|
||||
{
|
||||
if (num_loci_total == 0) { return; }
|
||||
|
||||
System.out.printf("\n");
|
||||
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||
System.out.printf("METRICS -------------------------------------------------\n");
|
||||
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
|
||||
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
|
||||
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
if (num_variants != 0)
|
||||
{
|
||||
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
|
||||
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
|
||||
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
|
||||
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
|
||||
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
System.out.printf("METRICS -------------------------------------------------\n");
|
||||
System.out.printf("METRICS -- Hapmap Genotyping performance --\n");
|
||||
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||
System.out.printf("METRICS -------------------------------------------------\n");
|
||||
System.out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
|
||||
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
|
@ -118,17 +189,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
|
||||
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
|
||||
{
|
||||
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5))
|
||||
{
|
||||
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
|
||||
alleleFreq.location,
|
||||
alleleFreq.ref,
|
||||
alleleFreq.alt,
|
||||
alleleFreq.qhat,
|
||||
alleleFreq.qstar,
|
||||
alleleFreq.LOD,
|
||||
alleleFreq.depth));
|
||||
}
|
||||
// Print RESULT data for confident calls
|
||||
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
|
||||
|
||||
if (this.num_loci_total % 10000 == 0) { printMetrics(); }
|
||||
|
||||
|
|
|
|||
|
|
@ -12,56 +12,35 @@ import java.util.Arrays;
|
|||
|
||||
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> {
|
||||
|
||||
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
||||
|
||||
// Convert context data into bases and 4-base quals
|
||||
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
||||
|
||||
// Set number of chromosomes, N, to 2 for now
|
||||
int N = 2;
|
||||
|
||||
// Convert bases to CharArray
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
byte[] bases = new byte[numReads];
|
||||
String base_string = "";
|
||||
double[][] quals = new double[numReads][4];
|
||||
int refnum = nuc2num[ref];
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < numReads; i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
bases[i] = read.getReadBases()[offset];
|
||||
base_string += read.getReadString().charAt(offset);
|
||||
int callednum = nuc2num[bases[i]];
|
||||
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||
|
||||
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
|
||||
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
|
||||
for (int b=0; b<4; b++)
|
||||
if (b != callednum)
|
||||
quals[i][b] = nonref_quals;
|
||||
}
|
||||
// Convert context data into bases and 4-base quals
|
||||
String bases = getBases(context);
|
||||
double quals[][] = getOneBaseQuals(context);
|
||||
|
||||
// Count bases
|
||||
int[] base_counts = new int[4];
|
||||
for (byte b : bases)
|
||||
for (byte b : bases.getBytes())
|
||||
base_counts[nuc2num[b]]++;
|
||||
|
||||
|
||||
// Find alternate allele - 2nd most frequent non-ref allele
|
||||
// (maybe we should check for ties and eval both or check most common including quality scores)
|
||||
int altnum = -1; // start with first base numerical identity (0,1,2,3)
|
||||
double altcount = -1; // start with first base count
|
||||
int refnum = nuc2num[ref];
|
||||
|
||||
for (int b=0; b<4; b++) {
|
||||
if ((b != refnum) && (base_counts[b] > altcount)) {
|
||||
altnum = b; altcount = base_counts[b];
|
||||
altnum = b; altcount = base_counts[b];
|
||||
}
|
||||
}
|
||||
assert(altnum != -1);
|
||||
|
||||
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length());
|
||||
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
|
||||
|
||||
// Print dbSNP data if its there
|
||||
if (false) {
|
||||
|
|
@ -76,6 +55,47 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
|||
return alleleFreq;
|
||||
}
|
||||
|
||||
static public String getBases (LocusContext context) {
|
||||
// Convert bases to CharArray
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
//byte[] bases = new byte[numReads];
|
||||
String base_string = "";
|
||||
//int refnum = nuc2num[ref];
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < numReads; i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
//bases[i] = read.getReadBases()[offset];
|
||||
base_string += read.getReadString().charAt(offset);
|
||||
}
|
||||
return base_string;
|
||||
}
|
||||
|
||||
static public double[][] getOneBaseQuals (LocusContext context) {
|
||||
int numReads = context.getReads().size(); //numReads();
|
||||
double[][] quals = new double[numReads][4];
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < numReads; i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
int callednum = nuc2num[read.getReadBases()[offset]];
|
||||
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||
|
||||
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
|
||||
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
|
||||
for (int b=0; b<4; b++)
|
||||
if (b != callednum)
|
||||
quals[i][b] = nonref_quals;
|
||||
}
|
||||
|
||||
return quals;
|
||||
}
|
||||
|
||||
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
|
||||
{
|
||||
|
||||
|
|
@ -114,7 +134,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
|||
double pG = P_G(N, qstar_N);
|
||||
double posterior = pDq + pqG + pG;
|
||||
|
||||
if (posterior > best_posterior)
|
||||
if (posterior > best_posterior)
|
||||
{
|
||||
best_qstar = qstar;
|
||||
best_qhat = q;
|
||||
|
|
@ -124,6 +144,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
|||
best_pqG = pqG;
|
||||
best_pG = pG;
|
||||
}
|
||||
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -230,17 +251,9 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
|||
|
||||
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
|
||||
{
|
||||
if (alleleFreq.LOD >= 5)
|
||||
{
|
||||
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
|
||||
alleleFreq.location,
|
||||
alleleFreq.ref,
|
||||
alleleFreq.alt,
|
||||
alleleFreq.qhat,
|
||||
alleleFreq.qstar,
|
||||
alleleFreq.LOD,
|
||||
alleleFreq.depth));
|
||||
}
|
||||
// Print RESULT data for confident calls
|
||||
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -267,9 +280,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
|||
|
||||
|
||||
|
||||
void print_base_qual_matrix(double [][]quals, int numReads) {
|
||||
static void print_base_qual_matrix(double [][]quals) {
|
||||
// Print quals for debugging
|
||||
System.out.println("Base quality matrix");
|
||||
int numReads = quals.length;
|
||||
for (int b=0; b<4; b++) {
|
||||
System.out.print(num2nuc[b]);
|
||||
for (int i=0; i < numReads; i++){
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ public class AlleleFrequencyEstimate {
|
|||
public double LOD;
|
||||
public int depth;
|
||||
|
||||
public double getQstar()
|
||||
public double getQstar()
|
||||
{
|
||||
return qstar;
|
||||
}
|
||||
|
|
@ -35,6 +35,17 @@ public class AlleleFrequencyEstimate {
|
|||
this.depth = depth;
|
||||
}
|
||||
|
||||
public String asTabularString() {
|
||||
return String.format("RESULT %s %c %c %f %f %f %d\n",
|
||||
location,
|
||||
ref,
|
||||
alt,
|
||||
qhat,
|
||||
qstar,
|
||||
LOD,
|
||||
depth);
|
||||
}
|
||||
|
||||
public String asString() {
|
||||
// Print out the called bases
|
||||
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
||||
|
|
|
|||
Loading…
Reference in New Issue