Restructured AlleleFrequencyMetricsWalker to correctly report Hapmap concordance numbers for genotyping and added reporting for Hapmap reference/variant calling. Also, tiny bugfix in interval code.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@181 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9e2a373184
commit
30babbf5b9
|
|
@ -597,7 +597,7 @@ public class TraversalEngine {
|
||||||
|
|
||||||
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
||||||
// location
|
// location
|
||||||
if ( ( interval == null && inLocations(locus.getLocation()) ) || interval.overlapsP(locus.getLocation()) ) {
|
if ( ( interval == null && inLocations(locus.getLocation()) ) || (interval != null && interval.overlapsP(locus.getLocation())) ) {
|
||||||
|
|
||||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,10 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
long num_loci_total=0;
|
long num_loci_total=0;
|
||||||
long num_loci_confident=0;
|
long num_loci_confident=0;
|
||||||
double LOD_cutoff = 5;
|
double LOD_cutoff = 5;
|
||||||
long hapmap_tp=0;
|
long hapmap_genotype_correct = 0;
|
||||||
long hapmap_fp=0;
|
long hapmap_genotype_incorrect = 0;
|
||||||
|
long hapmap_refvar_correct = 0;
|
||||||
|
long hapmap_refvar_incorrect = 0;
|
||||||
|
|
||||||
AlleleFrequencyWalker caller;
|
AlleleFrequencyWalker caller;
|
||||||
|
|
||||||
|
|
@ -37,16 +39,9 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
|
|
||||||
num_loci_total += 1;
|
num_loci_total += 1;
|
||||||
|
|
||||||
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
|
||||||
|
|
||||||
if (alleleFreq.getQstar() > 0.0 && alleleFreq.getLOD() >= LOD_cutoff)
|
|
||||||
{
|
|
||||||
// Confident variant.
|
|
||||||
|
|
||||||
num_variants += 1;
|
|
||||||
|
|
||||||
boolean is_dbSNP_SNP = false;
|
boolean is_dbSNP_SNP = false;
|
||||||
boolean hapmap_hit = false;
|
boolean has_hapmap_chip_genotype = false;
|
||||||
|
rodGFF hapmap_chip_genotype = null;
|
||||||
|
|
||||||
for ( ReferenceOrderedDatum datum : rodData )
|
for ( ReferenceOrderedDatum datum : rodData )
|
||||||
{
|
{
|
||||||
|
|
@ -60,20 +55,19 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
|
|
||||||
if ( datum instanceof rodGFF )
|
if ( datum instanceof rodGFF )
|
||||||
{
|
{
|
||||||
rodGFF hapmap = (rodGFF) datum;
|
has_hapmap_chip_genotype = true;
|
||||||
String hapmap_genotype = hapmap.getFeature();
|
hapmap_chip_genotype = (rodGFF)datum;
|
||||||
String called_genotype = alleleFreq.asString();
|
}
|
||||||
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
|
}
|
||||||
// There is a hapmap site here, is it correct?
|
}
|
||||||
if (hapmap_genotype.equals(called_genotype))
|
|
||||||
|
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
||||||
|
|
||||||
|
if (alleleFreq.qstar > 0.0 && alleleFreq.LOD >= LOD_cutoff)
|
||||||
{
|
{
|
||||||
hapmap_tp++;
|
// Confident variant.
|
||||||
} else {
|
|
||||||
hapmap_fp++;
|
num_variants += 1;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_dbSNP_SNP)
|
if (is_dbSNP_SNP)
|
||||||
{
|
{
|
||||||
|
|
@ -81,6 +75,75 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert qstar and LOD to give best qstar and a positive LOD for that qstar
|
||||||
|
double qstar;
|
||||||
|
double LOD;
|
||||||
|
if (alleleFreq.LOD < 0) {
|
||||||
|
qstar = 0.0;
|
||||||
|
LOD = -1 * alleleFreq.LOD;
|
||||||
|
}else{
|
||||||
|
qstar = alleleFreq.qstar;
|
||||||
|
LOD = alleleFreq.LOD;
|
||||||
|
}
|
||||||
|
|
||||||
|
//long confident_calls_at_hapmap_chip_sites = 0;
|
||||||
|
if (LOD >= LOD_cutoff && has_hapmap_chip_genotype) {
|
||||||
|
//confident_calls_at_hapmap_chip_sites++;
|
||||||
|
|
||||||
|
// convert hapmap call to mixture of ref/nonref
|
||||||
|
String hapmap_genotype = hapmap_chip_genotype.getFeature();
|
||||||
|
long refs=0, alts=0;
|
||||||
|
double hapmap_q;
|
||||||
|
|
||||||
|
for (char c : hapmap_genotype.toCharArray()) {
|
||||||
|
if (c == alleleFreq.ref) { refs++; }
|
||||||
|
if (c == alleleFreq.alt) { alts++; }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (refs+alts > 0) {
|
||||||
|
hapmap_q = (float)alts / (refs+alts);
|
||||||
|
}else{
|
||||||
|
hapmap_q = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hapmap debug info
|
||||||
|
System.out.format("HAPMAP %.2f %.2f %.2f ", hapmap_q, qstar, LOD);
|
||||||
|
String called_genotype = alleleFreq.asString();
|
||||||
|
System.out.format("%s %s %c %c %.2f", hapmap_genotype, called_genotype, alleleFreq.ref, alleleFreq.alt, alleleFreq.LOD);
|
||||||
|
|
||||||
|
// Calculate genotyping performance - did we get the correct genotype of the N+1 choices?
|
||||||
|
if (hapmap_q != -1 && hapmap_q == qstar) {
|
||||||
|
hapmap_genotype_correct++;
|
||||||
|
System.out.print("\n");
|
||||||
|
}else{
|
||||||
|
hapmap_genotype_incorrect++;
|
||||||
|
System.out.printf(" INCORRECT GENOTYPE Bases: %s\n", AlleleFrequencyWalker.getBases(context));
|
||||||
|
AlleleFrequencyWalker.print_base_qual_matrix(AlleleFrequencyWalker.getOneBaseQuals(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now calculate ref / var performance - did we correctly classify the site as
|
||||||
|
// reference or variant without regard to genotype; i.e. het/hom "miscalls" don't matter here
|
||||||
|
boolean hapmap_var = hapmap_q != 0.0;
|
||||||
|
boolean called_var = qstar != 0.0;
|
||||||
|
if (hapmap_var != called_var) {
|
||||||
|
hapmap_refvar_incorrect++;
|
||||||
|
}else{
|
||||||
|
hapmap_refvar_correct++;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/*String hapmap_genotype = hapmap_chip_genotype.getFeature();
|
||||||
|
String called_genotype = alleleFreq.asString();
|
||||||
|
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.LOD);
|
||||||
|
// There is a hapmap site here, is it correct?
|
||||||
|
if (hapmap_genotype.equals(called_genotype))
|
||||||
|
{
|
||||||
|
hapmap_tp++;
|
||||||
|
} else {
|
||||||
|
hapmap_fp++;
|
||||||
|
} */
|
||||||
|
|
||||||
return alleleFreq;
|
return alleleFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -97,10 +160,18 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
{
|
{
|
||||||
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||||
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
|
System.out.printf("METRICS -------------------------------------------------\n");
|
||||||
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
|
System.out.printf("METRICS -- Hapmap Genotyping performance --\n");
|
||||||
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
|
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_genotype_correct + hapmap_genotype_incorrect);
|
||||||
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
|
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_genotype_correct);
|
||||||
|
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_genotype_incorrect);
|
||||||
|
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_genotype_correct / (float)(hapmap_genotype_correct + hapmap_genotype_incorrect));
|
||||||
|
System.out.printf("METRICS -------------------------------------------------\n");
|
||||||
|
System.out.printf("METRICS -- Hapmap Reference/Variant performance --\n");
|
||||||
|
System.out.printf("METRICS Num. conf. calls at Hapmap chip sites : %d\n", hapmap_refvar_correct + hapmap_refvar_incorrect);
|
||||||
|
System.out.printf("METRICS Conf. calls at chip sites correct : %d\n", hapmap_refvar_correct);
|
||||||
|
System.out.printf("METRICS Conf. calls at chip sites incorrect : %d\n", hapmap_refvar_incorrect);
|
||||||
|
System.out.printf("METRICS %% of confident calls that are correct : %.2f%%\n", 100.0 * (float) hapmap_refvar_correct / (float)(hapmap_refvar_correct + hapmap_refvar_incorrect));
|
||||||
}
|
}
|
||||||
System.out.println();
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
@ -118,17 +189,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
|
|
||||||
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
|
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
|
||||||
{
|
{
|
||||||
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5))
|
// Print RESULT data for confident calls
|
||||||
{
|
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
|
||||||
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
|
|
||||||
alleleFreq.location,
|
|
||||||
alleleFreq.ref,
|
|
||||||
alleleFreq.alt,
|
|
||||||
alleleFreq.qhat,
|
|
||||||
alleleFreq.qstar,
|
|
||||||
alleleFreq.LOD,
|
|
||||||
alleleFreq.depth));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.num_loci_total % 10000 == 0) { printMetrics(); }
|
if (this.num_loci_total % 10000 == 0) { printMetrics(); }
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,47 +12,26 @@ import java.util.Arrays;
|
||||||
|
|
||||||
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> {
|
public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstimate, Integer> {
|
||||||
|
|
||||||
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
|
||||||
|
|
||||||
// Convert context data into bases and 4-base quals
|
public AlleleFrequencyEstimate map(List<ReferenceOrderedDatum> rodData, char ref, LocusContext context) {
|
||||||
|
|
||||||
// Set number of chromosomes, N, to 2 for now
|
// Set number of chromosomes, N, to 2 for now
|
||||||
int N = 2;
|
int N = 2;
|
||||||
|
|
||||||
// Convert bases to CharArray
|
// Convert context data into bases and 4-base quals
|
||||||
int numReads = context.getReads().size(); //numReads();
|
String bases = getBases(context);
|
||||||
byte[] bases = new byte[numReads];
|
double quals[][] = getOneBaseQuals(context);
|
||||||
String base_string = "";
|
|
||||||
double[][] quals = new double[numReads][4];
|
|
||||||
int refnum = nuc2num[ref];
|
|
||||||
|
|
||||||
List<SAMRecord> reads = context.getReads();
|
|
||||||
List<Integer> offsets = context.getOffsets();
|
|
||||||
for (int i =0; i < numReads; i++ ) {
|
|
||||||
SAMRecord read = reads.get(i);
|
|
||||||
int offset = offsets.get(i);
|
|
||||||
|
|
||||||
bases[i] = read.getReadBases()[offset];
|
|
||||||
base_string += read.getReadString().charAt(offset);
|
|
||||||
int callednum = nuc2num[bases[i]];
|
|
||||||
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
|
||||||
|
|
||||||
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
|
|
||||||
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
|
|
||||||
for (int b=0; b<4; b++)
|
|
||||||
if (b != callednum)
|
|
||||||
quals[i][b] = nonref_quals;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count bases
|
// Count bases
|
||||||
int[] base_counts = new int[4];
|
int[] base_counts = new int[4];
|
||||||
for (byte b : bases)
|
for (byte b : bases.getBytes())
|
||||||
base_counts[nuc2num[b]]++;
|
base_counts[nuc2num[b]]++;
|
||||||
|
|
||||||
// Find alternate allele - 2nd most frequent non-ref allele
|
// Find alternate allele - 2nd most frequent non-ref allele
|
||||||
// (maybe we should check for ties and eval both or check most common including quality scores)
|
// (maybe we should check for ties and eval both or check most common including quality scores)
|
||||||
int altnum = -1; // start with first base numerical identity (0,1,2,3)
|
int altnum = -1; // start with first base numerical identity (0,1,2,3)
|
||||||
double altcount = -1; // start with first base count
|
double altcount = -1; // start with first base count
|
||||||
|
int refnum = nuc2num[ref];
|
||||||
|
|
||||||
for (int b=0; b<4; b++) {
|
for (int b=0; b<4; b++) {
|
||||||
if ((b != refnum) && (base_counts[b] > altcount)) {
|
if ((b != refnum) && (base_counts[b] > altcount)) {
|
||||||
|
|
@ -61,7 +40,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
||||||
}
|
}
|
||||||
assert(altnum != -1);
|
assert(altnum != -1);
|
||||||
|
|
||||||
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases, quals, refnum, altnum, base_string.length());
|
AlleleFrequencyEstimate alleleFreq = AlleleFrequencyEstimator(context.getLocation().toString(), N, bases.getBytes(), quals, refnum, altnum, bases.length());
|
||||||
|
|
||||||
// Print dbSNP data if its there
|
// Print dbSNP data if its there
|
||||||
if (false) {
|
if (false) {
|
||||||
|
|
@ -76,6 +55,47 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
||||||
return alleleFreq;
|
return alleleFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static public String getBases (LocusContext context) {
|
||||||
|
// Convert bases to CharArray
|
||||||
|
int numReads = context.getReads().size(); //numReads();
|
||||||
|
//byte[] bases = new byte[numReads];
|
||||||
|
String base_string = "";
|
||||||
|
//int refnum = nuc2num[ref];
|
||||||
|
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
for (int i =0; i < numReads; i++ ) {
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
int offset = offsets.get(i);
|
||||||
|
|
||||||
|
//bases[i] = read.getReadBases()[offset];
|
||||||
|
base_string += read.getReadString().charAt(offset);
|
||||||
|
}
|
||||||
|
return base_string;
|
||||||
|
}
|
||||||
|
|
||||||
|
static public double[][] getOneBaseQuals (LocusContext context) {
|
||||||
|
int numReads = context.getReads().size(); //numReads();
|
||||||
|
double[][] quals = new double[numReads][4];
|
||||||
|
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
for (int i =0; i < numReads; i++ ) {
|
||||||
|
SAMRecord read = reads.get(i);
|
||||||
|
int offset = offsets.get(i);
|
||||||
|
int callednum = nuc2num[read.getReadBases()[offset]];
|
||||||
|
quals[i][callednum] = 1 - Math.pow(10.0, (double)read.getBaseQualities()[offset] / -10.0);
|
||||||
|
|
||||||
|
// Set all nonref qual scores to their share of the remaining probality not "used" by the reference base's qual
|
||||||
|
double nonref_quals = (1.0 - quals[i][callednum]) / 3;
|
||||||
|
for (int b=0; b<4; b++)
|
||||||
|
if (b != callednum)
|
||||||
|
quals[i][b] = nonref_quals;
|
||||||
|
}
|
||||||
|
|
||||||
|
return quals;
|
||||||
|
}
|
||||||
|
|
||||||
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
|
public AlleleFrequencyEstimate AlleleFrequencyEstimator(String location, int N, byte[] bases, double[][] quals, int refnum, int altnum, int depth)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
@ -124,6 +144,7 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
||||||
best_pqG = pqG;
|
best_pqG = pqG;
|
||||||
best_pG = pG;
|
best_pG = pG;
|
||||||
}
|
}
|
||||||
|
//System.out.format("%.2f %.2f %5.2f %5.2f %5.2f %5.2f\n", q, qstar, pDq, pqG, pG, posterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -230,17 +251,9 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
||||||
|
|
||||||
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
|
public Integer reduce(AlleleFrequencyEstimate alleleFreq, Integer sum)
|
||||||
{
|
{
|
||||||
if (alleleFreq.LOD >= 5)
|
// Print RESULT data for confident calls
|
||||||
{
|
if ((alleleFreq.LOD >= 5) || (alleleFreq.LOD <= -5)) { System.out.print(alleleFreq.asTabularString()); }
|
||||||
System.out.print(String.format("RESULT %s %c %c %f %f %f %d\n",
|
|
||||||
alleleFreq.location,
|
|
||||||
alleleFreq.ref,
|
|
||||||
alleleFreq.alt,
|
|
||||||
alleleFreq.qhat,
|
|
||||||
alleleFreq.qstar,
|
|
||||||
alleleFreq.LOD,
|
|
||||||
alleleFreq.depth));
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -267,9 +280,10 @@ public class AlleleFrequencyWalker extends BasicLociWalker<AlleleFrequencyEstima
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void print_base_qual_matrix(double [][]quals, int numReads) {
|
static void print_base_qual_matrix(double [][]quals) {
|
||||||
// Print quals for debugging
|
// Print quals for debugging
|
||||||
System.out.println("Base quality matrix");
|
System.out.println("Base quality matrix");
|
||||||
|
int numReads = quals.length;
|
||||||
for (int b=0; b<4; b++) {
|
for (int b=0; b<4; b++) {
|
||||||
System.out.print(num2nuc[b]);
|
System.out.print(num2nuc[b]);
|
||||||
for (int i=0; i < numReads; i++){
|
for (int i=0; i < numReads; i++){
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,17 @@ public class AlleleFrequencyEstimate {
|
||||||
this.depth = depth;
|
this.depth = depth;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String asTabularString() {
|
||||||
|
return String.format("RESULT %s %c %c %f %f %f %d\n",
|
||||||
|
location,
|
||||||
|
ref,
|
||||||
|
alt,
|
||||||
|
qhat,
|
||||||
|
qstar,
|
||||||
|
LOD,
|
||||||
|
depth);
|
||||||
|
}
|
||||||
|
|
||||||
public String asString() {
|
public String asString() {
|
||||||
// Print out the called bases
|
// Print out the called bases
|
||||||
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
// Notes: switched from qhat to qstar because qhat doesn't work at n=1 (1 observed base) where having a single non-ref
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue