Changes to PooledGenotypeConcordance:

Additional output & better output formatting. It has now undergone a good five hours of testing; and for pools of size 1 outputs exactly the same statistics as GenotypeConcordance (when GenotypeConcordance is modified to do nothing on reference='N'); and for pools of many sizes outputs close to the expected (by genetics) statistics. Looks like this is working properly.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1642 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-09-16 21:45:01 +00:00
parent 386a6442ba
commit 4ad46590a3
1 changed files with 47 additions and 18 deletions

View File

@ -154,7 +154,7 @@ class PooledConcordanceTable {
} else {
if ( pooledCallIsRef( eval, ref ) ) {
callIndex = REF_CALL;
} else if (truthIndex == TRUTH_REF ) {
} else if ( truthIndex == TRUTH_REF ) {
callIndex = VARIANT_CALL;
} else if (truthIndex == TRUTH_UNKNOWN) {
callIndex = VARIANT_CALL_UNKNOWN;
@ -308,24 +308,53 @@ class PooledConcordanceTable {
int nSNPsAtNonHapmap = table[NO_TRUTH_DATA][VARIANT_CALL_NONHAPMAP];
int nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals = table[TRUTH_UNKNOWN][VARIANT_CALL_UNKNOWN];
out.add(String.format("Total Number of SNP Calls: %d", nSNPCallSites));
out.add(String.format("Total SNP calls on non-Hapmap sites %d", nSNPsAtNonHapmap));
out.add(String.format("Number of Hapmap Sites: %d", nHapmapSites));
out.add("Data on Hapmap Reference Sites");
out.add(String.format("\tSites where all Hapmap chips were ref: %d", nFullHapmapRefSites));
out.add(String.format("\t\tReference sites correctly called: %d (%f)", nRefsCalledCorrectly, ((double)nRefsCalledCorrectly/nFullHapmapRefSites)));
out.add(String.format("\t\tReference sites called as variant: %d (%f)", nRefsCalledAsSNP, ((double)nRefsCalledAsSNP/nFullHapmapRefSites)));
out.add(String.format("\tSites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
out.add(String.format("\t\tPutative reference sites called ref: %d (%f)", table[TRUTH_UNKNOWN][REF_CALL], ((double)table[TRUTH_UNKNOWN][REF_CALL]/nUnknownHapmapSites)));
out.add(String.format("\t\tPutative reference sites called SNP: %d (%f)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, ((double)nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals/nUnknownHapmapSites)));
out.add("Data on Hapmap Variant Sites");
out.add(String.format("\tNumber of Hapmap SNP Sites: %d", nHapmapSNPSites));
out.add(String.format("\t\tSNP sites incorrectly called ref: %d (%f)", nSNPsCalledAsRef, ((double)nSNPsCalledAsRef/nHapmapSNPSites)));
out.add(String.format("\tSNP calls on Hapmap SNP Sites: %d", nSNPsOnHapmapSNP));
out.add(String.format("\t\tSNP sites correctly called SNP: %d (%f)", nSNPsCalledCorrectly, ((double)nSNPsCalledCorrectly/nSNPsOnHapmapSNP)));
out.add(String.format("\t\tSNP sites called a different base: %d (%f)", nSNPsCalledIncorrectly, ((double)nSNPsCalledIncorrectly/nSNPsOnHapmapSNP)));
out.add(String.format("Calls on reference N: %d", variantCallsAtRefN));
out.add(String.format("| Total Number of SNP Calls: %d", nSNPCallSites));
out.add(String.format("| Total SNP calls on non-Hapmap sites %d", nSNPsAtNonHapmap));
out.add(String.format("| Number of Hapmap Sites: %d", nHapmapSites));
out.add("| Data on Hapmap Reference Sites");
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: %d", nFullHapmapRefSites));
out.add(String.format("| \t\t- Reference sites correctly called: %d (%f)", nRefsCalledCorrectly, ((double)nRefsCalledCorrectly/nFullHapmapRefSites)));
out.add(String.format("| \t\t- Reference sites called as variant: %d (%f)", nRefsCalledAsSNP, ((double)nRefsCalledAsSNP/nFullHapmapRefSites)));
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
out.add(String.format("| \t\t- Putative reference sites called ref: %d (%f)", table[TRUTH_UNKNOWN][REF_CALL], ((double)table[TRUTH_UNKNOWN][REF_CALL]/nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites called SNP: %d (%f)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, ((double)nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals/nUnknownHapmapSites)));
out.add("| Data on Hapmap Variant Sites");
out.add(String.format("| \t+ Number of Hapmap SNP Sites: %d", nHapmapSNPSites));
out.add(String.format("| \t\t- SNP sites incorrectly called ref: %d (%f)", nSNPsCalledAsRef, ((double)nSNPsCalledAsRef/nHapmapSNPSites)));
out.add(String.format("| \t+ SNP calls on Hapmap SNP Sites: %d", nSNPsOnHapmapSNP));
out.add(String.format("| \t\t- SNP sites correctly called SNP: %d (%f)", nSNPsCalledCorrectly, ((double)nSNPsCalledCorrectly/nSNPsOnHapmapSNP)));
out.add(String.format("| \t\t- SNP sites called a different base: %d (%f)", nSNPsCalledIncorrectly, ((double)nSNPsCalledIncorrectly/nSNPsOnHapmapSNP)));
out.add(String.format("| Calls on reference N: %d", variantCallsAtRefN));
out.add("----------------------- Output By Allele Frequency ------------------------");
out.add("");
out.add("FREQUENCY \tFALSE POSITIVES\tTRUE NEGATIVES\tFALSE NEGATIVES\tTRUE POSITIVES\tMISCALLS\tFALSE NEGATIVE RATE\tFALSE POSITIVE RATE");
for( int i = 0; i < getLargestOutputAlleleFrequencyIndex(); i ++) {
double freq = freqIndexToFrequency(i);
int nRefsCalledAsSNPFreq = tableByHMFrequency[i][TRUTH_REF][VARIANT_CALL];
int nRefsCalledCorrectFreq = tableByHMFrequency[i][TRUTH_REF][REF_CALL];
int nSNPsCalledIncorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MISMATCH];
int nSNPsCalledCorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MATCH];
int nSNPsCalledAsRefFreq = tableByHMFrequency[i][TRUTH_VAR][REF_CALL];
int nSnpsCalledAtFreq = nSNPsCalledIncorrectlyFreq + nSNPsCalledCorrectlyFreq + nSNPsCalledAsRefFreq;
double fnrate = ((double) nSNPsCalledAsRefFreq / nSnpsCalledAtFreq);
double fprate = ((double) nRefsCalledAsSNPFreq / (nRefsCalledAsSNPFreq + nRefsCalledCorrectFreq));
out.add(String.format("%f\t%d\t\t%d\t\t%d\t\t\t%d\t%d\t\t%f\t\t\t%f",
freq, nRefsCalledAsSNPFreq, nRefsCalledCorrectFreq, nSNPsCalledAsRefFreq, nSNPsCalledCorrectlyFreq, nSNPsCalledIncorrectlyFreq, fnrate, fprate));
}
return out;
}
public int getLargestOutputAlleleFrequencyIndex() {
int nHapmapSitesAtFreq = 1;
int freqIndex = -1;
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {
freqIndex ++;
for ( int i = 0; i < CALL_INDECES; i ++) {
nHapmapSitesAtFreq += table[TRUTH_REF][i] + table[TRUTH_VAR][i] + table[TRUTH_UNKNOWN][i];
}
}
return freqIndex;
}
}