VariantEval genotype concordance for pools! Integration test coming soon

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2071 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-11-18 17:24:54 +00:00
parent 6fe1c337ff
commit 405c6bf2c1
5 changed files with 159 additions and 406 deletions

View File

@ -50,11 +50,17 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
@Argument(fullName="ignoreReverseReads",doc="Ignore the reverse reads at a site. Defaults to false.", required = false)
boolean ignoreReverseReads = false;
private boolean calledByAnotherWalker = false;
public void initialize() {
    // Validate the -af argument before any locus is processed.
    if (alleleFreq < 1) {
        throw new StingException("Allele frequency (-af) must be greater than or equal to one.");
    }
    // A pool size of zero means -ps was not supplied on the command line,
    // so another walker is expected to provide it via setPoolSize().
    calledByAnotherWalker = (numIndividuals == 0);
}
public Integer reduceInit() {
@ -105,7 +111,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
}
public double calculatePowerAtFrequency( AlignmentContext context, int alleles ) {
return theoreticalPower( context.numReads(), getMeanQ(context), alleles );
return theoreticalPower( context.numReads(), getMeanQ(context), alleles, lodThresh );
}
public byte getMeanQ( AlignmentContext context ) {
@ -139,7 +145,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
return String.format("%s%n", header);
}
public double theoreticalPower( int depth, byte q, int alleles ) {
public double theoreticalPower( int depth, byte q, int alleles, double lodThreshold ) {
double power;
if( depth <= 0 ) {
power = 0.0;
@ -151,7 +157,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) );
double dkterm_denom = Math.log10( 1 - p_error);
int kaccept = (int) Math.ceil( ( lodThresh - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) /
int kaccept = (int) Math.ceil( ( lodThreshold - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) /
( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) );
// we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs
@ -171,5 +177,11 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
return ((double)alleles)/(2*numIndividuals);
}
public void setPoolSize(int poolSize) {
    // Guard: only legal when this walker was invoked by another walker
    // (signalled at initialize() time by numIndividuals == 0).
    if (!calledByAnotherWalker) {
        throw new StingException("This method should only be accessible by calling it from another walker.");
    }
    numIndividuals = poolSize;
}
}

View File

@ -22,6 +22,8 @@ public class ConcordanceTruthTable {
private static final int TRUE_NEGATIVE = 1;
private static final int FALSE_POSITIVE = 2;
private static final int FALSE_NEGATIVE = 3;
private static final int VARIANT = 1;
private static final String[] POOL_HEADERS = {"TRUE_POSITIVE","TRUE_NEGATIVE","FALSE_POSITIVE","FALSE_NEGATIVE"};
private static final int REF = 0;
private static final int VAR_HET = 1;
@ -38,6 +40,7 @@ public class ConcordanceTruthTable {
private int[] truth_totals;
private int[] calls_totals;
public ConcordanceTruthTable(String name) {
// there's a specific sample associated with this truth table
this.name = name;
@ -54,9 +57,30 @@ public class ConcordanceTruthTable {
}
}
public ConcordanceTruthTable() {
public ConcordanceTruthTable(int nSamples) {
    // Pooled-mode constructor: no specific sample is associated with this
    // truth table, so everything is aggregated under one name.
    singleSampleMode = false;
    name = "pooled_concordance";
    // Java zero-initializes int arrays, so the explicit fill loops that were
    // here previously were redundant and have been removed.
    truth_totals = new int[4];
    calls_totals = new int[4];
    initializeFrequencyTable(nSamples);
}
private void initializeFrequencyTable( int numChips ) {
    // One row per possible pooled alternate-allele count (two chromosomes per
    // chip), one column per error type (TP/TN/FP/FN). Java zero-initializes
    // new int arrays, so no explicit fill loop is needed; the commented-out
    // debug prints were also removed.
    table = new int[numChips * 2][4];
}
public void addEntry(List<Pair<Genotype, Genotype>> chipEvals, Variation eval, char ref) {
@ -77,6 +101,21 @@ public class ConcordanceTruthTable {
//System.out.printf("TEST: %d/%d %s vs. %s%n", truthIndex, callIndex, chip, eval);
addGenotypeEntry(truthType, callType);
}
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
// first we need to expand our tables to include frequency information
Pair<Genotype,Integer> poolVariant = getPooledAlleleFrequency(chipEvals, ref);
int truthType = getGenotype(poolVariant.getFirst(),ref); // convenience method; now to interpret
if (truthType == VAR_HET || truthType == VAR_HOM) {
truthType = VARIANT;
}
int callType = getCallIndex(eval,ref);
addFrequencyEntry( truthType,callType,poolVariant.getSecond() );
}
// TODO -- implement me for pooled mode with frequency stats
@ -85,6 +124,69 @@ public class ConcordanceTruthTable {
// TODO -- Indexes like TRUE_POSITIVE are defined above for you
}
public Pair<Genotype,Integer> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
    // NOTE: this method explicitly relies on the assumption that tri-allelic
    // sites do not really exist; if one does occur, the site will be marked
    // with an 'N' in the reference and we never get here.
    Genotype variant = null;
    int frequency = 0;
    if (chips == null) {
        return new Pair<Genotype, Integer>(null, 0);
    }
    for (Pair<Genotype, Genotype> chip : chips) {
        Genotype g = chip.getFirst();
        if (g == null || !g.isVariant(ref)) {
            continue;
        }
        // A het genotype contributes one alternate allele, a hom-var two.
        frequency += g.isHet() ? 1 : 2;
        // Remember the (last-seen) variant genotype to return alongside the count.
        variant = g;
    }
    return new Pair<Genotype, Integer>(variant, frequency);
}
private void addFrequencyEntry ( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
    // Marginal totals are recorded regardless of how the site classifies.
    calls_totals[callIndex]++;
    truth_totals[truthIndex]++;
    // Classify the (truth, call) pair and bin the result by the true
    // supporting-allele count. Any combination not matched below is a wonky
    // site of some kind and leaves the table untouched.
    if (truthIndex == REF) {
        if (callIndex == REF) {
            // True negative. Sanity note: [*][TRUE_NEGATIVE] should never be
            // incremented for * > 0.
            table[numTrueSupportingAlleles][TRUE_NEGATIVE]++;
        } else if (callIndex == VARIANT) {
            // False positive.
            table[numTrueSupportingAlleles][FALSE_POSITIVE]++;
        }
    } else if (truthIndex == VARIANT) {
        if (callIndex == NO_CALL || callIndex == REF) {
            // False negative.
            table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
        } else if (callIndex == VARIANT) {
            // True positive.
            table[numTrueSupportingAlleles][TRUE_POSITIVE]++;
        }
    }
}
private static int getCallIndex(Variation eval, char ref) {
    // Map an eval record to one of three call categories: no call at all,
    // a non-SNP (treated as reference), or a SNP (variant) call.
    if (eval == null) {
        return NO_CALL;
    }
    return eval.isSNP() ? VARIANT : REF;
}
private static int getGenotype(Genotype g, char ref) {
int type;
@ -116,6 +218,20 @@ public class ConcordanceTruthTable {
private void addFrequencyStats(List<String> s) {
    // Emit the pooled, frequency-binned concordance report: a header pair
    // followed by one row per (allele frequency, error type) cell.
    s.add(String.format("name %s", name));
    s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
    for (int af = 0; af < table.length; af++) {
        for (int errorIndex = 0; errorIndex < 4; errorIndex++) {
            s.add(String.format("%f %s %d %s %s ",
                    ((double) af) / table.length,
                    POOL_HEADERS[errorIndex],
                    table[af][errorIndex],
                    percentOfTotal(table, af, errorIndex),
                    marginalPercent(table[af], errorIndex)));
        }
    }
}
@ -196,4 +312,24 @@ public class ConcordanceTruthTable {
sb.append("%");
return sb.toString();
}
private static String percentOfTotal(int [][] errorMatrix, int alleleFrequency, int errorIndex ) {
    // Express one cell of the matrix as a percentage of the grand total
    // over every (frequency, error-type) cell.
    int total = 0;
    for (int[] row : errorMatrix) {
        for (int j = 0; j < 4; j++) {
            total += row[j];
        }
    }
    return cellPercent(errorMatrix[alleleFrequency][errorIndex], total);
}
private static String marginalPercent ( int[] errors, int errorIndex ) {
    // Express one error type as a percentage of all calls made at this
    // allele frequency (the row's marginal total).
    int total = 0;
    for (int j = 0; j < 4; j++) {
        total += errors[j];
    }
    return cellPercent(errors[errorIndex], total);
}
}

View File

@ -30,7 +30,7 @@ public class PooledConcordance extends ChipConcordance implements PoolAnalysis {
protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
// there is only one truth table - the pool-wide one
ConcordanceTruthTable[] tables = new ConcordanceTruthTable[1];
tables[0] = new ConcordanceTruthTable();
tables[0] = new ConcordanceTruthTable(rodNames.size());
for ( String name : rodNames ) {
sampleToArrayIndex.put(name, 0);

View File

@ -1,396 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.genotype.Variation;
import java.io.*;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Sep 9, 2009
* Time: 4:45:21 PM
* To change this template use File | Settings | File Templates.
*/
public class PooledGenotypeConcordance extends BasicVariantAnalysis implements PoolAnalysis, GenotypeAnalysis {
    /** Frequency-aware concordance table; null when no pool file was given. */
    private PooledConcordanceTable table;
    /** Hapmap sample names read from the pool file, one per line. */
    private String[] hapmapNames;

    /**
     * @param pathToPoolFile path to a file listing one hapmap sample name per
     *        line, or null to disable this analysis.
     * @throws StingException if the file does not exist or cannot be read.
     */
    public PooledGenotypeConcordance( String pathToPoolFile ) {
        super("Pooled_Genotype_Concordance");
        if( pathToPoolFile == null ) {
            table = null;
            hapmapNames = null;
        } else {
            generateNameTableFromFile( pathToPoolFile );
            table = new PooledConcordanceTable( hapmapNames.length );
        }
    }

    /**
     * Updates the concordance table with one locus; returns a short
     * description of "interesting" sites (plus the locus), or null.
     */
    public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
        // Fix: guard against being constructed with a null pool file, in which
        // case there is no table and this method previously threw an NPE.
        if ( table == null ) {
            return null;
        }
        String s = table.updateTable(tracker, ref, eval, hapmapNames);
        if ( s == null ) {
            return null;
        } else {
            return s + " " + context.getLocation().toString();
        }
    }

    /** Final report lines; empty when the analysis was disabled. */
    public List<String> done() {
        // Same guard as update(): no pool file means nothing to report.
        if ( table == null ) {
            return new LinkedList<String>();
        }
        return table.standardOutput();
    }

    // private methods for reading in names from a file

    /** Reads the pool file into hapmapNames, one name per line. */
    private void generateNameTableFromFile(String file) {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new FileReader(file));
        } catch( FileNotFoundException e) {
            String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
            throw new StingException(errMsg, e);
        }
        LinkedList<String> nameList = new LinkedList<String>();
        try {
            while(continueReading(reader)) {
                String line = readLine(reader);
                nameList.add(line);
            }
        } finally {
            // Fix: the reader was previously never closed (resource leak).
            try {
                reader.close();
            } catch (IOException e) {
                // best-effort close; nothing useful to do here
            }
        }
        hapmapNames = nameList.toArray(new String[nameList.size()]);
    }

    /** True while the reader reports more data; IO errors end the loop. */
    private boolean continueReading(BufferedReader reader) {
        boolean continueReading;
        try {
            continueReading = reader.ready();
        } catch(IOException e) {
            continueReading = false;
        }
        return continueReading;
    }

    /** Reads one line, converting an unexpected IOException into a StingException. */
    private String readLine(BufferedReader reader) {
        String line;
        try {
            line = reader.readLine();
        } catch( IOException e) {
            String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
            throw new StingException(errMsg,e);
        }
        return line;
    }
}
class PooledConcordanceTable {
    // Call-type (column) indices.
    private final int CALL_INDECES = 6;
    private final int TRUTH_INDECES = 4;
    private final int NO_CALL = 5;
    private final int REF_CALL = 0; // synonym
    private final int VARIANT_CALL_NONHAPMAP = 1;
    private final int VARIANT_CALL_MATCH = 2;
    private final int VARIANT_CALL = 2; // re-using index
    private final int VARIANT_CALL_MISMATCH = 3;
    private final int VARIANT_CALL_UNKNOWN = 4;
    // Truth-type (row) indices.
    private final int NO_TRUTH_DATA = 0;
    private final int TRUTH_REF = 1;
    private final int TRUTH_VAR = 2;
    private final int TRUTH_UNKNOWN = 3;

    // Frequency-agnostic truth-by-call counts.
    private int[][] table;
    // Same counts binned by hapmap alternate-allele frequency index.
    private int[][][] tableByHMFrequency;
    // Number of variant calls made on reference 'N' bases (tracked separately).
    private int variantCallsAtRefN;
    // Number of individuals in the pool.
    private int poolSize;

    public PooledConcordanceTable(int poolSize) {
        this.poolSize = poolSize;
        // Java zero-initializes int arrays, so the explicit fill loops that
        // were here previously were redundant and have been removed.
        table = new int[TRUTH_INDECES][CALL_INDECES];
        tableByHMFrequency = new int[calculateNumFrequencyIndeces(poolSize)][TRUTH_INDECES][CALL_INDECES];
        variantCallsAtRefN = 0;
    }

    /**
     * Updates both tables with one locus and returns a short description of
     * "interesting" sites (false negatives/positives, mismatching calls),
     * or null for everything else.
     */
    public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
        int truthIndex, callIndex, frequencyIndex = -1;
        List<Variation> chips = getChips(names, tracker);
        if ( ref == 'N' || ref == 'n' ) {
            // Calls on reference Ns are counted separately and excluded from
            // the concordance calculation.
            variantCallsAtRefN += ( eval == null ) ? 0 : 1;
            truthIndex = NO_TRUTH_DATA;
            callIndex = NO_CALL;
        } else if( chips.isEmpty() ) {
            // No hapmap data at this site.
            truthIndex = NO_TRUTH_DATA;
            if ( eval == null ) {
                callIndex = NO_CALL;
            } else {
                callIndex = ( pooledCallIsRef( eval, ref ) ) ? REF_CALL : VARIANT_CALL_NONHAPMAP;
            }
        } else {
            frequencyIndex = calcVariantFrequencyIndex(ref, chips);
            if ( freqIndexToFrequency( frequencyIndex ) != 0 ) {
                truthIndex = TRUTH_VAR;
            } else if ( chips.size() == poolSize ) {
                truthIndex = TRUTH_REF;
            } else {
                // All seen chips were ref, but not every pool member had data.
                truthIndex = TRUTH_UNKNOWN;
            }
            if ( eval == null ) {
                callIndex = NO_CALL;
            } else {
                if ( pooledCallIsRef( eval, ref ) ) {
                    callIndex = REF_CALL;
                } else if ( truthIndex == TRUTH_REF ) {
                    callIndex = VARIANT_CALL;
                } else if (truthIndex == TRUTH_UNKNOWN) {
                    callIndex = VARIANT_CALL_UNKNOWN;
                } else if ( mismatchingCalls( eval, chips, ref ) ) {
                    callIndex = VARIANT_CALL_MISMATCH;
                } else {
                    callIndex = VARIANT_CALL_MATCH;
                }
            }
            // note that this updates only on HAPMAP sites. This is good.
            tableByHMFrequency[frequencyIndex][truthIndex][callIndex] ++;
        }
        table[truthIndex][callIndex] ++;
        String interest;
        if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
            interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
        } else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
            interest = "False_Positive";
        } else if ( callIndex == VARIANT_CALL_MISMATCH ) {
            interest = "Mismatching_call";
        } else {
            interest = null;
        }
        return interest;
    }

    /** Gathers the non-null hapmap chip records for the named samples. */
    public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
        LinkedList<Variation> chips = new LinkedList<Variation>();
        for ( String name : names ) {
            Variation chip = (Variation) tracker.lookup(name,null);
            if ( chip != null ) {
                chips.add(chip);
            }
        }
        return chips;
    }

    /** True if the call's two alleles are both the reference base. */
    public boolean pooledCallIsRef(Variation eval, char ref) {
        // code broken out for easy alteration when we start using pool-specific variations
        return Utils.join("",eval.getAlleleList()).equalsIgnoreCase((Utils.dupString(ref,2)));
    }

    /** Number of frequency bins: allele counts 0 .. 2*poolSize, rounded out. */
    public int calculateNumFrequencyIndeces(int poolSize) {
        // code broken out for easy alteration when we start using pool-specific variations
        return 2*(poolSize+1);
    }

    public int calcVariantFrequencyIndex( char ref, List<Variation> evals ) {
        return frequencyToFrequencyIndex( calcVariantFrequency( evals, ref ) );
    }

    /** Frequency -> bin index; currently the identity (truncated to int). */
    public int frequencyToFrequencyIndex( double frequency ) {
        // code broken out for easy alteration when we start using pool-specific variations
        return (int) frequency;
    }

    /**
     * Sums the alternate-allele count over the chips, rejecting tri-allelic
     * sites (where the chips disagree on the alternate base).
     */
    public double calcVariantFrequency( List<Variation> evals, char ref ) {
        // code broken out for easy alteration when we start using pool-specific variations
        Variation firstEval = evals.get(0);
        double alternateFrequency = 0.0;
        for ( Variation eval : evals ) {
            if ( mismatchingCalls(firstEval, eval, ref) ) {
                // todo -- make this not a StingException but go to the log
                throw new StingException("Tri-Allelic Position "+Utils.join("",eval.getAlleleList())+"/"+Utils.join("",firstEval.getAlleleList()) + " Ref: "+ ref + " not supported");
            } else {
                alternateFrequency += calledVariantFrequency(eval,ref);
            }
        }
        return alternateFrequency;
    }

    /**
     * Compares var1 against the first non-reference chip in the list. The
     * caller guarantees at least one non-reference chip is present.
     */
    public boolean mismatchingCalls(Variation var1, List<Variation> vars, char ref) {
        // Typed iterator (was a raw ListIterator with casts).
        ListIterator<Variation> varIter = vars.listIterator();
        try {
            while( pooledCallIsRef( varIter.next(), ref ) ) {
                // keep scanning for the first non-ref chip
            }
        } catch (NoSuchElementException e) {
            String errMsg = "Comparison data given to mismatchingCalls(Variation, List<Variation>, char) was entirely reference. This is a bug and should never happen.";
            throw new StingException(errMsg, e);
        }
        return mismatchingCalls( var1, varIter.previous(), ref);
    }

    /** True if eval's alternate allele(s) disagree with the chip's. */
    public boolean mismatchingCalls(Variation eval, Variation chip, char ref) {
        // eval and chip guaranteed to be non-null
        char chipF = Utils.stringToChar(chip.getAlleleList().get(0));
        char chipS = Utils.stringToChar(chip.getAlleleList().get(1));
        char evalF = Utils.stringToChar(eval.getAlleleList().get(0));
        char evalS = Utils.stringToChar(eval.getAlleleList().get(1));
        boolean mismatch;
        if (chipF == ref) {
            if ( chipS == ref ) {
                mismatch = false; // no mismatch against hom ref
            } else {
                mismatch = ( evalF != chipS && evalS != chipS ); // test against het nonref
            }
        } else if ( chipS == ref ) { // het nonref (somehow made nonref|ref rather than ref|nonref)
            mismatch = ( evalF != chipF && evalS != chipF ); // test against het nonref
        } else { // chip is homozygous nonreference
            if( evalF == ref ) {
                mismatch = ( evalS != chipF && evalS != chipS ); // make sure the nonref call of eval matches one of the chip alleles
            } else if ( evalS == ref ) {
                mismatch = ( evalF != chipF && evalF != chipS ); // make sure the nonref call of eval matches one of the chip alleles
            } else {
                // both are hom nonref
                mismatch = ( evalF != chipF || evalS != chipS); // could be multiallelic calls
            }
        }
        return mismatch;
    }

    /** Alternate-allele count of one diploid call: 0 (hom ref), 1 (het), or 2. */
    public double calledVariantFrequency( Variation var, char ref ) {
        // code broken out for easy alteration when we start using pool-specific variations
        String varStr = Utils.join("",var.getAlleleList());
        double freq;
        if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) {
            freq = (double) 2;
        } else if ( varStr.charAt(0) == ref && varStr.charAt(1) == ref ) {
            freq = (double) 0;
        } else {
            freq = (double) 1;
        }
        return freq;
    }

    /** Bin index -> frequency; currently the identity. */
    public double freqIndexToFrequency( int freqIndex ) {
        // code broken out for easy alteration when we start using pool-specific variations
        return (double) freqIndex;
    }

    /** Builds the full human-readable report. */
    public List<String> standardOutput() {
        LinkedList<String> out = new LinkedList<String>();
        int nSNPCallSites = 0;
        int nHapmapSites = 0;
        int nHapmapSNPSites = 0;
        int nFullHapmapRefSites = 0;
        int nUnknownHapmapSites = 0;
        for( int truthIndex = 0; truthIndex < TRUTH_INDECES; truthIndex ++ ) {
            nSNPCallSites += table[truthIndex][VARIANT_CALL_MATCH] + table[truthIndex][VARIANT_CALL_MISMATCH] + table[truthIndex][VARIANT_CALL_NONHAPMAP];
            nSNPCallSites += table[truthIndex][VARIANT_CALL_UNKNOWN];
        }
        for ( int callIndex = 0; callIndex < CALL_INDECES; callIndex ++ ) {
            nHapmapSites += table[TRUTH_REF][callIndex] + table[TRUTH_VAR][callIndex] + table[TRUTH_UNKNOWN][callIndex];
            nUnknownHapmapSites += table[TRUTH_UNKNOWN][callIndex];
            nHapmapSNPSites += table[TRUTH_VAR][callIndex];
            nFullHapmapRefSites += table[TRUTH_REF][callIndex];
        }
        int nRefsCalledCorrectly = table[TRUTH_REF][REF_CALL];
        int nHapmapRefsNotCalled = table[TRUTH_REF][NO_CALL];
        int nRefsCalledAsSNP = table[TRUTH_REF][VARIANT_CALL];
        int nSNPsCalledCorrectly = table[TRUTH_VAR][VARIANT_CALL_MATCH];
        int nSNPsCalledIncorrectly = table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
        int nSNPsCalledAsRef = table[TRUTH_VAR][REF_CALL];
        int nHapmapSNPsNotCalled = table[TRUTH_VAR][NO_CALL];
        int nSNPsOnHapmapSNP = table[TRUTH_VAR][VARIANT_CALL_MATCH] + table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
        int nSNPsAtNonHapmap = table[NO_TRUTH_DATA][VARIANT_CALL_NONHAPMAP];
        int nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals = table[TRUTH_UNKNOWN][VARIANT_CALL_UNKNOWN];
        out.add(String.format("Total Number of SNP Calls:\t\t%d", nSNPCallSites));
        out.add(String.format("Total SNP calls on non-Hapmap sites\t\t%d", nSNPsAtNonHapmap));
        out.add(String.format("Number of Hapmap Sites:\t\t%d", nHapmapSites));
        out.add("Data on Hapmap Reference Sites");
        out.add(String.format("\tSites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
        out.add(String.format("\t\tReference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
        out.add(String.format("\t\tReference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
        out.add(String.format("\t\tReference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
        out.add(String.format("\tSites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
        out.add(String.format("\t\tPutative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
        out.add(String.format("\t\tPutative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
        out.add(String.format("\t\tPutative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
        out.add("Data on Hapmap Variant Sites");
        out.add(String.format("\tNumber of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
        out.add(String.format("\t\tSNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
        out.add(String.format("\t\tSNP sites not confidently called:\t%d\t(%d%%)", nHapmapSNPsNotCalled, divideToPercent(nHapmapSNPsNotCalled, nHapmapSNPSites)));
        out.add(String.format("\tSNP calls on Hapmap SNP Sites:\t\t%d", nSNPsOnHapmapSNP));
        out.add(String.format("\t\tSNP sites correctly called SNP:\t%d\t(%d%%)", nSNPsCalledCorrectly, divideToPercent(nSNPsCalledCorrectly, nSNPsOnHapmapSNP)));
        out.add(String.format("\t\tSNP sites called a different base:\t%d\t(%d%%)", nSNPsCalledIncorrectly, divideToPercent(nSNPsCalledIncorrectly, nSNPsOnHapmapSNP)));
        out.add(String.format("Calls on reference N:\t\t%d", variantCallsAtRefN));
        out.add("----------------------- Output By Allele Frequency ------------------------");
        out.add("");
        out.add("FREQUENCY \tFALSE_POSITIVES\tTRUE_NEGATIVES\tFALSE_NEGATIVES\tTRUE_POSITIVES\tMISCALLS\tNO_CALLS\tFALSE_NEGATIVE_RATE\tFALSE_POSITIVE_RATE");
        // Fix: iterate up to AND INCLUDING the largest frequency index that
        // actually holds data ('<' used to stop one row short; combined with
        // the old bug in getLargestOutputAlleleFrequencyIndex() the bounds
        // were effectively arbitrary). -1 means no hapmap data: print nothing.
        int maxFreqIndex = getLargestOutputAlleleFrequencyIndex();
        for( int i = 0; i <= maxFreqIndex; i ++) {
            double freq = freqIndexToFrequency(i);
            int nRefsCalledAsSNPFreq = tableByHMFrequency[i][TRUTH_REF][VARIANT_CALL];
            int nRefsCalledCorrectFreq = tableByHMFrequency[i][TRUTH_REF][REF_CALL];
            int nSNPsCalledIncorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MISMATCH];
            int nSNPsCalledCorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MATCH];
            int nSNPsCalledAsRefFreq = tableByHMFrequency[i][TRUTH_VAR][REF_CALL];
            int nNoCallsAtFreq = tableByHMFrequency[i][TRUTH_VAR][NO_CALL] + tableByHMFrequency[i][TRUTH_REF][NO_CALL];
            int nSNPsAtFreq = 0;
            for( int j = 0; j < CALL_INDECES; j ++ ) {
                nSNPsAtFreq += tableByHMFrequency[i][TRUTH_VAR][j];
            }
            int nSNPsNoCallFreq = tableByHMFrequency[i][TRUTH_VAR][NO_CALL];
            // Fix: guard the rate denominators so empty bins print 0 instead of NaN.
            int nRefSitesAtFreq = nRefsCalledAsSNPFreq + nRefsCalledCorrectFreq + tableByHMFrequency[i][TRUTH_REF][NO_CALL];
            double fnrate = ( nSNPsAtFreq == 0 ) ? 0.0 : ((double) (nSNPsNoCallFreq + nSNPsCalledAsRefFreq) / (nSNPsAtFreq));
            double fprate = ( nRefSitesAtFreq == 0 ) ? 0.0 : ((double) nRefsCalledAsSNPFreq / nRefSitesAtFreq);
            out.add(String.format("%f\t%d\t\t%d\t\t%d\t\t\t%d\t%d\t\t%d\t\t%f\t\t\t%f",
                    freq, nRefsCalledAsSNPFreq, nRefsCalledCorrectFreq, nSNPsCalledAsRefFreq, nSNPsCalledCorrectlyFreq, nSNPsCalledIncorrectlyFreq, nNoCallsAtFreq, fnrate, fprate));
        }
        return out;
    }

    /**
     * Returns the largest frequency index at which any hapmap site was
     * recorded, or -1 if there are none.
     * Fix: the per-frequency counts previously summed the frequency-agnostic
     * 'table', ignoring the loop index j entirely, so the result was the last
     * index overall whenever any hapmap data existed at all.
     */
    public int getLargestOutputAlleleFrequencyIndex() {
        int freqIndex = -1;
        for ( int j = 0; j < calculateNumFrequencyIndeces(poolSize); j ++ ) {
            int nHapmapSitesAtFreq = 0;
            for ( int i = 0; i < CALL_INDECES; i ++) {
                nHapmapSitesAtFreq += tableByHMFrequency[j][TRUTH_REF][i]
                        + tableByHMFrequency[j][TRUTH_VAR][i]
                        + tableByHMFrequency[j][TRUTH_UNKNOWN][i];
            }
            if ( nHapmapSitesAtFreq > 0 ) {
                freqIndex = j;
            }
        }
        return freqIndex;
    }

    /** Integer percentage (floored); 0 when the denominator is zero. */
    public int divideToPercent( int numerator, int denominator ) {
        // Fix: a zero denominator used to produce Infinity cast to int.
        if ( denominator == 0 ) {
            return 0;
        }
        return (int) Math.floor(100.0*((double) numerator)/ denominator );
    }
}

View File

@ -130,15 +130,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
//
analyses.add(new ValidationDataAnalysis());
// todo -- chris remove?
//analyses.add(new PooledGenotypeConcordance(samplesFile));
analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage(knownSNPDBName));
//analyses.add(new PooledFrequencyAnalysis(numPeopleInPool,knownSNPDBName));
if ( samplesFile == null )
analyses.add(new GenotypeConcordance(genotypeChipName, false));
else
analyses.add(new GenotypeConcordance(samplesFile, true));
else if ( numPeopleInPool < 1)
analyses.add(new GenotypeConcordance(samplesFile, true));
else {
analyses.add(new PooledConcordance(samplesFile,true));
}
analyses.add(new TransitionTranversionAnalysis());
analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));