VariantEval genotype concordance for pools! Integration test coming soon
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2071 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6fe1c337ff
commit
405c6bf2c1
|
|
@ -50,11 +50,17 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
|
|||
@Argument(fullName="ignoreReverseReads",doc="Ignore the reverse reads at a site. Defaults to false.", required = false)
|
||||
boolean ignoreReverseReads = false;
|
||||
|
||||
private boolean calledByAnotherWalker = false;
|
||||
|
||||
public void initialize() {
|
||||
if ( alleleFreq < 1 ) {
|
||||
String err = "Allele frequency (-af) must be greater than or equal to one.";
|
||||
throw new StingException(err);
|
||||
}
|
||||
|
||||
if ( numIndividuals == 0 ) {
|
||||
calledByAnotherWalker = true;
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -105,7 +111,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
|
|||
}
|
||||
|
||||
public double calculatePowerAtFrequency( AlignmentContext context, int alleles ) {
|
||||
return theoreticalPower( context.numReads(), getMeanQ(context), alleles );
|
||||
return theoreticalPower( context.numReads(), getMeanQ(context), alleles, lodThresh );
|
||||
}
|
||||
|
||||
public byte getMeanQ( AlignmentContext context ) {
|
||||
|
|
@ -139,7 +145,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
|
|||
return String.format("%s%n", header);
|
||||
}
|
||||
|
||||
public double theoreticalPower( int depth, byte q, int alleles ) {
|
||||
public double theoreticalPower( int depth, byte q, int alleles, double lodThreshold ) {
|
||||
double power;
|
||||
if( depth <= 0 ) {
|
||||
power = 0.0;
|
||||
|
|
@ -151,7 +157,7 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
|
|||
double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) );
|
||||
double dkterm_denom = Math.log10( 1 - p_error);
|
||||
|
||||
int kaccept = (int) Math.ceil( ( lodThresh - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) /
|
||||
int kaccept = (int) Math.ceil( ( lodThreshold - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) /
|
||||
( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) );
|
||||
|
||||
// we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs
|
||||
|
|
@ -171,5 +177,11 @@ public class PowerBelowFrequencyWalker extends LocusWalker<Integer,Integer> {
|
|||
return ((double)alleles)/(2*numIndividuals);
|
||||
}
|
||||
|
||||
|
||||
public void setPoolSize(int poolSize) {
|
||||
if ( calledByAnotherWalker ) {
|
||||
numIndividuals = poolSize;
|
||||
} else {
|
||||
throw new StingException("This method should only be accessible by calling it from another walker.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ public class ConcordanceTruthTable {
|
|||
private static final int TRUE_NEGATIVE = 1;
|
||||
private static final int FALSE_POSITIVE = 2;
|
||||
private static final int FALSE_NEGATIVE = 3;
|
||||
private static final int VARIANT = 1;
|
||||
private static final String[] POOL_HEADERS = {"TRUE_POSITIVE","TRUE_NEGATIVE","FALSE_POSITIVE","FALSE_NEGATIVE"};
|
||||
|
||||
private static final int REF = 0;
|
||||
private static final int VAR_HET = 1;
|
||||
|
|
@ -38,6 +40,7 @@ public class ConcordanceTruthTable {
|
|||
private int[] truth_totals;
|
||||
private int[] calls_totals;
|
||||
|
||||
|
||||
public ConcordanceTruthTable(String name) {
|
||||
// there's a specific sample associated with this truth table
|
||||
this.name = name;
|
||||
|
|
@ -54,9 +57,30 @@ public class ConcordanceTruthTable {
|
|||
}
|
||||
}
|
||||
|
||||
public ConcordanceTruthTable() {
|
||||
public ConcordanceTruthTable(int nSamples) {
|
||||
// there's no specific sample associated with this truth table
|
||||
singleSampleMode = false;
|
||||
name = "pooled_concordance";
|
||||
truth_totals = new int[4];
|
||||
calls_totals = new int[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
truth_totals[i] = 0;
|
||||
calls_totals[i] = 0;
|
||||
}
|
||||
|
||||
initializeFrequencyTable(nSamples);
|
||||
}
|
||||
|
||||
private void initializeFrequencyTable( int numChips ) {
|
||||
// System.out.println("Frequency Table for Pooled Concordance initialized with number of chips = "+numChips);
|
||||
table = new int[numChips*2][4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for ( int freq = 0; freq < 2*numChips; freq ++ ) {
|
||||
table[freq][i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// System.out.println("Table Size: "+table.length+" by "+table[1].length);
|
||||
}
|
||||
|
||||
public void addEntry(List<Pair<Genotype, Genotype>> chipEvals, Variation eval, char ref) {
|
||||
|
|
@ -77,6 +101,21 @@ public class ConcordanceTruthTable {
|
|||
//System.out.printf("TEST: %d/%d %s vs. %s%n", truthIndex, callIndex, chip, eval);
|
||||
addGenotypeEntry(truthType, callType);
|
||||
}
|
||||
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
|
||||
// first we need to expand our tables to include frequency information
|
||||
|
||||
Pair<Genotype,Integer> poolVariant = getPooledAlleleFrequency(chipEvals, ref);
|
||||
|
||||
int truthType = getGenotype(poolVariant.getFirst(),ref); // convenience method; now to interpret
|
||||
|
||||
if (truthType == VAR_HET || truthType == VAR_HOM) {
|
||||
truthType = VARIANT;
|
||||
}
|
||||
|
||||
int callType = getCallIndex(eval,ref);
|
||||
|
||||
addFrequencyEntry( truthType,callType,poolVariant.getSecond() );
|
||||
|
||||
}
|
||||
|
||||
// TODO -- implement me for pooled mode with frequency stats
|
||||
|
|
@ -85,6 +124,69 @@ public class ConcordanceTruthTable {
|
|||
// TODO -- Indexes like TRUE_POSITIVE are defined above for you
|
||||
}
|
||||
|
||||
public Pair<Genotype,Integer> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
|
||||
// TODO -- this is actually just a note that I wanted to appear in blue. This method explicitly uses
|
||||
// TODO --- the assumption that tri-allelic sites do not really exist, and that if they do the
|
||||
// TODO --- site will be marked as such by an 'N' in the reference, so we will not get to this point.
|
||||
Genotype variant = null;
|
||||
int frequency = 0;
|
||||
if ( chips != null ) {
|
||||
for ( Pair<Genotype,Genotype> chip : chips ) {
|
||||
Genotype c = chip.getFirst();
|
||||
if ( c != null ) {
|
||||
if ( c.isVariant(ref) ) {
|
||||
if ( c.isHet() ) {
|
||||
frequency ++;
|
||||
} else { // c is hom
|
||||
frequency += 2;
|
||||
}
|
||||
variant = c;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<Genotype, Integer>(variant, frequency);
|
||||
|
||||
}
|
||||
|
||||
private void addFrequencyEntry ( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
|
||||
calls_totals[callIndex] ++;
|
||||
truth_totals[truthIndex] ++;
|
||||
|
||||
if ( truthIndex == REF && callIndex == REF ) {
|
||||
// true negative
|
||||
table[numTrueSupportingAlleles][TRUE_NEGATIVE] ++;
|
||||
// sanity check - there should never be an entry in
|
||||
// [*][TRUE_NEGATIVE] for * > 0
|
||||
} else if ( truthIndex == REF && callIndex == VARIANT ) {
|
||||
// false positive
|
||||
table[numTrueSupportingAlleles][FALSE_POSITIVE] ++;
|
||||
} else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) {
|
||||
// false negative
|
||||
table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
|
||||
} else if ( truthIndex == VARIANT && callIndex == VARIANT ) {
|
||||
// true positive
|
||||
table[numTrueSupportingAlleles][TRUE_POSITIVE]++;
|
||||
} else {
|
||||
// something else is going on; wonky site or something. Don't do anything to the table.
|
||||
}
|
||||
}
|
||||
|
||||
private static int getCallIndex(Variation eval, char ref) {
|
||||
int index;
|
||||
|
||||
if ( eval == null ) {
|
||||
index = NO_CALL;
|
||||
} else if ( ! eval.isSNP() ) {
|
||||
index = REF;
|
||||
} else {
|
||||
index = VARIANT;
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
private static int getGenotype(Genotype g, char ref) {
|
||||
int type;
|
||||
|
||||
|
|
@ -116,6 +218,20 @@ public class ConcordanceTruthTable {
|
|||
private void addFrequencyStats(List<String> s) {
|
||||
|
||||
// TODO -- implement me for pooled mode with frequency stats
|
||||
s.add(String.format("name %s",name));
|
||||
s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
|
||||
|
||||
for ( int af = 0; af < table.length; af ++ ) {
|
||||
for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(String.format("%f ", ((double) af)/ table.length));
|
||||
sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
|
||||
sb.append(String.format("%d ", table[af][errorIndex]));
|
||||
sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
|
||||
sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
|
||||
s.add(sb.toString());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -196,4 +312,24 @@ public class ConcordanceTruthTable {
|
|||
sb.append("%");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String percentOfTotal(int [][] errorMatrix, int alleleFrequency, int errorIndex ) {
|
||||
int count = 0;
|
||||
for ( int i = 0; i < errorMatrix.length; i ++ ) {
|
||||
for ( int j = 0; j < 4; j ++ ) {
|
||||
count += errorMatrix[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return cellPercent(errorMatrix[alleleFrequency][errorIndex], count );
|
||||
}
|
||||
|
||||
private static String marginalPercent ( int[] errors, int errorIndex ) {
|
||||
int count = 0;
|
||||
for ( int i = 0; i < 4; i ++ ) {
|
||||
count += errors[i];
|
||||
}
|
||||
|
||||
return cellPercent(errors[errorIndex], count);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ public class PooledConcordance extends ChipConcordance implements PoolAnalysis {
|
|||
protected ConcordanceTruthTable[] createTruthTableMappings(List<String> rodNames, Map<String, Integer> sampleToArrayIndex) {
|
||||
// there is only one truth table - the pool-wide one
|
||||
ConcordanceTruthTable[] tables = new ConcordanceTruthTable[1];
|
||||
tables[0] = new ConcordanceTruthTable();
|
||||
tables[0] = new ConcordanceTruthTable(rodNames.size());
|
||||
|
||||
for ( String name : rodNames ) {
|
||||
sampleToArrayIndex.put(name, 0);
|
||||
|
|
|
|||
|
|
@ -1,396 +0,0 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 9, 2009
|
||||
* Time: 4:45:21 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
public class PooledGenotypeConcordance extends BasicVariantAnalysis implements PoolAnalysis, GenotypeAnalysis {
|
||||
|
||||
private PooledConcordanceTable table;
|
||||
private String[] hapmapNames;
|
||||
|
||||
public PooledGenotypeConcordance( String pathToPoolFile ) {
|
||||
super("Pooled_Genotype_Concordance");
|
||||
if( pathToPoolFile == null ) {
|
||||
table = null;
|
||||
hapmapNames = null;
|
||||
} else {
|
||||
generateNameTableFromFile( pathToPoolFile );
|
||||
table = new PooledConcordanceTable( hapmapNames.length );
|
||||
}
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
String s = table.updateTable(tracker, ref, eval, hapmapNames);
|
||||
if ( s == null ) {
|
||||
return null;
|
||||
} else {
|
||||
return s + " " + context.getLocation().toString();
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
return table.standardOutput();
|
||||
}
|
||||
|
||||
// private methods for reading in names from a file
|
||||
|
||||
private void generateNameTableFromFile(String file) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(file));
|
||||
} catch( FileNotFoundException e) {
|
||||
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
LinkedList<String> nameList = new LinkedList<String>();
|
||||
|
||||
while(continueReading(reader)) {
|
||||
String line = readLine(reader);
|
||||
nameList.add(line);
|
||||
}
|
||||
|
||||
hapmapNames = nameList.toArray(new String[nameList.size()]);
|
||||
}
|
||||
|
||||
private boolean continueReading(BufferedReader reader) {
|
||||
boolean continueReading;
|
||||
try {
|
||||
continueReading = reader.ready();
|
||||
} catch(IOException e) {
|
||||
continueReading = false;
|
||||
}
|
||||
return continueReading;
|
||||
}
|
||||
|
||||
private String readLine(BufferedReader reader) {
|
||||
String line;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch( IOException e) {
|
||||
String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
|
||||
throw new StingException(errMsg,e);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class PooledConcordanceTable {
|
||||
private final int CALL_INDECES = 6;
|
||||
private final int TRUTH_INDECES = 4;
|
||||
private final int NO_CALL = 5;
|
||||
private final int REF_CALL = 0; // synonym
|
||||
private final int VARIANT_CALL_NONHAPMAP = 1;
|
||||
private final int VARIANT_CALL_MATCH = 2;
|
||||
private final int VARIANT_CALL = 2; // re-using index
|
||||
private final int VARIANT_CALL_MISMATCH = 3;
|
||||
private final int VARIANT_CALL_UNKNOWN = 4;
|
||||
private final int NO_TRUTH_DATA = 0;
|
||||
private final int TRUTH_REF = 1;
|
||||
private final int TRUTH_VAR = 2;
|
||||
private final int TRUTH_UNKNOWN = 3;
|
||||
|
||||
private int[][] table;
|
||||
private int[][][] tableByHMFrequency;
|
||||
private int variantCallsAtRefN;
|
||||
private int poolSize;
|
||||
|
||||
public PooledConcordanceTable(int poolSize) {
|
||||
this.poolSize = poolSize;
|
||||
table = new int[TRUTH_INDECES][CALL_INDECES];
|
||||
tableByHMFrequency = new int[calculateNumFrequencyIndeces(poolSize)][TRUTH_INDECES][CALL_INDECES];
|
||||
variantCallsAtRefN = 0;
|
||||
|
||||
for ( int j = 0; j < TRUTH_INDECES; j ++ ) {
|
||||
for ( int k = 0; k < CALL_INDECES; k ++ ) {
|
||||
table[j][k] = 0;
|
||||
for( int i = 0; i < calculateNumFrequencyIndeces(poolSize); i ++ ) {
|
||||
tableByHMFrequency[i][j][k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
||||
int truthIndex, callIndex, frequencyIndex = -1;
|
||||
List<Variation> chips = getChips(names, tracker);
|
||||
if ( ref == 'N' || ref == 'n' ) {
|
||||
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
|
||||
truthIndex = NO_TRUTH_DATA; // don't want calls on Ns to factor in to calculation
|
||||
callIndex = NO_CALL; // don't want calls on Ns to factor in to calculation
|
||||
} else if( chips.isEmpty() ) {
|
||||
truthIndex = NO_TRUTH_DATA;
|
||||
if ( eval == null ) {
|
||||
callIndex = NO_CALL;
|
||||
} else {
|
||||
callIndex = ( pooledCallIsRef( eval, ref ) ) ? REF_CALL : VARIANT_CALL_NONHAPMAP;
|
||||
}
|
||||
} else {
|
||||
frequencyIndex = calcVariantFrequencyIndex(ref, chips);
|
||||
|
||||
if ( freqIndexToFrequency( frequencyIndex ) != 0 ) {
|
||||
truthIndex = TRUTH_VAR;
|
||||
} else if ( chips.size() == poolSize ) {
|
||||
truthIndex = TRUTH_REF;
|
||||
} else {
|
||||
truthIndex = TRUTH_UNKNOWN;
|
||||
}
|
||||
|
||||
if ( eval == null ) {
|
||||
callIndex = NO_CALL;
|
||||
} else {
|
||||
if ( pooledCallIsRef( eval, ref ) ) {
|
||||
callIndex = REF_CALL;
|
||||
} else if ( truthIndex == TRUTH_REF ) {
|
||||
callIndex = VARIANT_CALL;
|
||||
} else if (truthIndex == TRUTH_UNKNOWN) {
|
||||
callIndex = VARIANT_CALL_UNKNOWN;
|
||||
} else if ( mismatchingCalls( eval, chips, ref ) ) {
|
||||
callIndex = VARIANT_CALL_MISMATCH;
|
||||
} else {
|
||||
callIndex = VARIANT_CALL_MATCH;
|
||||
}
|
||||
}
|
||||
|
||||
tableByHMFrequency[frequencyIndex][truthIndex][callIndex] ++; // note that this updates only on HAPMAP sites. This is good.
|
||||
}
|
||||
|
||||
table[truthIndex][callIndex] ++;
|
||||
|
||||
String interest;
|
||||
|
||||
if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
|
||||
interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
|
||||
} else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
|
||||
interest = "False_Positive";
|
||||
} else if ( callIndex == VARIANT_CALL_MISMATCH ) {
|
||||
interest = "Mismatching_call";
|
||||
} else {
|
||||
interest = null;
|
||||
}
|
||||
|
||||
return interest;
|
||||
}
|
||||
|
||||
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
|
||||
LinkedList<Variation> chips = new LinkedList<Variation>();
|
||||
for ( String name : names ) {
|
||||
Variation chip = (Variation) tracker.lookup(name,null);
|
||||
if ( chip != null ) {
|
||||
chips.add(chip);
|
||||
}
|
||||
}
|
||||
|
||||
return chips;
|
||||
}
|
||||
|
||||
public boolean pooledCallIsRef(Variation eval, char ref) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return Utils.join("",eval.getAlleleList()).equalsIgnoreCase((Utils.dupString(ref,2)));
|
||||
}
|
||||
|
||||
public int calculateNumFrequencyIndeces(int poolSize) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return 2*(poolSize+1);
|
||||
}
|
||||
|
||||
public int calcVariantFrequencyIndex( char ref, List<Variation> evals ) {
|
||||
return frequencyToFrequencyIndex( calcVariantFrequency( evals, ref ) );
|
||||
}
|
||||
|
||||
public int frequencyToFrequencyIndex( double frequency ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return (int) frequency;
|
||||
}
|
||||
|
||||
public double calcVariantFrequency( List<Variation> evals, char ref ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
Variation firstEval = evals.get(0);
|
||||
double alternateFrequency = 0.0;
|
||||
for ( Variation eval : evals ) {
|
||||
if ( mismatchingCalls(firstEval, eval, ref) ) {
|
||||
// todo -- make this not a StingException but go to the log
|
||||
throw new StingException("Tri-Allelic Position "+Utils.join("",eval.getAlleleList())+"/"+Utils.join("",firstEval.getAlleleList()) + " Ref: "+ ref + " not supported");
|
||||
} else {
|
||||
alternateFrequency += calledVariantFrequency(eval,ref);
|
||||
}
|
||||
}
|
||||
return alternateFrequency;
|
||||
}
|
||||
|
||||
public boolean mismatchingCalls(Variation var1, List<Variation> vars, char ref) {
|
||||
ListIterator varIter = vars.listIterator();
|
||||
try {
|
||||
while( pooledCallIsRef( (Variation) varIter.next(), ref ) ) {
|
||||
// don't do squat!
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
String errMsg = "Comparison data given to mismatchingCalls(Variation, List<Variation>, char) was entirely reference. This is a bug and should never happen.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
return mismatchingCalls( var1, (Variation) varIter.previous(), ref);
|
||||
}
|
||||
|
||||
public boolean mismatchingCalls(Variation eval, Variation chip, char ref) {
|
||||
// eval and chip guaranteed to be non-null
|
||||
char chipF = Utils.stringToChar(chip.getAlleleList().get(0));
|
||||
char chipS = Utils.stringToChar(chip.getAlleleList().get(1));
|
||||
char evalF = Utils.stringToChar(eval.getAlleleList().get(0));
|
||||
char evalS = Utils.stringToChar(eval.getAlleleList().get(1));
|
||||
boolean mismatch;
|
||||
if (chipF == ref) {
|
||||
if ( chipS == ref ) {
|
||||
mismatch = false; // no mismatch against hom ref
|
||||
} else {
|
||||
mismatch = ( evalF != chipS && evalS != chipS ); // test against het nonref
|
||||
}
|
||||
} else if ( chipS == ref ) { // het nonref (somehow made nonref|ref rather than ref|nonref)
|
||||
mismatch = ( evalF != chipF && evalS != chipF ); // test against het nonref
|
||||
} else { // chip is homozygous nonreference
|
||||
if( evalF == ref ) {
|
||||
mismatch = ( evalS != chipF && evalS != chipS ); // make sure the nonref call of eval matches one of the chip alleles
|
||||
} else if ( evalS == ref ) {
|
||||
mismatch = ( evalF != chipF && evalF != chipS ); // make sure the nonref call of eval matches one of the chip alleles
|
||||
} else {
|
||||
// both are hom nonref
|
||||
mismatch = ( evalF != chipF || evalS != chipS); // could be multiallelic calls
|
||||
}
|
||||
}
|
||||
return mismatch;
|
||||
}
|
||||
|
||||
public double calledVariantFrequency( Variation var, char ref ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
String varStr = Utils.join("",var.getAlleleList());
|
||||
double freq;
|
||||
if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) {
|
||||
freq = (double) 2;
|
||||
} else if ( varStr.charAt(0) == ref && varStr.charAt(1) == ref ) {
|
||||
freq = (double) 0;
|
||||
} else {
|
||||
freq = (double) 1;
|
||||
}
|
||||
|
||||
return freq;
|
||||
}
|
||||
|
||||
public double freqIndexToFrequency( int freqIndex ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return (double) freqIndex;
|
||||
}
|
||||
|
||||
public List<String> standardOutput() {
|
||||
LinkedList<String> out = new LinkedList<String>();
|
||||
int nSNPCallSites = 0;
|
||||
int nHapmapSites = 0;
|
||||
int nHapmapSNPSites = 0;
|
||||
int nFullHapmapRefSites = 0;
|
||||
int nUnknownHapmapSites = 0;
|
||||
|
||||
for( int truthIndex = 0; truthIndex < TRUTH_INDECES; truthIndex ++ ) {
|
||||
nSNPCallSites += table[truthIndex][VARIANT_CALL_MATCH] + table[truthIndex][VARIANT_CALL_MISMATCH] + table[truthIndex][VARIANT_CALL_NONHAPMAP];
|
||||
nSNPCallSites += table[truthIndex][VARIANT_CALL_UNKNOWN];
|
||||
}
|
||||
|
||||
for ( int callIndex = 0; callIndex < CALL_INDECES; callIndex ++ ) {
|
||||
nHapmapSites += table[TRUTH_REF][callIndex] + table[TRUTH_VAR][callIndex] + table[TRUTH_UNKNOWN][callIndex];
|
||||
nUnknownHapmapSites += table[TRUTH_UNKNOWN][callIndex];
|
||||
nHapmapSNPSites += table[TRUTH_VAR][callIndex];
|
||||
nFullHapmapRefSites += table[TRUTH_REF][callIndex];
|
||||
}
|
||||
|
||||
int nRefsCalledCorrectly = table[TRUTH_REF][REF_CALL];
|
||||
int nHapmapRefsNotCalled = table[TRUTH_REF][NO_CALL];
|
||||
int nRefsCalledAsSNP = table[TRUTH_REF][VARIANT_CALL];
|
||||
int nSNPsCalledCorrectly = table[TRUTH_VAR][VARIANT_CALL_MATCH];
|
||||
int nSNPsCalledIncorrectly = table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
|
||||
int nSNPsCalledAsRef = table[TRUTH_VAR][REF_CALL];
|
||||
int nHapmapSNPsNotCalled = table[TRUTH_VAR][NO_CALL];
|
||||
int nSNPsOnHapmapSNP = table[TRUTH_VAR][VARIANT_CALL_MATCH] + table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
|
||||
int nSNPsAtNonHapmap = table[NO_TRUTH_DATA][VARIANT_CALL_NONHAPMAP];
|
||||
int nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals = table[TRUTH_UNKNOWN][VARIANT_CALL_UNKNOWN];
|
||||
|
||||
out.add(String.format("Total Number of SNP Calls:\t\t%d", nSNPCallSites));
|
||||
out.add(String.format("Total SNP calls on non-Hapmap sites\t\t%d", nSNPsAtNonHapmap));
|
||||
out.add(String.format("Number of Hapmap Sites:\t\t%d", nHapmapSites));
|
||||
out.add("Data on Hapmap Reference Sites");
|
||||
out.add(String.format("\tSites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
|
||||
out.add(String.format("\t\tReference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
|
||||
out.add(String.format("\t\tReference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
|
||||
out.add(String.format("\t\tReference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
|
||||
out.add(String.format("\tSites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
|
||||
out.add(String.format("\t\tPutative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
|
||||
out.add(String.format("\t\tPutative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
|
||||
out.add(String.format("\t\tPutative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
|
||||
out.add("Data on Hapmap Variant Sites");
|
||||
out.add(String.format("\tNumber of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
|
||||
out.add(String.format("\t\tSNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
|
||||
out.add(String.format("\t\tSNP sites not confidently called:\t%d\t(%d%%)", nHapmapSNPsNotCalled, divideToPercent(nHapmapSNPsNotCalled, nHapmapSNPSites)));
|
||||
out.add(String.format("\tSNP calls on Hapmap SNP Sites:\t\t%d", nSNPsOnHapmapSNP));
|
||||
out.add(String.format("\t\tSNP sites correctly called SNP:\t%d\t(%d%%)", nSNPsCalledCorrectly, divideToPercent(nSNPsCalledCorrectly, nSNPsOnHapmapSNP)));
|
||||
out.add(String.format("\t\tSNP sites called a different base:\t%d\t(%d%%)", nSNPsCalledIncorrectly, divideToPercent(nSNPsCalledIncorrectly, nSNPsOnHapmapSNP)));
|
||||
out.add(String.format("Calls on reference N:\t\t%d", variantCallsAtRefN));
|
||||
out.add("----------------------- Output By Allele Frequency ------------------------");
|
||||
out.add("");
|
||||
out.add("FREQUENCY \tFALSE_POSITIVES\tTRUE_NEGATIVES\tFALSE_NEGATIVES\tTRUE_POSITIVES\tMISCALLS\tNO_CALLS\tFALSE_NEGATIVE_RATE\tFALSE_POSITIVE_RATE");
|
||||
for( int i = 0; i < getLargestOutputAlleleFrequencyIndex(); i ++) {
|
||||
double freq = freqIndexToFrequency(i);
|
||||
int nRefsCalledAsSNPFreq = tableByHMFrequency[i][TRUTH_REF][VARIANT_CALL];
|
||||
int nRefsCalledCorrectFreq = tableByHMFrequency[i][TRUTH_REF][REF_CALL];
|
||||
int nSNPsCalledIncorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MISMATCH];
|
||||
int nSNPsCalledCorrectlyFreq = tableByHMFrequency[i][TRUTH_VAR][VARIANT_CALL_MATCH];
|
||||
int nSNPsCalledAsRefFreq = tableByHMFrequency[i][TRUTH_VAR][REF_CALL];
|
||||
int nNoCallsAtFreq = tableByHMFrequency[i][TRUTH_VAR][NO_CALL] + tableByHMFrequency[i][TRUTH_REF][NO_CALL];
|
||||
int nSnpsCalledAtFreq = nSNPsCalledIncorrectlyFreq + nSNPsCalledCorrectlyFreq + nSNPsCalledAsRefFreq;
|
||||
int nSNPsAtFreq = 0;
|
||||
for( int j = 0; j < CALL_INDECES; j ++ ) {
|
||||
nSNPsAtFreq += tableByHMFrequency[i][TRUTH_VAR][j];
|
||||
}
|
||||
int nSNPsNoCallFreq = tableByHMFrequency[i][TRUTH_VAR][NO_CALL];
|
||||
double fnrate = ((double) (nSNPsNoCallFreq + nSNPsCalledAsRefFreq) / (nSNPsAtFreq));
|
||||
double fprate = ((double) nRefsCalledAsSNPFreq / (nRefsCalledAsSNPFreq + nRefsCalledCorrectFreq + tableByHMFrequency[i][TRUTH_REF][NO_CALL]));
|
||||
out.add(String.format("%f\t%d\t\t%d\t\t%d\t\t\t%d\t%d\t\t%d\t\t%f\t\t\t%f",
|
||||
freq, nRefsCalledAsSNPFreq, nRefsCalledCorrectFreq, nSNPsCalledAsRefFreq, nSNPsCalledCorrectlyFreq, nSNPsCalledIncorrectlyFreq, nNoCallsAtFreq, fnrate, fprate));
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
public int getLargestOutputAlleleFrequencyIndex() {
|
||||
// fixed - returns the last index at which genotype sites appear
|
||||
|
||||
int freqIndex = -1;
|
||||
for ( int j = 0; j < calculateNumFrequencyIndeces(poolSize); j ++ ) {
|
||||
int nHapmapSitesAtFreq = 0;
|
||||
for ( int i = 0; i < CALL_INDECES; i ++) {
|
||||
nHapmapSitesAtFreq += table[TRUTH_REF][i] + table[TRUTH_VAR][i] + table[TRUTH_UNKNOWN][i];
|
||||
}
|
||||
|
||||
if ( nHapmapSitesAtFreq > 0 ) {
|
||||
freqIndex = j;
|
||||
}
|
||||
}
|
||||
|
||||
return freqIndex;
|
||||
}
|
||||
|
||||
public int divideToPercent( int numerator, int denominator ) {
|
||||
return (int) Math.floor(100.0*((double) numerator)/ denominator );
|
||||
}
|
||||
}
|
||||
|
|
@ -130,15 +130,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
//
|
||||
analyses.add(new ValidationDataAnalysis());
|
||||
|
||||
// todo -- chris remove?
|
||||
//analyses.add(new PooledGenotypeConcordance(samplesFile));
|
||||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage(knownSNPDBName));
|
||||
//analyses.add(new PooledFrequencyAnalysis(numPeopleInPool,knownSNPDBName));
|
||||
if ( samplesFile == null )
|
||||
analyses.add(new GenotypeConcordance(genotypeChipName, false));
|
||||
else
|
||||
analyses.add(new GenotypeConcordance(samplesFile, true));
|
||||
else if ( numPeopleInPool < 1)
|
||||
analyses.add(new GenotypeConcordance(samplesFile, true));
|
||||
else {
|
||||
analyses.add(new PooledConcordance(samplesFile,true));
|
||||
}
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
|
|
|
|||
Loading…
Reference in New Issue