PooledConcordance calculations have been reformatted and bugs fixed. Now properly handles monomorphic sites. Also works with -G option now, correctly

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2412 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-12-19 23:22:36 +00:00
parent 9bf2d12c64
commit ee8bcdc61d
4 changed files with 79 additions and 70 deletions

View File

@ -10,10 +10,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Map;
import java.util.*;
/**
@ -256,6 +253,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
/**
* get the genotype
*
* // todo -- WTF is this? This is a deeply unsafe call
*
* @return a map in lexigraphical order of the genotypes
*/
public Genotype getCalledGenotype() {
@ -301,6 +300,18 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
return mCurrentRecord.getSampleNames();
}
// public Map<String, Genotype> getSampleGenotypes() {
// String[] samples = getSampleNames();
// List<Genotype> genotypes = getGenotypes();
// HashMap<String, Genotype> map = new HashMap<String, Genotype>();
//
// for ( int i = 0; i < samples.length; i++ ) {
// map.put(samples[i], genotypes.get(i));
// }
//
// return map;
// }
public Map<String, String> getInfoValues() {
assertNotNull();
return mCurrentRecord.getInfoValues();

View File

@ -4,6 +4,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.Utils;
import java.util.*;
@ -22,7 +23,7 @@ public class ConcordanceTruthTable {
private static final int FALSE_POSITIVE = 2;
private static final int FALSE_NEGATIVE = 3;
private static final int VARIANT = 1;
private static final String[] POOL_HEADERS = {"TRUE_POSITIVE","TRUE_NEGATIVE","FALSE_POSITIVE","FALSE_NEGATIVE"};
private static final String[] POOL_HEADERS = {"TP","TN","FP","FN"};
private static final int REF = 0;
private static final int VAR_HET = 1;
@ -103,24 +104,19 @@ public class ConcordanceTruthTable {
}
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
// first we need to expand our tables to include frequency information
Pair<Integer, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
Pair<Genotype, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
int truthType = getGenotype(poolVariant.getFirst(),ref); // convenience method; now to interpret
if (truthType == VAR_HET || truthType == VAR_HOM) {
truthType = VARIANT;
}
int truthType = poolVariant.getFirst(); // convenience method; now to interpret
int callType = getCallIndex(eval,ref);
int numTrueSupportingAlleles = poolVariant.getSecond().getFirst();
if ( numTrueSupportingAlleles > 0 && truthType == VARIANT && callType != VARIANT ) {
violation = String.format("False negative: %s with %d alt alleles",
chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
violation = String.format("False negative: %s with %d alt alleles", chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
} else if ( truthType == REF && callType == VARIANT ) {
violation = String.format("False positive: %s at hom-ref site", eval);
}
addFrequencyEntry( truthType,callType, poolVariant.getSecond().getFirst() );
addFrequencyEntry( truthType, callType, poolVariant.getSecond().getFirst() );
}
@ -131,46 +127,48 @@ public class ConcordanceTruthTable {
return violation;
}
public Pair< Genotype , Pair<Integer,Integer> > getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
// TODO -- this is actually just a note that I wanted to appear in blue. This method explicitly uses
// TODO --- the assumption that tri-allelic sites do not really exist, and that if they do the
// TODO --- site will be marked as such by an 'N' in the reference, so we will not get to this point.
Genotype variant = null;
public Pair<Integer, Pair<Integer,Integer>> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
// this is actually just a note that I wanted to appear in blue. This method explicitly uses
// the assumption that tri-allelic sites do not really exist, and that if they do the
// site will be marked as such by an 'N' in the reference, so we will not get to this point.
int frequency = 0;
int nChips = 0;
if ( chips != null ) {
for ( Pair<Genotype,Genotype> chip : chips ) {
Genotype c = chip.getFirst();
if ( c != null ) {
nChips ++;
nChips++;
if ( c.isVariant(ref) ) {
if ( c.isHet() ) {
frequency ++;
frequency++;
} else { // c is hom
frequency += 2;
}
variant = c;
}
//System.out.printf(" Genotype %s at %c => %d%n", c, ref, frequency);
}
}
//System.out.printf("*** %d%n", frequency);
}
return new Pair< Genotype, Pair<Integer,Integer> >(variant, new Pair<Integer,Integer>(frequency,nChips));
int truthType = nChips > 0 ? ( frequency > 0 ? VARIANT : REF ) : NO_CALL;
return new Pair<Integer, Pair<Integer,Integer> >(truthType, new Pair<Integer,Integer>(frequency,nChips));
}
private void addFrequencyEntry ( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
calls_totals[callIndex] ++;
truth_totals[truthIndex] ++;
private void addFrequencyEntry( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
//System.out.printf(" %s %s %d%n", CALL_NAMES[truthIndex], CALL_NAMES[callIndex], numTrueSupportingAlleles);
calls_totals[callIndex]++;
truth_totals[truthIndex]++;
if ( truthIndex == REF && callIndex == REF ) {
if ( truthIndex == REF && ( callIndex == REF || callIndex == NO_CALL ) ) {
// true negative
table[numTrueSupportingAlleles][TRUE_NEGATIVE] ++;
table[numTrueSupportingAlleles][TRUE_NEGATIVE]++;
// sanity check - there should never be an entry in
// [*][TRUE_NEGATIVE] for * > 0
} else if ( truthIndex == REF && callIndex == VARIANT ) {
// false positive
table[numTrueSupportingAlleles][FALSE_POSITIVE] ++;
table[numTrueSupportingAlleles][FALSE_POSITIVE]++;
} else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) {
// false negative
table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
@ -224,22 +222,39 @@ public class ConcordanceTruthTable {
addFrequencyStats(s);
}
private void addFrequencyStats(List<String> s) {
// private void addFrequencyStats(List<String> s) {
//
// // TODO -- implement me for pooled mode with frequency stats
// s.add(String.format("name %s",name));
// s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
//
// for ( int af = 0; af < table.length; af ++ ) {
// for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
// StringBuffer sb = new StringBuffer();
// sb.append(String.format("%f ", ((double) af)/ table.length));
// sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
// sb.append(String.format("%d ", table[af][errorIndex]));
// sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
// sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
// s.add(sb.toString());
// }
// }
//
// }
// TODO -- implement me for pooled mode with frequency stats
private void addFrequencyStats(List<String> s) {
s.add(String.format("name %s",name));
s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
s.add("TRUTH_ALLELE_COUNT\tTRUTH_ALLELE_FREQ\tTOTAL\t" + Utils.join(" ", POOL_HEADERS));
for ( int af = 0; af < table.length; af ++ ) {
int sum = 0;
String counts = "";
for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
StringBuffer sb = new StringBuffer();
sb.append(String.format("%f ", ((double) af)/ table.length));
sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
sb.append(String.format("%d ", table[af][errorIndex]));
sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
s.add(sb.toString());
int count = table[af][errorIndex];
sum += count;
counts += String.format(" %6d", count);
}
s.add(String.format("%6d %.3f %6d%s", af, ((double)af)/ table.length, sum, counts));
}
}
@ -321,24 +336,4 @@ public class ConcordanceTruthTable {
sb.append("%");
return sb.toString();
}
private static String percentOfTotal(int [][] errorMatrix, int alleleFrequency, int errorIndex ) {
int count = 0;
for ( int i = 0; i < errorMatrix.length; i ++ ) {
for ( int j = 0; j < 4; j ++ ) {
count += errorMatrix[i][j];
}
}
return cellPercent(errorMatrix[alleleFrequency][errorIndex], count );
}
private static String marginalPercent ( int[] errors, int errorIndex ) {
int count = 0;
for ( int i = 0; i < 4; i ++ ) {
count += errors[i];
}
return cellPercent(errors[errorIndex], count);
}
}

View File

@ -7,6 +7,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
* Time: 4:39:12 PM
* To change this template use File | Settings | File Templates.
*/
public interface PoolAnalysis {
public interface PoolAnalysis extends PopulationAnalysis, GenotypeAnalysis {
}

View File

@ -48,8 +48,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
@Argument(fullName="supressDateInformation", doc="This flag indicates that we want to suppress the date information from the output, so that if can be diff'ed against previous evals.", required=false)
public boolean supressDateInformation = false;
@Argument(fullName = "numPeopleInPool", shortName="PS", doc="If using a variant file from a pooled caller, this field provides the number of individuals in each pool", required=false)
public int numPeopleInPool = -1;
//@Argument(fullName = "numPeopleInPool", shortName="PS", doc="If using a variant file from a pooled caller, this field provides the number of individuals in each pool", required=false)
//public int numPeopleInPool = -1;
@Argument(fullName = "samplesFile", shortName="samples", doc="When running an analysis on one or more individuals with truth data, this field provides a filepath to the listing of which samples are used (and are used to name corresponding rods with -B)", required=false)
public String samplesFile = null;
@ -130,14 +130,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
analyses.add(new VariantCounter());
analyses.add(new VariantDBCoverage(knownSNPDBName));
if ( samplesFile != null ) {
if ( numPeopleInPool < 1 )
analyses.add(new GenotypeConcordance(samplesFile, true));
else
analyses.add(new PooledConcordance(samplesFile, true));
//if ( numPeopleInPool < 1 )
// analyses.add(new GenotypeConcordance(samplesFile, true));
//else
analyses.add(new PooledConcordance(samplesFile, true));
} else {
analyses.add(new GenotypeConcordance(genotypeChipName, false));
}
analyses.add(new TransitionTranversionAnalysis());
analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
@ -160,7 +162,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
} else if ( ! (analysis instanceof PopulationAnalysis || analysis instanceof PoolAnalysis) ) {
removed = analysis;
iter.remove();
} else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
// } else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
} else if ( analysis instanceof PoolAnalysis && samplesFile == null ) {
removed = analysis;
iter.remove();
}