PooledConcordance calculations have been reformatted and bugs fixed. Now properly handles monomorphic sites. Also works with -G option now, correctly
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2412 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9bf2d12c64
commit
ee8bcdc61d
|
|
@ -10,10 +10,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
|
|||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -256,6 +253,8 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
/**
|
||||
* get the genotype
|
||||
*
|
||||
* // todo -- WTF is this? This is a deeply unsafe call
|
||||
*
|
||||
* @return a map in lexigraphical order of the genotypes
|
||||
*/
|
||||
public Genotype getCalledGenotype() {
|
||||
|
|
@ -301,6 +300,18 @@ public class RodVCF extends BasicReferenceOrderedDatum implements VariationRod,
|
|||
return mCurrentRecord.getSampleNames();
|
||||
}
|
||||
|
||||
// public Map<String, Genotype> getSampleGenotypes() {
|
||||
// String[] samples = getSampleNames();
|
||||
// List<Genotype> genotypes = getGenotypes();
|
||||
// HashMap<String, Genotype> map = new HashMap<String, Genotype>();
|
||||
//
|
||||
// for ( int i = 0; i < samples.length; i++ ) {
|
||||
// map.put(samples[i], genotypes.get(i));
|
||||
// }
|
||||
//
|
||||
// return map;
|
||||
// }
|
||||
|
||||
public Map<String, String> getInfoValues() {
|
||||
assertNotNull();
|
||||
return mCurrentRecord.getInfoValues();
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
|||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -22,7 +23,7 @@ public class ConcordanceTruthTable {
|
|||
private static final int FALSE_POSITIVE = 2;
|
||||
private static final int FALSE_NEGATIVE = 3;
|
||||
private static final int VARIANT = 1;
|
||||
private static final String[] POOL_HEADERS = {"TRUE_POSITIVE","TRUE_NEGATIVE","FALSE_POSITIVE","FALSE_NEGATIVE"};
|
||||
private static final String[] POOL_HEADERS = {"TP","TN","FP","FN"};
|
||||
|
||||
private static final int REF = 0;
|
||||
private static final int VAR_HET = 1;
|
||||
|
|
@ -103,24 +104,19 @@ public class ConcordanceTruthTable {
|
|||
}
|
||||
} else { // if we cannot associate tables with individuals, then we are working in a pooled context
|
||||
// first we need to expand our tables to include frequency information
|
||||
Pair<Integer, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
|
||||
|
||||
Pair<Genotype, Pair<Integer,Integer> > poolVariant = getPooledAlleleFrequency(chipEvals, ref);
|
||||
|
||||
int truthType = getGenotype(poolVariant.getFirst(),ref); // convenience method; now to interpret
|
||||
|
||||
if (truthType == VAR_HET || truthType == VAR_HOM) {
|
||||
truthType = VARIANT;
|
||||
}
|
||||
|
||||
int truthType = poolVariant.getFirst(); // convenience method; now to interpret
|
||||
int callType = getCallIndex(eval,ref);
|
||||
|
||||
int numTrueSupportingAlleles = poolVariant.getSecond().getFirst();
|
||||
if ( numTrueSupportingAlleles > 0 && truthType == VARIANT && callType != VARIANT ) {
|
||||
violation = String.format("False negative: %s with %d alt alleles",
|
||||
chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
|
||||
violation = String.format("False negative: %s with %d alt alleles", chipEvals.get(0).getFirst(), numTrueSupportingAlleles);
|
||||
} else if ( truthType == REF && callType == VARIANT ) {
|
||||
violation = String.format("False positive: %s at hom-ref site", eval);
|
||||
}
|
||||
|
||||
addFrequencyEntry( truthType,callType, poolVariant.getSecond().getFirst() );
|
||||
addFrequencyEntry( truthType, callType, poolVariant.getSecond().getFirst() );
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -131,46 +127,48 @@ public class ConcordanceTruthTable {
|
|||
return violation;
|
||||
}
|
||||
|
||||
public Pair< Genotype , Pair<Integer,Integer> > getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
|
||||
// TODO -- this is actually just a note that I wanted to appear in blue. This method explicitly uses
|
||||
// TODO --- the assumption that tri-allelic sites do not really exist, and that if they do the
|
||||
// TODO --- site will be marked as such by an 'N' in the reference, so we will not get to this point.
|
||||
Genotype variant = null;
|
||||
public Pair<Integer, Pair<Integer,Integer>> getPooledAlleleFrequency( List<Pair<Genotype,Genotype>> chips, char ref) {
|
||||
// this is actually just a note that I wanted to appear in blue. This method explicitly uses
|
||||
// the assumption that tri-allelic sites do not really exist, and that if they do the
|
||||
// site will be marked as such by an 'N' in the reference, so we will not get to this point.
|
||||
|
||||
int frequency = 0;
|
||||
int nChips = 0;
|
||||
if ( chips != null ) {
|
||||
for ( Pair<Genotype,Genotype> chip : chips ) {
|
||||
Genotype c = chip.getFirst();
|
||||
if ( c != null ) {
|
||||
nChips ++;
|
||||
nChips++;
|
||||
if ( c.isVariant(ref) ) {
|
||||
if ( c.isHet() ) {
|
||||
frequency ++;
|
||||
frequency++;
|
||||
} else { // c is hom
|
||||
frequency += 2;
|
||||
}
|
||||
variant = c;
|
||||
}
|
||||
//System.out.printf(" Genotype %s at %c => %d%n", c, ref, frequency);
|
||||
}
|
||||
}
|
||||
//System.out.printf("*** %d%n", frequency);
|
||||
}
|
||||
|
||||
return new Pair< Genotype, Pair<Integer,Integer> >(variant, new Pair<Integer,Integer>(frequency,nChips));
|
||||
|
||||
int truthType = nChips > 0 ? ( frequency > 0 ? VARIANT : REF ) : NO_CALL;
|
||||
return new Pair<Integer, Pair<Integer,Integer> >(truthType, new Pair<Integer,Integer>(frequency,nChips));
|
||||
}
|
||||
|
||||
private void addFrequencyEntry ( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
|
||||
calls_totals[callIndex] ++;
|
||||
truth_totals[truthIndex] ++;
|
||||
private void addFrequencyEntry( int truthIndex, int callIndex, int numTrueSupportingAlleles ) {
|
||||
//System.out.printf(" %s %s %d%n", CALL_NAMES[truthIndex], CALL_NAMES[callIndex], numTrueSupportingAlleles);
|
||||
calls_totals[callIndex]++;
|
||||
truth_totals[truthIndex]++;
|
||||
|
||||
if ( truthIndex == REF && callIndex == REF ) {
|
||||
if ( truthIndex == REF && ( callIndex == REF || callIndex == NO_CALL ) ) {
|
||||
// true negative
|
||||
table[numTrueSupportingAlleles][TRUE_NEGATIVE] ++;
|
||||
table[numTrueSupportingAlleles][TRUE_NEGATIVE]++;
|
||||
// sanity check - there should never be an entry in
|
||||
// [*][TRUE_NEGATIVE] for * > 0
|
||||
} else if ( truthIndex == REF && callIndex == VARIANT ) {
|
||||
// false positive
|
||||
table[numTrueSupportingAlleles][FALSE_POSITIVE] ++;
|
||||
table[numTrueSupportingAlleles][FALSE_POSITIVE]++;
|
||||
} else if ( truthIndex == VARIANT && (callIndex == NO_CALL || callIndex == REF) ) {
|
||||
// false negative
|
||||
table[numTrueSupportingAlleles][FALSE_NEGATIVE]++;
|
||||
|
|
@ -224,22 +222,39 @@ public class ConcordanceTruthTable {
|
|||
addFrequencyStats(s);
|
||||
}
|
||||
|
||||
private void addFrequencyStats(List<String> s) {
|
||||
// private void addFrequencyStats(List<String> s) {
|
||||
//
|
||||
// // TODO -- implement me for pooled mode with frequency stats
|
||||
// s.add(String.format("name %s",name));
|
||||
// s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
|
||||
//
|
||||
// for ( int af = 0; af < table.length; af ++ ) {
|
||||
// for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
|
||||
// StringBuffer sb = new StringBuffer();
|
||||
// sb.append(String.format("%f ", ((double) af)/ table.length));
|
||||
// sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
|
||||
// sb.append(String.format("%d ", table[af][errorIndex]));
|
||||
// sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
|
||||
// sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
|
||||
// s.add(sb.toString());
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// }
|
||||
|
||||
// TODO -- implement me for pooled mode with frequency stats
|
||||
private void addFrequencyStats(List<String> s) {
|
||||
s.add(String.format("name %s",name));
|
||||
s.add(String.format("TRUTH_ALLELE_FREQUENCY\tERROR_OR_TRUTH_TYPE\tTOTAL\tAS_PRCT_OF_TOTAL_CALLS\tAS_PRCT_OF_CALLS_AT_FREQUENCY"));
|
||||
s.add("TRUTH_ALLELE_COUNT\tTRUTH_ALLELE_FREQ\tTOTAL\t" + Utils.join(" ", POOL_HEADERS));
|
||||
|
||||
for ( int af = 0; af < table.length; af ++ ) {
|
||||
int sum = 0;
|
||||
String counts = "";
|
||||
for ( int errorIndex = 0; errorIndex < 4; errorIndex ++ ) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(String.format("%f ", ((double) af)/ table.length));
|
||||
sb.append(String.format("%s ",POOL_HEADERS[errorIndex]));
|
||||
sb.append(String.format("%d ", table[af][errorIndex]));
|
||||
sb.append(String.format("%s ", percentOfTotal(table,af,errorIndex)));
|
||||
sb.append(String.format("%s ", marginalPercent(table[af],errorIndex)));
|
||||
s.add(sb.toString());
|
||||
int count = table[af][errorIndex];
|
||||
sum += count;
|
||||
counts += String.format(" %6d", count);
|
||||
}
|
||||
s.add(String.format("%6d %.3f %6d%s", af, ((double)af)/ table.length, sum, counts));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -321,24 +336,4 @@ public class ConcordanceTruthTable {
|
|||
sb.append("%");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String percentOfTotal(int [][] errorMatrix, int alleleFrequency, int errorIndex ) {
|
||||
int count = 0;
|
||||
for ( int i = 0; i < errorMatrix.length; i ++ ) {
|
||||
for ( int j = 0; j < 4; j ++ ) {
|
||||
count += errorMatrix[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return cellPercent(errorMatrix[alleleFrequency][errorIndex], count );
|
||||
}
|
||||
|
||||
private static String marginalPercent ( int[] errors, int errorIndex ) {
|
||||
int count = 0;
|
||||
for ( int i = 0; i < 4; i ++ ) {
|
||||
count += errors[i];
|
||||
}
|
||||
|
||||
return cellPercent(errors[errorIndex], count);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
|||
* Time: 4:39:12 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public interface PoolAnalysis {
|
||||
public interface PoolAnalysis extends PopulationAnalysis, GenotypeAnalysis {
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,8 +48,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
@Argument(fullName="supressDateInformation", doc="This flag indicates that we want to suppress the date information from the output, so that if can be diff'ed against previous evals.", required=false)
|
||||
public boolean supressDateInformation = false;
|
||||
|
||||
@Argument(fullName = "numPeopleInPool", shortName="PS", doc="If using a variant file from a pooled caller, this field provides the number of individuals in each pool", required=false)
|
||||
public int numPeopleInPool = -1;
|
||||
//@Argument(fullName = "numPeopleInPool", shortName="PS", doc="If using a variant file from a pooled caller, this field provides the number of individuals in each pool", required=false)
|
||||
//public int numPeopleInPool = -1;
|
||||
|
||||
@Argument(fullName = "samplesFile", shortName="samples", doc="When running an analysis on one or more individuals with truth data, this field provides a filepath to the listing of which samples are used (and are used to name corresponding rods with -B)", required=false)
|
||||
public String samplesFile = null;
|
||||
|
|
@ -130,14 +130,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
|
||||
analyses.add(new VariantCounter());
|
||||
analyses.add(new VariantDBCoverage(knownSNPDBName));
|
||||
|
||||
if ( samplesFile != null ) {
|
||||
if ( numPeopleInPool < 1 )
|
||||
analyses.add(new GenotypeConcordance(samplesFile, true));
|
||||
else
|
||||
analyses.add(new PooledConcordance(samplesFile, true));
|
||||
//if ( numPeopleInPool < 1 )
|
||||
// analyses.add(new GenotypeConcordance(samplesFile, true));
|
||||
//else
|
||||
analyses.add(new PooledConcordance(samplesFile, true));
|
||||
} else {
|
||||
analyses.add(new GenotypeConcordance(genotypeChipName, false));
|
||||
}
|
||||
|
||||
analyses.add(new TransitionTranversionAnalysis());
|
||||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
|
|
@ -160,7 +162,8 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
} else if ( ! (analysis instanceof PopulationAnalysis || analysis instanceof PoolAnalysis) ) {
|
||||
removed = analysis;
|
||||
iter.remove();
|
||||
} else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
|
||||
// } else if ( numPeopleInPool > 1 && ! ( analysis instanceof PoolAnalysis ) ) {
|
||||
} else if ( analysis instanceof PoolAnalysis && samplesFile == null ) {
|
||||
removed = analysis;
|
||||
iter.remove();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue