Changes:
Deletion: PooledGenotypeConcordanceNew Rewrite: PooledGenotypeConcordance. It works, and is blazing fast compared to the earlier version (1 order of magnitude speedup)! And is now entirely non-hackey, as opposed to before when there were some hacky bits. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1640 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3e289fcaa4
commit
8fce376792
|
|
@ -8,11 +8,8 @@ import org.broadinstitute.sting.utils.genotype.Variation;
|
|||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.StringTokenizer;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -21,171 +18,43 @@ import java.util.StringTokenizer;
|
|||
* Time: 4:45:21 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
//todo -- onTraversalDone
|
||||
|
||||
|
||||
public class PooledGenotypeConcordance extends BasicVariantAnalysis implements GenotypeAnalysis {
|
||||
private String[] individuals = null;
|
||||
|
||||
private static final boolean DEBUG = true;
|
||||
private PooledConcordanceTable table;
|
||||
private String[] hapmapNames;
|
||||
|
||||
private static final int REF = 0;
|
||||
private static final int VAR_MATCH = 1;
|
||||
private static final int VAR_HET = 1;
|
||||
private static final int VAR_MISMATCH = 2;
|
||||
private static final int VAR_HOM = 2;
|
||||
private static final int UNKNOWN = 3;
|
||||
private static final int NO_CALL = 3; // synonym
|
||||
private static final String[] TRUTH_NAMES = {"IS_REF", "IS_SINGLE_VARIANT", "IS_MULTIPLE_VARIANT", "UNKNOWN"};
|
||||
private static final String[] CALL_NAMES = {"CALLED_REF", "CALLED_SINGLE_VARIANT", "CALLED_MULTIPLE_VARIANT", "NO_CALL"};
|
||||
|
||||
private int[][][] tableByIndividual;
|
||||
private int[][] truthTotalsByIndividual;
|
||||
private int[][] callTotalsByIndividual;
|
||||
|
||||
public PooledGenotypeConcordance(String pathToHapmapPoolFile) {
|
||||
super("genotype_concordance");
|
||||
if(pathToHapmapPoolFile == null) {
|
||||
// do nothing
|
||||
public PooledGenotypeConcordance( String pathToPoolFile ) {
|
||||
super("Pooled Genotype Concordance");
|
||||
if( pathToPoolFile == null ) {
|
||||
table = null;
|
||||
hapmapNames = null;
|
||||
} else {
|
||||
BufferedReader fileReader = null;
|
||||
try {
|
||||
fileReader = new BufferedReader(new FileReader(pathToHapmapPoolFile));
|
||||
} catch (IOException e) {
|
||||
String errorMsg = "Could not open any file at " + pathToHapmapPoolFile + " Please check to see if the path is accurate.";
|
||||
throw new StingException(errorMsg, e);
|
||||
}
|
||||
|
||||
generateNameTableFromFile(fileReader);
|
||||
|
||||
if(DEBUG) {
|
||||
printAllToCheck(individuals);
|
||||
}
|
||||
|
||||
truthTotalsByIndividual = new int[individuals.length][4];
|
||||
callTotalsByIndividual = new int[individuals.length][4];
|
||||
tableByIndividual = new int[individuals.length][4][4];
|
||||
for(int individual = 0; individual < individuals.length; individual ++) {
|
||||
for(int call1 = 0; call1 < 4; call1++) {
|
||||
for(int call2 = 0; call2 < 4; call2++)
|
||||
{
|
||||
tableByIndividual[individual][call1][call2] = 0;
|
||||
}
|
||||
callTotalsByIndividual[individual][call1] = 0;
|
||||
truthTotalsByIndividual[individual][call1] = 0;
|
||||
}
|
||||
}
|
||||
generateNameTableFromFile( pathToPoolFile );
|
||||
table = new PooledConcordanceTable( hapmapNames.length );
|
||||
}
|
||||
}
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context){
|
||||
if(DEBUG && context != null) {
|
||||
System.out.println( "Update Called at location: " + context.getLocation().toString() );
|
||||
} else if (DEBUG) {
|
||||
System.out.println("Update Called, but no alignment context was given.");
|
||||
}
|
||||
|
||||
for( int nameOffset = 0; nameOffset < individuals.length; nameOffset++ ) {
|
||||
inc(eval, tracker, ref, context, nameOffset);
|
||||
}
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
table.updateTable(tracker, ref, eval, hapmapNames);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public void inc(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context, int nameOffset) {
|
||||
Variation chip = (Variation) tracker.lookup(individuals[nameOffset],null);
|
||||
if ((chip != null && !(chip instanceof VariantBackedByGenotype) || (eval != null && !(eval instanceof VariantBackedByGenotype)))) {
|
||||
String errMsg = "Trying to compare non-pooled data or non-genotype data.";
|
||||
throw new StingException(errMsg);
|
||||
}
|
||||
|
||||
DiploidGenotype g = DiploidGenotype.valueOf(Utils.dupString(ref,2));
|
||||
|
||||
if(DEBUG) {
|
||||
System.out.print(incDebugString(nameOffset, chip, eval, ref));
|
||||
}
|
||||
|
||||
// todo -- is this how want to do pooled concordance checking?
|
||||
int truthIndex, callIndex;
|
||||
if( chip == null ) {
|
||||
truthIndex = UNKNOWN;
|
||||
} else if ( chip.getAlternateBases().equals(g.toString()) ) {
|
||||
truthIndex = REF;
|
||||
} else if ( chip.getAlternateBases().charAt(0) != chip.getAlternateBases().charAt(1)) {
|
||||
truthIndex = VAR_HET;
|
||||
} else {
|
||||
truthIndex = VAR_HOM;
|
||||
}
|
||||
|
||||
if( eval == null ) {
|
||||
callIndex = UNKNOWN;
|
||||
} else if ( eval.getAlternateBases().equals(g.toString()) ) {
|
||||
callIndex = REF;
|
||||
} else if ( callWrongBase(eval,chip,ref) ) {
|
||||
callIndex = VAR_MISMATCH;
|
||||
} else {
|
||||
callIndex = VAR_MATCH;
|
||||
}
|
||||
|
||||
if( chip != null || eval != null ) {
|
||||
tableByIndividual[nameOffset][truthIndex][callIndex]++;
|
||||
truthTotalsByIndividual[nameOffset][truthIndex]++;
|
||||
if ( callIndex != NO_CALL ) {
|
||||
callTotalsByIndividual[nameOffset][callIndex]++;
|
||||
}
|
||||
}
|
||||
|
||||
if(DEBUG) {
|
||||
System.out.printf("TruthIndex: %d CallIndex: %d%n", truthIndex, callIndex);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public List<String> done() {
|
||||
return table.standardOutput();
|
||||
}
|
||||
|
||||
public boolean isCorrectVariantType(Variation eval) {
|
||||
// todo -- this. Check if eval is a TypeOf some ROD class that's the right pooled call output that we
|
||||
// todo -- want to deal with.
|
||||
// private methods for reading in names from a file
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean callWrongBase(Variation eval, Variation chip, char ref) {
|
||||
boolean wrongCall;
|
||||
if ( chip == null ) {
|
||||
wrongCall = true;
|
||||
} else if ( chip.getAlternateBases().charAt(0) == chip.getAlternateBases().charAt(1) ) { // homozygous nonref
|
||||
if( eval.getAlternateBases().charAt(0) == chip.getAlternateBases().charAt(0) ||
|
||||
eval.getAlternateBases().charAt(1) == chip.getAlternateBases().charAt(0) ) {
|
||||
wrongCall = false;
|
||||
} else {
|
||||
wrongCall = true;
|
||||
}
|
||||
} else if ( chip.getAlternateBases().charAt(0) == ref || chip.getAlternateBases().charAt(1) == ref ) { // het nonref
|
||||
wrongCall = ( getNonrefBase(eval,ref) != getNonrefBase(chip,ref) );
|
||||
} else { // todo -- what do we really want to do if ref is C, but chip is AG ??
|
||||
wrongCall = true;
|
||||
}
|
||||
return wrongCall;
|
||||
}
|
||||
|
||||
public char getNonrefBase(Variation var, char ref) {
|
||||
char nonRefBase;
|
||||
if( var.getAlternateBases().charAt(0) == ref ) {
|
||||
nonRefBase = var.getAlternateBases().charAt(1);
|
||||
} else {
|
||||
nonRefBase = var.getAlternateBases().charAt(0);
|
||||
private void generateNameTableFromFile(String file) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(file));
|
||||
} catch( FileNotFoundException e) {
|
||||
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
return nonRefBase;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// private methods for reading names into individualsByPool from an external file
|
||||
//
|
||||
|
||||
private void generateNameTableFromFile(BufferedReader reader) {
|
||||
LinkedList<String> nameList = new LinkedList<String>();
|
||||
|
||||
while(continueReading(reader)) {
|
||||
|
|
@ -193,11 +62,11 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
|||
nameList.add(line);
|
||||
}
|
||||
|
||||
individuals = nameList.toArray(new String[nameList.size()]);
|
||||
hapmapNames = nameList.toArray(new String[nameList.size()]);
|
||||
}
|
||||
|
||||
private boolean continueReading(BufferedReader reader) {
|
||||
boolean continueReading = false;
|
||||
boolean continueReading;
|
||||
try {
|
||||
continueReading = reader.ready();
|
||||
} catch(IOException e) {
|
||||
|
|
@ -217,32 +86,246 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
|||
return line;
|
||||
}
|
||||
|
||||
// code strictly for debug
|
||||
|
||||
private void printAllToCheck(String[] blah) {
|
||||
System.out.println("Checking:");
|
||||
for( String s : blah) {
|
||||
System.out.println(s);
|
||||
}
|
||||
}
|
||||
|
||||
private String incDebugString(int nameOffset, Variation chip, Variation eval, char ref) {
|
||||
String truth;
|
||||
if(chip == null) {
|
||||
truth = "NoChip";
|
||||
} else {
|
||||
truth = chip.getAlternateBases();
|
||||
}
|
||||
|
||||
String call;
|
||||
if(eval == null) {
|
||||
call = "NoCall";
|
||||
} else {
|
||||
call = eval.getAlternateBases();
|
||||
}
|
||||
return String.format("Person: %s Ref: %s Truth: %s Call: %s%n",
|
||||
individuals[nameOffset], Character.toString(ref), truth,
|
||||
call);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class PooledConcordanceTable {
|
||||
private final int CALL_INDECES = 5;
|
||||
private final int TRUTH_INDECES = 4;
|
||||
private final int NO_CALL = 0;
|
||||
private final int REF_CALL = 0; // synonym
|
||||
private final int VARIANT_CALL_NONHAPMAP = 1;
|
||||
private final int VARIANT_CALL_MATCH = 2;
|
||||
private final int VARIANT_CALL = 2; // re-using index
|
||||
private final int VARIANT_CALL_MISMATCH = 3;
|
||||
private final int VARIANT_CALL_UNKNOWN = 4;
|
||||
private final int NO_TRUTH_DATA = 0;
|
||||
private final int TRUTH_REF = 1;
|
||||
private final int TRUTH_VAR = 2;
|
||||
private final int TRUTH_UNKNOWN = 3;
|
||||
|
||||
private int[][] table;
|
||||
private int[][][] tableByHMFrequency;
|
||||
private int variantCallsAtRefN;
|
||||
private int poolSize;
|
||||
|
||||
public PooledConcordanceTable(int poolSize) {
|
||||
this.poolSize = poolSize;
|
||||
table = new int[TRUTH_INDECES][CALL_INDECES];
|
||||
tableByHMFrequency = new int[calculateNumFrequencyIndeces(poolSize)][TRUTH_INDECES][CALL_INDECES];
|
||||
variantCallsAtRefN = 0;
|
||||
|
||||
for ( int j = 0; j < TRUTH_INDECES; j ++ ) {
|
||||
for ( int k = 0; k < CALL_INDECES; k ++ ) {
|
||||
table[j][k] = 0;
|
||||
for( int i = 0; i < calculateNumFrequencyIndeces(poolSize); i ++ ) {
|
||||
tableByHMFrequency[i][j][k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
||||
int truthIndex, callIndex, frequencyIndex;
|
||||
List<Variation> chips = getChips(names, tracker);
|
||||
if ( ref == 'N' || ref == 'n' ) {
|
||||
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
|
||||
truthIndex = NO_TRUTH_DATA; // don't want calls on Ns to factor in to calculation
|
||||
callIndex = NO_CALL; // don't want calls on Ns to factor in to calculation
|
||||
} else if( chips.isEmpty() ) {
|
||||
truthIndex = NO_TRUTH_DATA;
|
||||
if ( eval == null ) {
|
||||
callIndex = NO_CALL;
|
||||
} else {
|
||||
callIndex = ( pooledCallIsRef( eval, ref ) ) ? REF_CALL : VARIANT_CALL_NONHAPMAP;
|
||||
}
|
||||
} else {
|
||||
frequencyIndex = calcVariantFrequencyIndex(ref, chips);
|
||||
|
||||
if ( freqIndexToFrequency( frequencyIndex ) != 0 ) {
|
||||
truthIndex = TRUTH_VAR;
|
||||
} else if ( chips.size() == poolSize ) {
|
||||
truthIndex = TRUTH_REF;
|
||||
} else {
|
||||
truthIndex = TRUTH_UNKNOWN;
|
||||
}
|
||||
|
||||
if ( eval == null ) {
|
||||
callIndex = NO_CALL;
|
||||
} else {
|
||||
if ( pooledCallIsRef( eval, ref ) ) {
|
||||
callIndex = REF_CALL;
|
||||
} else if (truthIndex == TRUTH_REF ) {
|
||||
callIndex = VARIANT_CALL;
|
||||
} else if (truthIndex == TRUTH_UNKNOWN) {
|
||||
callIndex = VARIANT_CALL_UNKNOWN;
|
||||
} else if ( mismatchingCalls( eval, chips, ref ) ) {
|
||||
callIndex = VARIANT_CALL_MISMATCH;
|
||||
} else {
|
||||
callIndex = VARIANT_CALL_MATCH;
|
||||
}
|
||||
}
|
||||
|
||||
tableByHMFrequency[frequencyIndex][truthIndex][callIndex] ++; // note that this updates only on HAPMAP sites. This is good.
|
||||
}
|
||||
|
||||
table[truthIndex][callIndex] ++;
|
||||
}
|
||||
|
||||
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
|
||||
LinkedList<Variation> chips = new LinkedList<Variation>();
|
||||
for ( String name : names ) {
|
||||
Variation chip = (Variation) tracker.lookup(name,null);
|
||||
if ( chip != null ) {
|
||||
chips.add(chip);
|
||||
}
|
||||
}
|
||||
|
||||
return chips;
|
||||
}
|
||||
|
||||
public boolean pooledCallIsRef(Variation eval, char ref) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return eval.getAlternateBases().equalsIgnoreCase((Utils.dupString(ref,2)));
|
||||
}
|
||||
|
||||
public int calculateNumFrequencyIndeces(int poolSize) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return 2*(poolSize+1);
|
||||
}
|
||||
|
||||
public int calcVariantFrequencyIndex( char ref, List<Variation> evals ) {
|
||||
return frequencyToFrequencyIndex( calcVariantFrequency( evals, ref ) );
|
||||
}
|
||||
|
||||
public int frequencyToFrequencyIndex( double frequency ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return (int) frequency;
|
||||
}
|
||||
|
||||
public double calcVariantFrequency( List<Variation> evals, char ref ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
Variation firstEval = evals.get(0);
|
||||
double alternateFrequency = 0.0;
|
||||
for ( Variation eval : evals ) {
|
||||
if ( mismatchingCalls(firstEval, eval, ref) ) {
|
||||
// todo -- make this not a StingException but go to the log
|
||||
throw new StingException("Tri-Allelic Position "+eval.getAlternateBases()+"/"+firstEval.getAlternateBases() + " Ref: "+ ref + " not supported");
|
||||
} else {
|
||||
alternateFrequency += calledVariantFrequency(eval,ref);
|
||||
}
|
||||
}
|
||||
return alternateFrequency;
|
||||
}
|
||||
|
||||
public boolean mismatchingCalls(Variation var1, List<Variation> vars, char ref) {
|
||||
ListIterator varIter = vars.listIterator();
|
||||
try {
|
||||
while( pooledCallIsRef( (Variation) varIter.next(), ref ) ) {
|
||||
// don't do squat!
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
String errMsg = "Comparison data given to mismatchingCalls(Variation, List<Variation>, char) was entirely reference. This is a bug and should never happen.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
return mismatchingCalls( var1, (Variation) varIter.previous(), ref);
|
||||
}
|
||||
|
||||
public boolean mismatchingCalls(Variation eval, Variation chip, char ref) {
|
||||
// eval and chip guaranteed to be non-null
|
||||
char chipF = chip.getAlternateBases().charAt(0);
|
||||
char chipS = chip.getAlternateBases().charAt(1);
|
||||
char evalF = chip.getAlternateBases().charAt(0);
|
||||
char evalS = chip.getAlternateBases().charAt(1);
|
||||
boolean mismatch;
|
||||
if (chipF == ref) {
|
||||
if ( chipS == ref ) {
|
||||
mismatch = false; // no mismatch against hom ref
|
||||
} else {
|
||||
mismatch = ( evalF != chipS && evalS != chipS ); // test against het nonref
|
||||
}
|
||||
} else if ( chipS == ref ) {
|
||||
mismatch = ( evalF != chipF && evalS != chipF ); // test against het nonref
|
||||
} else {
|
||||
if( evalF == ref ) {
|
||||
mismatch = ( evalS != chipF && evalS != chipF ); // test against hom nonref
|
||||
} else if ( evalS == ref ) {
|
||||
mismatch = ( evalF != chipF && evalF != chipF ); // test against hom nonref
|
||||
} else {
|
||||
// both are hom nonref
|
||||
mismatch = ( evalF != chipF );
|
||||
}
|
||||
}
|
||||
return mismatch;
|
||||
}
|
||||
|
||||
public double calledVariantFrequency( Variation var, char ref ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
String varStr = var.getAlternateBases();
|
||||
double freq;
|
||||
if ( varStr.charAt(0) != ref && varStr.charAt(1) != ref ) {
|
||||
freq = (double) 2;
|
||||
} else if ( varStr.charAt(0) == ref && varStr.charAt(1) == ref ) {
|
||||
freq = (double) 0;
|
||||
} else {
|
||||
freq = (double) 1;
|
||||
}
|
||||
|
||||
return freq;
|
||||
}
|
||||
|
||||
public double freqIndexToFrequency( int freqIndex ) {
|
||||
// code broken out for easy alteration when we start using pool-specific variations
|
||||
return (double) freqIndex;
|
||||
}
|
||||
|
||||
public List<String> standardOutput() {
|
||||
LinkedList<String> out = new LinkedList<String>();
|
||||
int nSNPCallSites = 0;
|
||||
int nHapmapSites = 0;
|
||||
int nHapmapSNPSites = 0;
|
||||
int nFullHapmapRefSites = 0;
|
||||
int nUnknownHapmapSites = 0;
|
||||
|
||||
for( int truthIndex = 0; truthIndex < TRUTH_INDECES; truthIndex ++ ) {
|
||||
nSNPCallSites += table[truthIndex][VARIANT_CALL_MATCH] + table[truthIndex][VARIANT_CALL_MISMATCH] + table[truthIndex][VARIANT_CALL_NONHAPMAP];
|
||||
nSNPCallSites += table[truthIndex][VARIANT_CALL_UNKNOWN];
|
||||
}
|
||||
|
||||
for ( int callIndex = 0; callIndex < CALL_INDECES; callIndex ++ ) {
|
||||
nHapmapSites += table[TRUTH_REF][callIndex] + table[TRUTH_VAR][callIndex] + table[TRUTH_UNKNOWN][callIndex];
|
||||
nUnknownHapmapSites += table[TRUTH_UNKNOWN][callIndex];
|
||||
nHapmapSNPSites += table[TRUTH_VAR][callIndex];
|
||||
nFullHapmapRefSites += table[TRUTH_REF][callIndex];
|
||||
}
|
||||
|
||||
int nRefsCalledCorrectly = table[TRUTH_REF][NO_CALL];
|
||||
int nRefsCalledAsSNP = table[TRUTH_REF][VARIANT_CALL];
|
||||
int nSNPsCalledCorrectly = table[TRUTH_VAR][VARIANT_CALL_MATCH];
|
||||
int nSNPsCalledIncorrectly = table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
|
||||
int nSNPsCalledAsRef = table[TRUTH_VAR][REF_CALL];
|
||||
int nSNPsOnHapmapSNP = table[TRUTH_VAR][VARIANT_CALL_MATCH] + table[TRUTH_VAR][VARIANT_CALL_MISMATCH];
|
||||
int nSNPsAtNonHapmap = table[NO_TRUTH_DATA][VARIANT_CALL_NONHAPMAP];
|
||||
int nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals = table[TRUTH_UNKNOWN][VARIANT_CALL_UNKNOWN];
|
||||
|
||||
out.add(String.format("Total Number of SNP Calls: %d", nSNPCallSites));
|
||||
out.add(String.format("Total SNP calls on non-Hapmap sites %d", nSNPsAtNonHapmap));
|
||||
out.add(String.format("Number of Hapmap Sites: %d", nHapmapSites));
|
||||
out.add("Data on Hapmap Reference Sites");
|
||||
out.add(String.format("\tSites where all Hapmap chips were ref: %d", nFullHapmapRefSites));
|
||||
out.add(String.format("\t\tReference sites correctly called: %d (%f)", nRefsCalledCorrectly, ((double)nRefsCalledCorrectly/nFullHapmapRefSites)));
|
||||
out.add(String.format("\t\tReference sites called as variant: %d (%f)", nRefsCalledAsSNP, ((double)nRefsCalledAsSNP/nFullHapmapRefSites)));
|
||||
out.add(String.format("\tSites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
|
||||
out.add(String.format("\t\tPutative reference sites called ref: %d (%f)", table[TRUTH_UNKNOWN][REF_CALL], ((double)table[TRUTH_UNKNOWN][REF_CALL]/nUnknownHapmapSites)));
|
||||
out.add(String.format("\t\tPutative reference sites called SNP: %d (%f)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, ((double)nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals/nUnknownHapmapSites)));
|
||||
out.add("Data on Hapmap Variant Sites");
|
||||
out.add(String.format("\tNumber of Hapmap SNP Sites: %d", nHapmapSNPSites));
|
||||
out.add(String.format("\t\tSNP sites incorrectly called ref: %d (%f)", nSNPsCalledAsRef, ((double)nSNPsCalledAsRef/nHapmapSNPSites)));
|
||||
out.add(String.format("\tSNP calls on Hapmap SNP Sites: %d", nSNPsOnHapmapSNP));
|
||||
out.add(String.format("\t\tSNP sites correctly called SNP: %d (%f)", nSNPsCalledCorrectly, ((double)nSNPsCalledCorrectly/nSNPsOnHapmapSNP)));
|
||||
out.add(String.format("\t\tSNP sites called a different base: %d (%f)", nSNPsCalledIncorrectly, ((double)nSNPsCalledIncorrectly/nSNPsOnHapmapSNP)));
|
||||
out.add(String.format("Calls on reference N: %d", variantCallsAtRefN));
|
||||
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,91 +12,13 @@ import java.io.*;
|
|||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: Ghost
|
||||
* Date: Sep 15, 2009
|
||||
* Time: 6:06:37 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PooledGenotypeConcordanceNew extends BasicVariantAnalysis {
|
||||
|
||||
private String[] hapmapNames;
|
||||
private DebugWriter debug;
|
||||
private HapmapConcordanceTable table;
|
||||
|
||||
public PooledGenotypeConcordanceNew(String pathToPoolFile, String pathToDebugFile, int verbosity) {
|
||||
super("Pooled Genotype Concordance");
|
||||
if( pathToPoolFile == null ) {
|
||||
debug = new DebugWriter();
|
||||
} else {
|
||||
if( pathToDebugFile != null ) {
|
||||
debug = new DebugWriter( pathToDebugFile, verbosity );
|
||||
}
|
||||
generateNameTableFromFile( pathToPoolFile );
|
||||
table = new HapmapConcordanceTable( hapmapNames.length );
|
||||
}
|
||||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
table.updateTable(tracker, ref, hapmapNames, eval, context.getLocation(), debug);
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
return table.standardOutput();
|
||||
}
|
||||
|
||||
// methods for reading pool information from file
|
||||
|
||||
private void generateNameTableFromFile(String file) {
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(file));
|
||||
} catch( FileNotFoundException e) {
|
||||
String errMsg = "Hapmap pool file at "+file+" was not found. Please check filepath.";
|
||||
throw new StingException(errMsg, e);
|
||||
}
|
||||
|
||||
LinkedList<String> nameList = new LinkedList<String>();
|
||||
|
||||
while(continueReading(reader)) {
|
||||
String line = readLine(reader);
|
||||
nameList.add(line);
|
||||
}
|
||||
|
||||
hapmapNames = nameList.toArray(new String[nameList.size()]);
|
||||
}
|
||||
|
||||
private boolean continueReading(BufferedReader reader) {
|
||||
boolean continueReading;
|
||||
try {
|
||||
continueReading = reader.ready();
|
||||
} catch(IOException e) {
|
||||
continueReading = false;
|
||||
}
|
||||
return continueReading;
|
||||
}
|
||||
|
||||
private String readLine(BufferedReader reader) {
|
||||
String line;
|
||||
try {
|
||||
line = reader.readLine();
|
||||
} catch( IOException e) {
|
||||
String errMsg = "BufferedReader pointing to "+reader.toString()+" was declared ready but no line could be read from it.";
|
||||
throw new StingException(errMsg,e);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
}
|
||||
|
||||
class HapmapConcordanceTable {
|
||||
class HapmapConcordanceTableOld {
|
||||
|
||||
private int[][][] truthTableByName;
|
||||
private int[][] poolCallConcordance;
|
||||
private int[][] callsByName;
|
||||
private int[][] truthByName;
|
||||
private int absoluteFalseNegatives;
|
||||
private int absoluteFalsePositives;
|
||||
private int sitesWithFullDataAndNoSNPs;
|
||||
|
||||
private final int REF = 0;
|
||||
|
|
@ -107,14 +29,15 @@ class HapmapConcordanceTable {
|
|||
private final int MATCH = 1;
|
||||
private final int MISMATCH = 2;
|
||||
private final int NO_CALL = 3;
|
||||
private final int DE_NOVO_CALL = 3; //synonym
|
||||
private final int ONE_SNP = 0;
|
||||
private final int MULTI_SNP = 1;
|
||||
private final int DE_NOVO = 2;
|
||||
private final int TRUE_INT = 1;
|
||||
private final int FALSE_INT = 0;
|
||||
|
||||
public HapmapConcordanceTable( int numberOfPeople ) {
|
||||
absoluteFalseNegatives = 0;
|
||||
public HapmapConcordanceTableOld( int numberOfPeople ) {
|
||||
absoluteFalsePositives = 0;
|
||||
sitesWithFullDataAndNoSNPs = 0;
|
||||
truthTableByName = new int[numberOfPeople][4][4];
|
||||
callsByName = new int[numberOfPeople][4];
|
||||
|
|
@ -145,12 +68,12 @@ class HapmapConcordanceTable {
|
|||
int hapmapCoverage = 0;
|
||||
for ( int name = 0; name < names.length; name ++ ) {
|
||||
Variation chip = (Variation) tracker.lookup(names[name], null);
|
||||
debug.printLocusInfo(eval, names[name], ref, chip, loc);
|
||||
SQuad<Integer> variantAndCall = update(eval, name, chip, Character.toUpperCase(ref), debug);
|
||||
nHapmapVariants += variantAndCall.getFirst();
|
||||
nCorrectCalls += variantAndCall.getSecond();
|
||||
hapmapCoverage += variantAndCall.getThird();
|
||||
updateFalseNegatives(hapmapCoverage,nHapmapVariants,variantAndCall.getFourth());
|
||||
updateFalsePositives(hapmapCoverage,nHapmapVariants,variantAndCall.getFourth());
|
||||
debug.printLocusInfo(eval, names[name], ref, chip, loc, variantAndCall.getFourth());
|
||||
}
|
||||
updatePoolCallConcordance(nHapmapVariants,nCorrectCalls);
|
||||
debug.printVerbose(String.format("%s Variants In Pool: %d Correctly Called: %d%n", loc.toString(), nHapmapVariants, nCorrectCalls));
|
||||
|
|
@ -187,12 +110,12 @@ class HapmapConcordanceTable {
|
|||
poolCallConcordance[hapmapAllelesIndex][callConcordanceIndex]++;
|
||||
}
|
||||
|
||||
public void updateFalseNegatives(int coverage, int nHapmapVariants, int callConcordanceIndex) {
|
||||
public void updateFalsePositives(int coverage, int nHapmapVariants, int callConcordanceIndex) {
|
||||
if( coverage == truthByName.length ) {
|
||||
if( nHapmapVariants == 0 ) {
|
||||
if( nHapmapVariants == 0 ) {
|
||||
if( callConcordanceIndex != NO_CALL ) {
|
||||
if (callConcordanceIndex != NOVARIANT ) {
|
||||
absoluteFalseNegatives++;
|
||||
absoluteFalsePositives++;
|
||||
sitesWithFullDataAndNoSNPs++;
|
||||
} else {
|
||||
sitesWithFullDataAndNoSNPs++;
|
||||
|
|
@ -225,7 +148,10 @@ class HapmapConcordanceTable {
|
|||
if( eval == null ) {
|
||||
callIndex = NO_CALL;
|
||||
isCorrectCallAndThereIsVariant = FALSE_INT;
|
||||
} else if ( eval.getAlternateBases().charAt(0) == ref && chip.getAlternateBases().charAt(1) == ref ) {
|
||||
} else if ( chip == null ) {
|
||||
callIndex = DE_NOVO_CALL;
|
||||
isCorrectCallAndThereIsVariant = FALSE_INT;
|
||||
} else if ( eval.getAlternateBases().charAt(0) == ref && eval.getAlternateBases().charAt(1) == ref ) {
|
||||
callIndex = NOVARIANT;
|
||||
isCorrectCallAndThereIsVariant = FALSE_INT;
|
||||
} else if ( callWrongBase(eval, chip, ref, debug) ) {
|
||||
|
|
@ -249,6 +175,7 @@ class HapmapConcordanceTable {
|
|||
// eval and chip guaranteed to be non-null
|
||||
char evalRef;
|
||||
char evalSNP;
|
||||
|
||||
if ( eval.getAlternateBases().charAt(0) == ref ) {
|
||||
evalRef = eval.getAlternateBases().charAt(0);
|
||||
evalSNP = eval.getAlternateBases().charAt(1);
|
||||
|
|
@ -266,14 +193,32 @@ class HapmapConcordanceTable {
|
|||
LinkedList<String> outLines = new LinkedList<String>();
|
||||
int numSingleSNPSites = poolCallConcordance[ONE_SNP][MATCH] + poolCallConcordance[ONE_SNP][MISMATCH];
|
||||
int numMultiSNPSites = poolCallConcordance[MULTI_SNP][MATCH] + poolCallConcordance[MULTI_SNP][MISMATCH];
|
||||
outLines.add(String.format("Sites with variants in One hapmap individual: %d%n", numSingleSNPSites));
|
||||
outLines.add(String.format("\tNumber called correctly: %d (%f)%n",poolCallConcordance[ONE_SNP][MATCH], ((double)poolCallConcordance[ONE_SNP][MATCH]/numSingleSNPSites)));
|
||||
outLines.add(String.format("\tNumber called incorrectly: %d (%f)%n", poolCallConcordance[ONE_SNP][MISMATCH], ((double)poolCallConcordance[ONE_SNP][MISMATCH])/numSingleSNPSites));
|
||||
outLines.add(String.format("Sites with variants in multiple hapmap individuals: %d%n", numMultiSNPSites));
|
||||
outLines.add(String.format("\tNumber called correctly: %d (%f)%n", poolCallConcordance[MULTI_SNP][MATCH], ((double)poolCallConcordance[MULTI_SNP][MISMATCH])/numMultiSNPSites));
|
||||
outLines.add(String.format("\tNumber called incorrectly: %d (%f)%n", poolCallConcordance[MULTI_SNP][MISMATCH], ((double)poolCallConcordance[MULTI_SNP][MISMATCH])/numMultiSNPSites));
|
||||
outLines.add(String.format("Number of called sites with chip data on all individuals: %d%n", sitesWithFullDataAndNoSNPs));
|
||||
outLines.add(String.format("\tNumber incorrectly called SNPs: %d (%f)%n", absoluteFalseNegatives, ((double)absoluteFalseNegatives)/sitesWithFullDataAndNoSNPs));
|
||||
int numSNPSites = numSingleSNPSites + numMultiSNPSites;
|
||||
int numCorrect = poolCallConcordance[ONE_SNP][MATCH] + poolCallConcordance[MULTI_SNP][MATCH];
|
||||
int numIncorrect = poolCallConcordance[ONE_SNP][MISMATCH] + poolCallConcordance[MULTI_SNP][MISMATCH];
|
||||
outLines.add(String.format("Number of Hapmap SNP Sites: %d", numSNPSites));
|
||||
outLines.add(String.format("\tNumber correctly called: %d (%f)", numCorrect, ((double) numCorrect)/numSNPSites));
|
||||
outLines.add(String.format("\tNumber incorrectly called: %d (%f)", numIncorrect, ((double) numIncorrect)/numSNPSites));
|
||||
outLines.add(String.format("\tSites with variants in One hapmap individual: %d", numSingleSNPSites));
|
||||
outLines.add(String.format("\t\tNumber called correctly: %d (%f)",poolCallConcordance[ONE_SNP][MATCH], ((double)poolCallConcordance[ONE_SNP][MATCH]/numSingleSNPSites)));
|
||||
outLines.add(String.format("\t\tNumber called incorrectly: %d (%f)", poolCallConcordance[ONE_SNP][MISMATCH], ((double)poolCallConcordance[ONE_SNP][MISMATCH])/numSingleSNPSites));
|
||||
outLines.add(String.format("\tSites with variants in multiple hapmap individuals: %d", numMultiSNPSites));
|
||||
outLines.add(String.format("\t\tNumber called correctly: %d (%f)", poolCallConcordance[MULTI_SNP][MATCH], ((double)poolCallConcordance[MULTI_SNP][MATCH])/numMultiSNPSites));
|
||||
outLines.add(String.format("\t\tNumber called incorrectly: %d (%f)", poolCallConcordance[MULTI_SNP][MISMATCH], ((double)poolCallConcordance[MULTI_SNP][MISMATCH])/numMultiSNPSites));
|
||||
outLines.add(String.format("\tNumber of called sites with homozygous reference chip data on all individuals: %d", sitesWithFullDataAndNoSNPs));
|
||||
outLines.add(String.format("\t\tNumber incorrectly called SNPs: %d (%f)", absoluteFalsePositives, ((double)absoluteFalsePositives)/sitesWithFullDataAndNoSNPs));
|
||||
return outLines;
|
||||
}
|
||||
|
||||
public List<String> verboseOutput(String[] names) {
|
||||
LinkedList<String> outLines = (LinkedList<String>) this.standardOutput();
|
||||
for( int nameOffset = 0; nameOffset < names.length; nameOffset ++ ) {
|
||||
outLines.add(String.format("Concordance for Hapmap individual %s:", names[nameOffset]));
|
||||
int numSNPSites = truthByName[nameOffset][HET] + truthByName[nameOffset][HOM];
|
||||
int numCallSites = callsByName[nameOffset][MATCH] + callsByName[nameOffset][MISMATCH];
|
||||
outLines.add(String.format("hi %s", "hello"));
|
||||
}
|
||||
|
||||
return outLines;
|
||||
}
|
||||
|
||||
|
|
@ -347,11 +292,11 @@ class DebugWriter {
|
|||
return verbosity;
|
||||
}
|
||||
|
||||
public void printLocusInfo(Variation eval, String name, char ref, Variation chip, GenomeLoc loc) {
|
||||
public void printLocusInfo(Variation eval, String name, char ref, Variation chip, GenomeLoc loc, int callIndex) {
|
||||
if(verbosity == INFO || verbosity == ALL) {
|
||||
String callStr = (eval == null) ? "NoCall" : eval.getAlternateBases();
|
||||
String varStr = (chip == null) ? "NoChip" : chip.getAlternateBases();
|
||||
this.print(String.format("%s %s Call: %s Chip: %s Ref: %s%n", name, loc, callStr, varStr, ref));
|
||||
this.print(String.format("%s %s Call: %s Chip: %s Ref: %s CallIndex: %d%n", name, loc, callStr, varStr, ref, callIndex));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue