Some basic commits that I've been sitting on for a while now:

@ PooledGenotypeConcordance - changes to output, now also reports false-negatives and false-positives as interesting sites. It's been like this in my directory for ages, just never committed.

@NQSExtendedGroupsCovariantWalker - change for formatting.

@NQSTabularDistributionWalker - breaks out the full (window_size)-dimensional empirical error rate distribution by the window. So if you've got a window of size 3, the quality score sequences 22 25 23 and 22 25 24 have their own bins (each of the 40^3 sequences gets one) for match and mismatch counts.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-09-25 19:35:50 +00:00
parent f7684d9e1b
commit fe6d810515
3 changed files with 198 additions and 9 deletions

View File

@ -23,7 +23,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
final int NEIGHBORHOOD_SIZE = 5;
final int MM_OFFSET = 1;
final int COUNT_OFFSET = 0;
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE;
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE+1;
int NQS_GROUPS;
public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f%n";
@ -149,7 +149,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) {
long mm = cumulativeBins[group][MM_OFFSET][qscore];
long ct = cumulativeBins[group][COUNT_OFFSET][qscore];
double mmr = mm/ct;
double mmr = (ct > 0) ? mm/ct : -1;
return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr);
}

View File

@ -0,0 +1,169 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: Sep 24, 2009
* Time: 10:40:29 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Walks loci and accumulates, per combination of binned quality scores in a
 * 7-base window, the empirical match/mismatch counts, then dumps the whole
 * table as tab-separated rows.
 */
public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSDistributionTable> {
    // Half-width of the quality-score neighborhood around each locus.
    final int WINDOW_SIZE_SIDE = 3;
    // Width of each quality-score bin.
    final int QSCORE_BIN_SIZE = 4;
    final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n";
    final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
    int qBins;
    int WINDOW_SIZE;

    /** Derive the bin count (two extra bins for off-read positions) and the full window width. */
    public void initialize() {
        double maxReasonableQ = (double) QualityUtils.MAX_REASONABLE_Q_SCORE;
        qBins = 2 + (int) Math.ceil(maxReasonableQ / QSCORE_BIN_SIZE);
        WINDOW_SIZE = 2 * WINDOW_SIZE_SIDE + 1;
    }

    /** A fresh, empty distribution table to accumulate into. */
    public NQSDistributionTable reduceInit() {
        return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
    }

    /** Bundle the per-locus pileup, reference, and metadata for reduce(). */
    public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        return new LocalMapType(context, ref, tracker);
    }

    /** Fold every read covering this locus into the running table. */
    public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) {
        int nReads = curMap.numReads();
        for ( int readIdx = 0; readIdx < nReads; readIdx++ ) {
            prevReduce.update( curMap.context.getReads().get(readIdx),
                               curMap.context.getOffsets().get(readIdx),
                               WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref );
        }
        return prevReduce;
    }

    /**
     * Print the header, then one row per (q1..q7) bin combination, iterating
     * lexicographically with the last coordinate varying fastest.
     */
    public void onTraversalDone(NQSDistributionTable finalTable) {
        out.print( makeHeader() );
        for ( int q1 = 0; q1 < qBins; q1++ ) {
            for ( int q2 = 0; q2 < qBins; q2++ ) {
                for ( int q3 = 0; q3 < qBins; q3++ ) {
                    for ( int q4 = 0; q4 < qBins; q4++ ) {
                        for ( int q5 = 0; q5 < qBins; q5++ ) {
                            for ( int q6 = 0; q6 < qBins; q6++ ) {
                                for ( int q7 = 0; q7 < qBins; q7++ ) {
                                    Pair<Integer,Integer> counts = finalTable.getPair(q1,q2,q3,q4,q5,q6,q7);
                                    out.print( formatOutput(q1,q2,q3,q4,q5,q6,q7,counts.first,counts.second) );
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /** Column names for the output table. */
    public String makeHeader() {
        return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match");
    }

    /** One tab-separated data row. */
    public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) {
        return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i);
    }
}
/**
 * Dense qBins^7 x 2 table of match/mismatch counts, keyed by the binned
 * quality scores of the 7-base window (3 either side) around each locus.
 * The top two bin values are sentinels for window positions that hang off
 * the start or end of the read.
 */
class NQSDistributionTable {
    /** Innermost-dimension index of the mismatch counter. */
    public final int MM_OFFSET = 0;
    /** Innermost-dimension index of the match counter. */
    public final int MATCH_OFFSET = 1;
    // Hard-coded to seven quality dimensions plus the match/mismatch dimension.
    protected int [][][][][][][][] table;
    // Sentinel BIN indices (not raw qualities) for off-read window positions.
    protected int OFF_END_OFFSET;
    protected int OFF_START_OFFSET;

    /**
     * @param winSize full window width; must equal 7, the table's hard-coded rank
     * @param qBins   quality bins per window position, including the two off-read sentinels
     * @param qStep   quality-score bin width (unused here; kept for interface compatibility)
     */
    public NQSDistributionTable (int winSize, int qBins, int qStep) {
        // BUG FIX: the original read table.length before table was allocated,
        // a guaranteed NullPointerException on every construction. Allocate
        // first, then validate the requested window size against the table's
        // hard-coded seven quality dimensions.
        table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
        if ( winSize != 7 ) {
            throw new StingException("Size posited in tabular distribution is not the size of the distribution table");
        }
        // Java zero-initializes new arrays, so the original's seven nested
        // zeroing loops were redundant and have been dropped.
        OFF_END_OFFSET = qBins-2;
        OFF_START_OFFSET = qBins-1;
    }

    /**
     * Record one observation: bin the 7 qualities centered on offset and bump
     * the match or mismatch counter for that cell.
     */
    public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) {
        // note: there are two extra bins, one for "No quality: off the end of read"
        // and one for "No quality: off the beginning of read"
        List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
        ListIterator<Integer> o = coordinates.listIterator();
        // BUG FIX: the original drew 8 values from the coordinate list, which
        // holds only the 7 window bins (NoSuchElementException), and never used
        // 'ref'. The final index is the match/mismatch slot, decided by
        // comparing the read base at this offset to the reference base.
        // NOTE(review): assumes ReferenceContext.getBase() returns the
        // reference base as a char -- confirm against the GATK API.
        char readBase = Character.toUpperCase( (char) read.getReadBases()[offset] );
        char refBase = Character.toUpperCase( ref.getBase() );
        int matchIdx = ( readBase == refBase ) ? MATCH_OFFSET : MM_OFFSET;
        // Java evaluates array-index expressions left to right, so the
        // iterator consumption order below is well-defined.
        table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][matchIdx] ++;
    }

    /**
     * Quality-score bin for each of the 2*win+1 positions centered on offset.
     */
    public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
        LinkedList<Integer> coords = new LinkedList<Integer>();
        // BUG FIX: the original loop excluded offset+win, yielding only 2*win
        // coordinates for a (2*win+1)-wide window.
        for ( int i = offset - win; i <= offset + win ; i ++ ) {
            if ( i < 0 || i >= read.getReadLength() ) {
                // BUG FIX: off-read sentinels are already BIN indices; the
                // original divided them by binSize, collapsing them into
                // ordinary quality bins and defeating the two extra bins.
                coords.add(getQScoreAsInt(read,i));
            } else {
                coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
            }
        }
        return coords;
    }

    /**
     * Quality at the given read offset, or an orientation-aware sentinel bin
     * index when the offset falls off either end of the read.
     */
    public int getQScoreAsInt( SAMRecord read, int offset ) {
        int qscore;
        if ( offset < 0 ) {
            qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
        } else if ( offset >= read.getReadLength() ) {
            // BUG FIX: the original used '>' here, so offset == readLength
            // indexed one past the end of the quality array.
            qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
        } else {
            qscore = (int) read.getBaseQualities()[offset];
        }
        return qscore;
    }

    /** Map a raw quality score to its bin index. */
    public int getQscoreBin( int qscore, int binSize ) {
        return (int) Math.floor(((double)qscore)/binSize);
    }

    /** (mismatch, match) counts for one cell of the table. */
    public Pair<Integer,Integer> getPair( int a, int b, int c, int d, int e, int f, int g ) {
        int mm = table[a][b][c][d][e][f][g][MM_OFFSET];
        int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET];
        return new Pair<Integer,Integer>(mm,ma);
    }
}

View File

@ -23,7 +23,7 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
private String[] hapmapNames;
public PooledGenotypeConcordance( String pathToPoolFile ) {
super("Pooled Genotype Concordance");
super("Pooled_Genotype_Concordance");
if( pathToPoolFile == null ) {
table = null;
hapmapNames = null;
@ -34,8 +34,12 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
}
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
table.updateTable(tracker, ref, eval, hapmapNames);
return null;
String s = table.updateTable(tracker, ref, eval, hapmapNames);
if ( s == null ) {
return null;
} else {
return s + " " + context.getLocation().toString();
}
}
public List<String> done() {
@ -122,8 +126,8 @@ class PooledConcordanceTable {
}
}
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
int truthIndex, callIndex, frequencyIndex;
public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
int truthIndex, callIndex, frequencyIndex = -1;
List<Variation> chips = getChips(names, tracker);
if ( ref == 'N' || ref == 'n' ) {
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
@ -167,6 +171,20 @@ class PooledConcordanceTable {
}
table[truthIndex][callIndex] ++;
String interest;
if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
} else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
interest = "False_Positive";
} else if ( callIndex == VARIANT_CALL_MISMATCH ) {
interest = "Mismatching_call";
} else {
interest = null;
}
return interest;
}
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
@ -315,11 +333,11 @@ class PooledConcordanceTable {
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
out.add(String.format("| \t\t- Reference sites not confidently called:\t%d\t(%f)", nHapmapRefsNotCalled, ((double)nHapmapRefsNotCalled)/nFullHapmapRefSites));
out.add(String.format("| \t\t- Reference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites not confidently called:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
out.add("| Data on Hapmap Variant Sites");
out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
@ -355,6 +373,8 @@ class PooledConcordanceTable {
}
public int getLargestOutputAlleleFrequencyIndex() {
// TODO -- this code may be bugged -- is the first index at which no hapmap sites appear
// TODO -- also the index above which no hapmap sites are guaranteed to appear with said frequency
int nHapmapSitesAtFreq = 1;
int freqIndex = -1;
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {