Some basic commits that I've been sitting on for a while now:
@ PooledGenotypeConcordance - changes to output, now also reports false-negatives and false-positives as interesting sites. It's been like this in my directory for ages, just never committed. @NQSExtendedGroupsCovariantWalker - change for formatting. @NQSTabularDistributionWalker - breaks out the full (window_size)-dimensional empirical error rate distribution by the window. So if you've got a window of size 3, the quality score sequences 22 25 23 and 22 25 24 have their own bins (each of the 40^3 sequences gets one) for match and mismatch counts. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7684d9e1b
commit
fe6d810515
|
|
@ -23,7 +23,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
|
|||
final int NEIGHBORHOOD_SIZE = 5;
|
||||
final int MM_OFFSET = 1;
|
||||
final int COUNT_OFFSET = 0;
|
||||
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE+1;
|
||||
int NQS_GROUPS;
|
||||
|
||||
public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f%n";
|
||||
|
|
@ -149,7 +149,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
|
|||
public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) {
|
||||
long mm = cumulativeBins[group][MM_OFFSET][qscore];
|
||||
long ct = cumulativeBins[group][COUNT_OFFSET][qscore];
|
||||
double mmr = mm/ct;
|
||||
double mmr = (ct > 0) ? mm/ct : -1;
|
||||
|
||||
return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,169 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: Ghost
|
||||
* Date: Sep 24, 2009
|
||||
* Time: 10:40:29 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSDistributionTable> {
|
||||
|
||||
final int WINDOW_SIZE_SIDE = 3;
|
||||
final int QSCORE_BIN_SIZE = 4;
|
||||
final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n";
|
||||
final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
|
||||
|
||||
int qBins;
|
||||
int WINDOW_SIZE;
|
||||
|
||||
public void initialize() {
|
||||
qBins = 2 + (int) Math.ceil(((double)QualityUtils.MAX_REASONABLE_Q_SCORE)/QSCORE_BIN_SIZE);
|
||||
WINDOW_SIZE = 2*WINDOW_SIZE_SIDE+1;
|
||||
}
|
||||
|
||||
public NQSDistributionTable reduceInit() {
|
||||
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
|
||||
}
|
||||
|
||||
public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
return new LocalMapType(context, ref, tracker);
|
||||
}
|
||||
|
||||
public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) {
|
||||
// iterate through reads
|
||||
for ( int i = 0; i < curMap.numReads(); i ++ ) {
|
||||
prevReduce.update(curMap.context.getReads().get(i), curMap.context.getOffsets().get(i), WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref);
|
||||
}
|
||||
|
||||
return prevReduce;
|
||||
}
|
||||
|
||||
public void onTraversalDone(NQSDistributionTable finalTable) {
|
||||
out.print( makeHeader() );
|
||||
for ( int a = 0; a < qBins; a ++ ) {
|
||||
for ( int b = 0; b < qBins; b ++ ) {
|
||||
for ( int c = 0; c < qBins; c ++ ) {
|
||||
for ( int d = 0; d < qBins; d ++ ) {
|
||||
for ( int e = 0; e < qBins; e ++ ) {
|
||||
for ( int f = 0; f < qBins; f ++ ) {
|
||||
for ( int g = 0; g < qBins; g ++ ) {
|
||||
Pair<Integer,Integer> mm = finalTable.getPair(a,b,c,d,e,f,g);
|
||||
out.print( formatOutput(a,b,c,d,e,f,g,mm.first,mm.second) );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String makeHeader() {
|
||||
return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match");
|
||||
}
|
||||
|
||||
public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) {
|
||||
return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class NQSDistributionTable {
|
||||
|
||||
public final int MM_OFFSET = 0;
|
||||
public final int MATCH_OFFSET = 1;
|
||||
|
||||
protected int [][][][][][][][] table;
|
||||
|
||||
protected int OFF_END_OFFSET;
|
||||
protected int OFF_START_OFFSET;
|
||||
|
||||
public NQSDistributionTable (int winSize, int qBins, int qStep) {
|
||||
if ( table.length != winSize ) {
|
||||
throw new StingException("Size positied in tabular distribution is not the size of the distribution table");
|
||||
}
|
||||
|
||||
table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
|
||||
// yes, this is absolutely positively horrendous brute-force code.
|
||||
// ... Knuth would be proud. These could be lists for a decrease
|
||||
// in memory but a slowdown in building the distribution.
|
||||
for ( int a = 0; a < qBins; a ++ ) {
|
||||
for ( int b = 0; b < qBins; b ++ ) {
|
||||
for ( int c = 0; c < qBins; c ++ ) {
|
||||
for ( int d = 0; d < qBins; d ++ ) {
|
||||
for ( int e = 0; e < qBins; e ++ ) {
|
||||
for ( int f = 0; f < qBins; f ++ ) {
|
||||
for ( int g = 0; g < qBins; g ++ ) {
|
||||
for ( int h = 0; h < 2; h ++ ) {
|
||||
table[a][b][c][d][e][f][g][h] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OFF_END_OFFSET = qBins-2;
|
||||
OFF_START_OFFSET = qBins-1;
|
||||
}
|
||||
|
||||
public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) {
|
||||
// note: there are two extra bins, one for "No quality: off the end of read"
|
||||
// and one for "No quality: off the beginning of read"
|
||||
List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
|
||||
ListIterator<Integer> o = coordinates.listIterator();
|
||||
table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++;
|
||||
// i'm sure i could use recursion for this but my brain is too tired after swimming to figure
|
||||
// out, in 20 seconds, how to do it
|
||||
}
|
||||
|
||||
public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
|
||||
LinkedList<Integer> coords = new LinkedList<Integer>();
|
||||
for ( int i = offset - win; i < offset + win ; i ++ ) {
|
||||
coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
|
||||
}
|
||||
return coords;
|
||||
}
|
||||
|
||||
public int getQScoreAsInt( SAMRecord read, int offset ) {
|
||||
int qscore;
|
||||
if ( offset < 0 ) {
|
||||
qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
|
||||
} else if ( offset > read.getReadLength() ) {
|
||||
qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
|
||||
} else {
|
||||
qscore = (int) read.getBaseQualities()[offset];
|
||||
}
|
||||
|
||||
return qscore;
|
||||
}
|
||||
|
||||
public int getQscoreBin( int qscore, int binSize ) {
|
||||
return (int) Math.floor(((double)qscore)/binSize);
|
||||
}
|
||||
|
||||
public Pair<Integer,Integer> getPair( int a, int b, int c, int d, int e, int f, int g ) {
|
||||
int mm = table[a][b][c][d][e][f][g][MM_OFFSET];
|
||||
int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET];
|
||||
|
||||
return new Pair<Integer,Integer>(mm,ma);
|
||||
}
|
||||
}
|
||||
|
|
@ -23,7 +23,7 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
|||
private String[] hapmapNames;
|
||||
|
||||
public PooledGenotypeConcordance( String pathToPoolFile ) {
|
||||
super("Pooled Genotype Concordance");
|
||||
super("Pooled_Genotype_Concordance");
|
||||
if( pathToPoolFile == null ) {
|
||||
table = null;
|
||||
hapmapNames = null;
|
||||
|
|
@ -34,8 +34,12 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
|||
}
|
||||
|
||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
table.updateTable(tracker, ref, eval, hapmapNames);
|
||||
return null;
|
||||
String s = table.updateTable(tracker, ref, eval, hapmapNames);
|
||||
if ( s == null ) {
|
||||
return null;
|
||||
} else {
|
||||
return s + " " + context.getLocation().toString();
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
|
|
@ -122,8 +126,8 @@ class PooledConcordanceTable {
|
|||
}
|
||||
}
|
||||
|
||||
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
||||
int truthIndex, callIndex, frequencyIndex;
|
||||
public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
||||
int truthIndex, callIndex, frequencyIndex = -1;
|
||||
List<Variation> chips = getChips(names, tracker);
|
||||
if ( ref == 'N' || ref == 'n' ) {
|
||||
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
|
||||
|
|
@ -167,6 +171,20 @@ class PooledConcordanceTable {
|
|||
}
|
||||
|
||||
table[truthIndex][callIndex] ++;
|
||||
|
||||
String interest;
|
||||
|
||||
if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
|
||||
interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
|
||||
} else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
|
||||
interest = "False_Positive";
|
||||
} else if ( callIndex == VARIANT_CALL_MISMATCH ) {
|
||||
interest = "Mismatching_call";
|
||||
} else {
|
||||
interest = null;
|
||||
}
|
||||
|
||||
return interest;
|
||||
}
|
||||
|
||||
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
|
||||
|
|
@ -315,11 +333,11 @@ class PooledConcordanceTable {
|
|||
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
|
||||
out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
|
||||
out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
|
||||
out.add(String.format("| \t\t- Reference sites not confidently called:\t%d\t(%f)", nHapmapRefsNotCalled, ((double)nHapmapRefsNotCalled)/nFullHapmapRefSites));
|
||||
out.add(String.format("| \t\t- Reference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
|
||||
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
|
||||
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
|
||||
out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
|
||||
out.add(String.format("| \t\t- Putative reference sites not confidently called:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
|
||||
out.add(String.format("| \t\t- Putative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
|
||||
out.add("| Data on Hapmap Variant Sites");
|
||||
out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
|
||||
out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
|
||||
|
|
@ -355,6 +373,8 @@ class PooledConcordanceTable {
|
|||
}
|
||||
|
||||
public int getLargestOutputAlleleFrequencyIndex() {
|
||||
// TODO -- this code may be bugged -- is the first index at which no hapmap sites appear
|
||||
// TODO -- also the index above which no hapmap sites are guaranteed to appear with said frequency
|
||||
int nHapmapSitesAtFreq = 1;
|
||||
int freqIndex = -1;
|
||||
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue