Some basic commits that I've been sitting on for a while now:
@ PooledGenotypeConcordance - changes to output, now also reports false-negatives and false-positives as interesting sites. It's been like this in my directory for ages, just never committed. @NQSExtendedGroupsCovariantWalker - change for formatting. @NQSTabularDistributionWalker - breaks out the full (window_size)-dimensional empirical error rate distribution by the window. So if you've got a window of size 3; the quality score sequences 22 25 23 and 22 25 24 have their own bins (each of the 40^3 sequences get one) for match and mismatch counts. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7684d9e1b
commit
fe6d810515
|
|
@ -23,7 +23,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
|
||||||
final int NEIGHBORHOOD_SIZE = 5;
|
final int NEIGHBORHOOD_SIZE = 5;
|
||||||
final int MM_OFFSET = 1;
|
final int MM_OFFSET = 1;
|
||||||
final int COUNT_OFFSET = 0;
|
final int COUNT_OFFSET = 0;
|
||||||
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE+1;
|
||||||
int NQS_GROUPS;
|
int NQS_GROUPS;
|
||||||
|
|
||||||
public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f%n";
|
public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f%n";
|
||||||
|
|
@ -149,7 +149,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
|
||||||
public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) {
|
public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) {
|
||||||
long mm = cumulativeBins[group][MM_OFFSET][qscore];
|
long mm = cumulativeBins[group][MM_OFFSET][qscore];
|
||||||
long ct = cumulativeBins[group][COUNT_OFFSET][qscore];
|
long ct = cumulativeBins[group][COUNT_OFFSET][qscore];
|
||||||
double mmr = mm/ct;
|
double mmr = (ct > 0) ? mm/ct : -1;
|
||||||
|
|
||||||
return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr);
|
return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,169 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: Ghost
|
||||||
|
* Date: Sep 24, 2009
|
||||||
|
* Time: 10:40:29 AM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSDistributionTable> {
|
||||||
|
|
||||||
|
final int WINDOW_SIZE_SIDE = 3;
|
||||||
|
final int QSCORE_BIN_SIZE = 4;
|
||||||
|
final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n";
|
||||||
|
final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
|
||||||
|
|
||||||
|
int qBins;
|
||||||
|
int WINDOW_SIZE;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
qBins = 2 + (int) Math.ceil(((double)QualityUtils.MAX_REASONABLE_Q_SCORE)/QSCORE_BIN_SIZE);
|
||||||
|
WINDOW_SIZE = 2*WINDOW_SIZE_SIDE+1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public NQSDistributionTable reduceInit() {
|
||||||
|
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
return new LocalMapType(context, ref, tracker);
|
||||||
|
}
|
||||||
|
|
||||||
|
public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) {
|
||||||
|
// iterate through reads
|
||||||
|
for ( int i = 0; i < curMap.numReads(); i ++ ) {
|
||||||
|
prevReduce.update(curMap.context.getReads().get(i), curMap.context.getOffsets().get(i), WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref);
|
||||||
|
}
|
||||||
|
|
||||||
|
return prevReduce;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(NQSDistributionTable finalTable) {
|
||||||
|
out.print( makeHeader() );
|
||||||
|
for ( int a = 0; a < qBins; a ++ ) {
|
||||||
|
for ( int b = 0; b < qBins; b ++ ) {
|
||||||
|
for ( int c = 0; c < qBins; c ++ ) {
|
||||||
|
for ( int d = 0; d < qBins; d ++ ) {
|
||||||
|
for ( int e = 0; e < qBins; e ++ ) {
|
||||||
|
for ( int f = 0; f < qBins; f ++ ) {
|
||||||
|
for ( int g = 0; g < qBins; g ++ ) {
|
||||||
|
Pair<Integer,Integer> mm = finalTable.getPair(a,b,c,d,e,f,g);
|
||||||
|
out.print( formatOutput(a,b,c,d,e,f,g,mm.first,mm.second) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String makeHeader() {
|
||||||
|
return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) {
|
||||||
|
return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class NQSDistributionTable {
|
||||||
|
|
||||||
|
public final int MM_OFFSET = 0;
|
||||||
|
public final int MATCH_OFFSET = 1;
|
||||||
|
|
||||||
|
protected int [][][][][][][][] table;
|
||||||
|
|
||||||
|
protected int OFF_END_OFFSET;
|
||||||
|
protected int OFF_START_OFFSET;
|
||||||
|
|
||||||
|
public NQSDistributionTable (int winSize, int qBins, int qStep) {
|
||||||
|
if ( table.length != winSize ) {
|
||||||
|
throw new StingException("Size positied in tabular distribution is not the size of the distribution table");
|
||||||
|
}
|
||||||
|
|
||||||
|
table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
|
||||||
|
// yes, this is absolutely positively horrendous brute-force code.
|
||||||
|
// ... Knuth would be proud. These could be lists for a decrease
|
||||||
|
// in memory but a slowdown in building the distribution.
|
||||||
|
for ( int a = 0; a < qBins; a ++ ) {
|
||||||
|
for ( int b = 0; b < qBins; b ++ ) {
|
||||||
|
for ( int c = 0; c < qBins; c ++ ) {
|
||||||
|
for ( int d = 0; d < qBins; d ++ ) {
|
||||||
|
for ( int e = 0; e < qBins; e ++ ) {
|
||||||
|
for ( int f = 0; f < qBins; f ++ ) {
|
||||||
|
for ( int g = 0; g < qBins; g ++ ) {
|
||||||
|
for ( int h = 0; h < 2; h ++ ) {
|
||||||
|
table[a][b][c][d][e][f][g][h] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
OFF_END_OFFSET = qBins-2;
|
||||||
|
OFF_START_OFFSET = qBins-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) {
|
||||||
|
// note: there are two extra bins, one for "No quality: off the end of read"
|
||||||
|
// and one for "No quality: off the beginning of read"
|
||||||
|
List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
|
||||||
|
ListIterator<Integer> o = coordinates.listIterator();
|
||||||
|
table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++;
|
||||||
|
// i'm sure i could use recursion for this but my brain is too tired after swimming to figure
|
||||||
|
// out, in 20 seconds, how to do it
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
|
||||||
|
LinkedList<Integer> coords = new LinkedList<Integer>();
|
||||||
|
for ( int i = offset - win; i < offset + win ; i ++ ) {
|
||||||
|
coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
|
||||||
|
}
|
||||||
|
return coords;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getQScoreAsInt( SAMRecord read, int offset ) {
|
||||||
|
int qscore;
|
||||||
|
if ( offset < 0 ) {
|
||||||
|
qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
|
||||||
|
} else if ( offset > read.getReadLength() ) {
|
||||||
|
qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
|
||||||
|
} else {
|
||||||
|
qscore = (int) read.getBaseQualities()[offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
return qscore;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getQscoreBin( int qscore, int binSize ) {
|
||||||
|
return (int) Math.floor(((double)qscore)/binSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<Integer,Integer> getPair( int a, int b, int c, int d, int e, int f, int g ) {
|
||||||
|
int mm = table[a][b][c][d][e][f][g][MM_OFFSET];
|
||||||
|
int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET];
|
||||||
|
|
||||||
|
return new Pair<Integer,Integer>(mm,ma);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -23,7 +23,7 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
||||||
private String[] hapmapNames;
|
private String[] hapmapNames;
|
||||||
|
|
||||||
public PooledGenotypeConcordance( String pathToPoolFile ) {
|
public PooledGenotypeConcordance( String pathToPoolFile ) {
|
||||||
super("Pooled Genotype Concordance");
|
super("Pooled_Genotype_Concordance");
|
||||||
if( pathToPoolFile == null ) {
|
if( pathToPoolFile == null ) {
|
||||||
table = null;
|
table = null;
|
||||||
hapmapNames = null;
|
hapmapNames = null;
|
||||||
|
|
@ -34,8 +34,12 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
|
||||||
}
|
}
|
||||||
|
|
||||||
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||||
table.updateTable(tracker, ref, eval, hapmapNames);
|
String s = table.updateTable(tracker, ref, eval, hapmapNames);
|
||||||
return null;
|
if ( s == null ) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
return s + " " + context.getLocation().toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> done() {
|
public List<String> done() {
|
||||||
|
|
@ -122,8 +126,8 @@ class PooledConcordanceTable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
|
||||||
int truthIndex, callIndex, frequencyIndex;
|
int truthIndex, callIndex, frequencyIndex = -1;
|
||||||
List<Variation> chips = getChips(names, tracker);
|
List<Variation> chips = getChips(names, tracker);
|
||||||
if ( ref == 'N' || ref == 'n' ) {
|
if ( ref == 'N' || ref == 'n' ) {
|
||||||
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
|
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
|
||||||
|
|
@ -167,6 +171,20 @@ class PooledConcordanceTable {
|
||||||
}
|
}
|
||||||
|
|
||||||
table[truthIndex][callIndex] ++;
|
table[truthIndex][callIndex] ++;
|
||||||
|
|
||||||
|
String interest;
|
||||||
|
|
||||||
|
if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
|
||||||
|
interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
|
||||||
|
} else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
|
||||||
|
interest = "False_Positive";
|
||||||
|
} else if ( callIndex == VARIANT_CALL_MISMATCH ) {
|
||||||
|
interest = "Mismatching_call";
|
||||||
|
} else {
|
||||||
|
interest = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return interest;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
|
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
|
||||||
|
|
@ -315,11 +333,11 @@ class PooledConcordanceTable {
|
||||||
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
|
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
|
||||||
out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
|
out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
|
||||||
out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
|
out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
|
||||||
out.add(String.format("| \t\t- Reference sites not confidently called:\t%d\t(%f)", nHapmapRefsNotCalled, ((double)nHapmapRefsNotCalled)/nFullHapmapRefSites));
|
out.add(String.format("| \t\t- Reference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
|
||||||
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
|
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
|
||||||
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
|
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
|
||||||
out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
|
out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
|
||||||
out.add(String.format("| \t\t- Putative reference sites not confidently called:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
|
out.add(String.format("| \t\t- Putative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
|
||||||
out.add("| Data on Hapmap Variant Sites");
|
out.add("| Data on Hapmap Variant Sites");
|
||||||
out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
|
out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
|
||||||
out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
|
out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
|
||||||
|
|
@ -355,6 +373,8 @@ class PooledConcordanceTable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getLargestOutputAlleleFrequencyIndex() {
|
public int getLargestOutputAlleleFrequencyIndex() {
|
||||||
|
// TODO -- this code may be bugged -- is the first index at which no hapmap sites appear
|
||||||
|
// TODO -- also the index above which no hapmap sites are guaranteed to appear with said frequency
|
||||||
int nHapmapSitesAtFreq = 1;
|
int nHapmapSitesAtFreq = 1;
|
||||||
int freqIndex = -1;
|
int freqIndex = -1;
|
||||||
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {
|
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue