Some basic commits that I've been sitting on for a while now:

@ PooledGenotypeConcordance - changes to output, now also reports false-negatives and false-positives as interesting sites. It's been like this in my directory for ages, just never committed.

@NQSExtendedGroupsCovariantWalker - change for formatting.

@NQSTabularDistributionWalker - breaks out the full (window_size)-dimensional empirical error rate distribution by the window. So if you've got a window of size 3, the quality score sequences 22 25 23 and 22 25 24 have their own bins (each of the 40^3 sequences gets one) for match and mismatch counts.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1730 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-09-25 19:35:50 +00:00
parent f7684d9e1b
commit fe6d810515
3 changed files with 198 additions and 9 deletions

View File

@ -23,7 +23,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
final int NEIGHBORHOOD_SIZE = 5;
final int MM_OFFSET = 1;
final int COUNT_OFFSET = 0;
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE;
final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE+1;
int NQS_GROUPS;
public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f%n";
@ -149,7 +149,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker<LocalMapType,
public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) {
long mm = cumulativeBins[group][MM_OFFSET][qscore];
long ct = cumulativeBins[group][COUNT_OFFSET][qscore];
double mmr = mm/ct;
double mmr = (ct > 0) ? mm/ct : -1;
return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr);
}

View File

@ -0,0 +1,169 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Pair;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: Ghost
* Date: Sep 24, 2009
* Time: 10:40:29 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Walks loci and accumulates, per combination of binned quality scores in a
 * 7-base window, the empirical match/mismatch counts, then dumps the whole
 * table as tab-separated rows.
 */
public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSDistributionTable> {
    // Half-width of the quality-score neighborhood around each locus.
    final int WINDOW_SIZE_SIDE = 3;
    // Width of each quality-score bin.
    final int QSCORE_BIN_SIZE = 4;
    final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n";
    final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
    int qBins;
    int WINDOW_SIZE;

    /** Derive the bin count (two extra bins for off-read positions) and the full window width. */
    public void initialize() {
        double maxReasonableQ = (double) QualityUtils.MAX_REASONABLE_Q_SCORE;
        qBins = 2 + (int) Math.ceil(maxReasonableQ / QSCORE_BIN_SIZE);
        WINDOW_SIZE = 2 * WINDOW_SIZE_SIDE + 1;
    }

    /** A fresh, empty distribution table to accumulate into. */
    public NQSDistributionTable reduceInit() {
        return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
    }

    /** Bundle the per-locus pileup, reference, and metadata for reduce(). */
    public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        return new LocalMapType(context, ref, tracker);
    }

    /** Fold every read covering this locus into the running table. */
    public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) {
        int nReads = curMap.numReads();
        for ( int readIdx = 0; readIdx < nReads; readIdx++ ) {
            prevReduce.update( curMap.context.getReads().get(readIdx),
                               curMap.context.getOffsets().get(readIdx),
                               WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref );
        }
        return prevReduce;
    }

    /**
     * Print the header, then one row per (q1..q7) bin combination, iterating
     * lexicographically with the last coordinate varying fastest.
     */
    public void onTraversalDone(NQSDistributionTable finalTable) {
        out.print( makeHeader() );
        for ( int q1 = 0; q1 < qBins; q1++ ) {
            for ( int q2 = 0; q2 < qBins; q2++ ) {
                for ( int q3 = 0; q3 < qBins; q3++ ) {
                    for ( int q4 = 0; q4 < qBins; q4++ ) {
                        for ( int q5 = 0; q5 < qBins; q5++ ) {
                            for ( int q6 = 0; q6 < qBins; q6++ ) {
                                for ( int q7 = 0; q7 < qBins; q7++ ) {
                                    Pair<Integer,Integer> counts = finalTable.getPair(q1,q2,q3,q4,q5,q6,q7);
                                    out.print( formatOutput(q1,q2,q3,q4,q5,q6,q7,counts.first,counts.second) );
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    /** Column names for the output table. */
    public String makeHeader() {
        return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match");
    }

    /** One tab-separated data row. */
    public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) {
        return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i);
    }
}
/**
 * Dense qBins^7 x 2 table of match/mismatch counts, keyed by the binned
 * quality scores of the 7-base window (3 either side) around each locus.
 * The top two bin values are sentinels for window positions that hang off
 * the start or end of the read.
 */
class NQSDistributionTable {
    /** Innermost-dimension index of the mismatch counter. */
    public final int MM_OFFSET = 0;
    /** Innermost-dimension index of the match counter. */
    public final int MATCH_OFFSET = 1;
    // Hard-coded to seven quality dimensions plus the match/mismatch dimension.
    protected int [][][][][][][][] table;
    // Sentinel BIN indices (not raw qualities) for off-read window positions.
    protected int OFF_END_OFFSET;
    protected int OFF_START_OFFSET;

    /**
     * @param winSize full window width; must equal 7, the table's hard-coded rank
     * @param qBins   quality bins per window position, including the two off-read sentinels
     * @param qStep   quality-score bin width (unused here; kept for interface compatibility)
     */
    public NQSDistributionTable (int winSize, int qBins, int qStep) {
        // BUG FIX: the original read table.length before table was allocated,
        // a guaranteed NullPointerException on every construction. Allocate
        // first, then validate the requested window size against the table's
        // hard-coded seven quality dimensions.
        table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
        if ( winSize != 7 ) {
            throw new StingException("Size posited in tabular distribution is not the size of the distribution table");
        }
        // Java zero-initializes new arrays, so the original's seven nested
        // zeroing loops were redundant and have been dropped.
        OFF_END_OFFSET = qBins-2;
        OFF_START_OFFSET = qBins-1;
    }

    /**
     * Record one observation: bin the 7 qualities centered on offset and bump
     * the match or mismatch counter for that cell.
     */
    public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) {
        // note: there are two extra bins, one for "No quality: off the end of read"
        // and one for "No quality: off the beginning of read"
        List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
        ListIterator<Integer> o = coordinates.listIterator();
        // BUG FIX: the original drew 8 values from the coordinate list, which
        // holds only the 7 window bins (NoSuchElementException), and never used
        // 'ref'. The final index is the match/mismatch slot, decided by
        // comparing the read base at this offset to the reference base.
        // NOTE(review): assumes ReferenceContext.getBase() returns the
        // reference base as a char -- confirm against the GATK API.
        char readBase = Character.toUpperCase( (char) read.getReadBases()[offset] );
        char refBase = Character.toUpperCase( ref.getBase() );
        int matchIdx = ( readBase == refBase ) ? MATCH_OFFSET : MM_OFFSET;
        // Java evaluates array-index expressions left to right, so the
        // iterator consumption order below is well-defined.
        table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][matchIdx] ++;
    }

    /**
     * Quality-score bin for each of the 2*win+1 positions centered on offset.
     */
    public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
        LinkedList<Integer> coords = new LinkedList<Integer>();
        // BUG FIX: the original loop excluded offset+win, yielding only 2*win
        // coordinates for a (2*win+1)-wide window.
        for ( int i = offset - win; i <= offset + win ; i ++ ) {
            if ( i < 0 || i >= read.getReadLength() ) {
                // BUG FIX: off-read sentinels are already BIN indices; the
                // original divided them by binSize, collapsing them into
                // ordinary quality bins and defeating the two extra bins.
                coords.add(getQScoreAsInt(read,i));
            } else {
                coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
            }
        }
        return coords;
    }

    /**
     * Quality at the given read offset, or an orientation-aware sentinel bin
     * index when the offset falls off either end of the read.
     */
    public int getQScoreAsInt( SAMRecord read, int offset ) {
        int qscore;
        if ( offset < 0 ) {
            qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
        } else if ( offset >= read.getReadLength() ) {
            // BUG FIX: the original used '>' here, so offset == readLength
            // indexed one past the end of the quality array.
            qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
        } else {
            qscore = (int) read.getBaseQualities()[offset];
        }
        return qscore;
    }

    /** Map a raw quality score to its bin index. */
    public int getQscoreBin( int qscore, int binSize ) {
        return (int) Math.floor(((double)qscore)/binSize);
    }

    /** (mismatch, match) counts for one cell of the table. */
    public Pair<Integer,Integer> getPair( int a, int b, int c, int d, int e, int f, int g ) {
        int mm = table[a][b][c][d][e][f][g][MM_OFFSET];
        int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET];
        return new Pair<Integer,Integer>(mm,ma);
    }
}

View File

@ -23,7 +23,7 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
private String[] hapmapNames;
public PooledGenotypeConcordance( String pathToPoolFile ) {
super("Pooled Genotype Concordance");
super("Pooled_Genotype_Concordance");
if( pathToPoolFile == null ) {
table = null;
hapmapNames = null;
@ -34,8 +34,12 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G
}
public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) {
table.updateTable(tracker, ref, eval, hapmapNames);
return null;
String s = table.updateTable(tracker, ref, eval, hapmapNames);
if ( s == null ) {
return null;
} else {
return s + " " + context.getLocation().toString();
}
}
public List<String> done() {
@ -122,8 +126,8 @@ class PooledConcordanceTable {
}
}
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
int truthIndex, callIndex, frequencyIndex;
public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) {
int truthIndex, callIndex, frequencyIndex = -1;
List<Variation> chips = getChips(names, tracker);
if ( ref == 'N' || ref == 'n' ) {
variantCallsAtRefN += ( eval == null ) ? 0 : 1;
@ -167,6 +171,20 @@ class PooledConcordanceTable {
}
table[truthIndex][callIndex] ++;
String interest;
if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) {
interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex));
} else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) {
interest = "False_Positive";
} else if ( callIndex == VARIANT_CALL_MISMATCH ) {
interest = "Mismatching_call";
} else {
interest = null;
}
return interest;
}
public List<Variation> getChips(String[] names, RefMetaDataTracker tracker) {
@ -315,11 +333,11 @@ class PooledConcordanceTable {
out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites));
out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites)));
out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites)));
out.add(String.format("| \t\t- Reference sites not confidently called:\t%d\t(%f)", nHapmapRefsNotCalled, ((double)nHapmapRefsNotCalled)/nFullHapmapRefSites));
out.add(String.format("| \t\t- Reference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites)));
out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites));
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites not confidently called:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
out.add(String.format("| \t\t- Putative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites)));
out.add("| Data on Hapmap Variant Sites");
out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites));
out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites)));
@ -355,6 +373,8 @@ class PooledConcordanceTable {
}
public int getLargestOutputAlleleFrequencyIndex() {
// TODO -- this code may be bugged -- is the first index at which no hapmap sites appear
// TODO -- also the index above which no hapmap sites are guaranteed to appear with said frequency
int nHapmapSitesAtFreq = 1;
int freqIndex = -1;
while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {