diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java index 228dc69ba..5e4c2db41 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java @@ -23,7 +23,7 @@ public class NQSExtendedGroupsCovariantWalker extends LocusWalker 0) ? mm/ct : -1; return String.format(DATA_FORMAT, qscore, group, mm, ct, mmr); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java new file mode 100644 index 000000000..587ee12ef --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java @@ -0,0 +1,169 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Pair; + +import java.util.LinkedList; +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: Ghost + * Date: Sep 24, 2009 + * Time: 10:40:29 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class NQSTabularDistributionWalker extends LocusWalker { + + final int WINDOW_SIZE_SIDE = 3; + final int QSCORE_BIN_SIZE = 4; + final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n"; + final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n"; + + int qBins; + int WINDOW_SIZE; + + public void initialize() { + qBins = 2 + (int) Math.ceil(((double)QualityUtils.MAX_REASONABLE_Q_SCORE)/QSCORE_BIN_SIZE); + WINDOW_SIZE = 2*WINDOW_SIZE_SIDE+1; + } + + public NQSDistributionTable reduceInit() { + return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE); + } + + public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new LocalMapType(context, ref, tracker); + } + + public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) { + // iterate through reads + for ( int i = 0; i < curMap.numReads(); i ++ ) { + prevReduce.update(curMap.context.getReads().get(i), curMap.context.getOffsets().get(i), WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref); + } + + return prevReduce; + } + + public void onTraversalDone(NQSDistributionTable finalTable) { + out.print( makeHeader() ); + for ( int a = 0; a < qBins; a ++ ) { + for ( int b = 0; b < qBins; b ++ ) { + for ( int c = 0; c < qBins; c ++ ) { + for ( int d = 0; d < qBins; d ++ ) { + for ( int e = 0; e < qBins; e ++ ) { + for ( int f = 0; f < qBins; f ++ ) { + for ( int g = 0; g < qBins; g ++ ) { + Pair mm = finalTable.getPair(a,b,c,d,e,f,g); + out.print( formatOutput(a,b,c,d,e,f,g,mm.first,mm.second) ); + } + } + } + } + } + } + } + } + + public String makeHeader() { + return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match"); + } + + public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) { + return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i); + } +} + + +class NQSDistributionTable { + + public final int MM_OFFSET = 0; + public final int 
MATCH_OFFSET = 1; + + protected int [][][][][][][][] table; + + protected int OFF_END_OFFSET; + protected int OFF_START_OFFSET; + + public NQSDistributionTable (int winSize, int qBins, int qStep) { + if ( table.length != winSize ) { + throw new StingException("Size posited in tabular distribution is not the size of the distribution table"); + } + + table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2]; + // yes, this is absolutely positively horrendous brute-force code. + // ... Knuth would be proud. These could be lists for a decrease + // in memory but a slowdown in building the distribution. + for ( int a = 0; a < qBins; a ++ ) { + for ( int b = 0; b < qBins; b ++ ) { + for ( int c = 0; c < qBins; c ++ ) { + for ( int d = 0; d < qBins; d ++ ) { + for ( int e = 0; e < qBins; e ++ ) { + for ( int f = 0; f < qBins; f ++ ) { + for ( int g = 0; g < qBins; g ++ ) { + for ( int h = 0; h < 2; h ++ ) { + table[a][b][c][d][e][f][g][h] = 0; + } + } + } + } + } + } + } + } + + OFF_END_OFFSET = qBins-2; + OFF_START_OFFSET = qBins-1; + } + + public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) { + // note: there are two extra bins, one for "No quality: off the end of read" + // and one for "No quality: off the beginning of read" + List coordinates = getQscoreCoordinates(read,offset,winSize, binSize); + ListIterator o = coordinates.listIterator(); + table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++; + // i'm sure i could use recursion for this but my brain is too tired after swimming to figure + // out, in 20 seconds, how to do it + } + + public List getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) { + LinkedList coords = new LinkedList(); + for ( int i = offset - win; i < offset + win ; i ++ ) { + coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize)); + } + return coords; + } + + public int getQScoreAsInt( SAMRecord read, int offset ) { + int qscore; + 
if ( offset < 0 ) { + qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET; + } else if ( offset > read.getReadLength() ) { + qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET; + } else { + qscore = (int) read.getBaseQualities()[offset]; + } + + return qscore; + } + + public int getQscoreBin( int qscore, int binSize ) { + return (int) Math.floor(((double)qscore)/binSize); + } + + public Pair getPair( int a, int b, int c, int d, int e, int f, int g ) { + int mm = table[a][b][c][d][e][f][g][MM_OFFSET]; + int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET]; + + return new Pair(mm,ma); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java index c101fd9a9..20444f65f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/PooledGenotypeConcordance.java @@ -23,7 +23,7 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G private String[] hapmapNames; public PooledGenotypeConcordance( String pathToPoolFile ) { - super("Pooled Genotype Concordance"); + super("Pooled_Genotype_Concordance"); if( pathToPoolFile == null ) { table = null; hapmapNames = null; @@ -34,8 +34,12 @@ public class PooledGenotypeConcordance extends BasicVariantAnalysis implements G } public String update(Variation eval, RefMetaDataTracker tracker, char ref, AlignmentContext context) { - table.updateTable(tracker, ref, eval, hapmapNames); - return null; + String s = table.updateTable(tracker, ref, eval, hapmapNames); + if ( s == null ) { + return null; + } else { + return s + " " + context.getLocation().toString(); + } } public List done() { @@ -122,8 +126,8 @@ class PooledConcordanceTable { } } - 
public void updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) { - int truthIndex, callIndex, frequencyIndex; + public String updateTable(RefMetaDataTracker tracker, char ref, Variation eval, String[] names) { + int truthIndex, callIndex, frequencyIndex = -1; List chips = getChips(names, tracker); if ( ref == 'N' || ref == 'n' ) { variantCallsAtRefN += ( eval == null ) ? 0 : 1; @@ -167,6 +171,20 @@ class PooledConcordanceTable { } table[truthIndex][callIndex] ++; + + String interest; + + if ( truthIndex == TRUTH_VAR && ( callIndex == NO_CALL || callIndex == REF_CALL ) ) { + interest = "False_Negative_with_frequency: "+ Double.toString(freqIndexToFrequency(frequencyIndex)); + } else if ( truthIndex == TRUTH_REF && ( callIndex == VARIANT_CALL )) { + interest = "False_Positive"; + } else if ( callIndex == VARIANT_CALL_MISMATCH ) { + interest = "Mismatching_call"; + } else { + interest = null; + } + + return interest; } public List getChips(String[] names, RefMetaDataTracker tracker) { @@ -315,11 +333,11 @@ class PooledConcordanceTable { out.add(String.format("| \t+ Sites where all Hapmap chips were ref: \t\t%d", nFullHapmapRefSites)); out.add(String.format("| \t\t- Reference sites correctly called:\t\t%d\t(%d%%)", nRefsCalledCorrectly, divideToPercent(nRefsCalledCorrectly, nFullHapmapRefSites))); out.add(String.format("| \t\t- Reference sites called as variant:\t\t%d\t(%d%%)", nRefsCalledAsSNP, divideToPercent(nRefsCalledAsSNP, nFullHapmapRefSites))); - out.add(String.format("| \t\t- Reference sites not confidently called:\t%d\t(%f)", nHapmapRefsNotCalled, ((double)nHapmapRefsNotCalled)/nFullHapmapRefSites)); + out.add(String.format("| \t\t- Reference sites not confidently called SNP:\t%d\t(%d%%)", nHapmapRefsNotCalled, divideToPercent(nHapmapRefsNotCalled, nFullHapmapRefSites))); out.add(String.format("| \t+ Sites where all seen Hapmap chips were ref, but not all Hapmap chips available: %d", nUnknownHapmapSites)); 
out.add(String.format("| \t\t- Putative reference sites called ref:\t\t\t%d\t(%d%%)", table[TRUTH_UNKNOWN][REF_CALL], divideToPercent(table[TRUTH_UNKNOWN][REF_CALL], nUnknownHapmapSites))); out.add(String.format("| \t\t- Putative reference sites called SNP:\t\t\t%d\t(%d%%)", nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, divideToPercent(nSNPsAtHapmapWithRefDataOnSubsetOfIndividuals, nUnknownHapmapSites))); - out.add(String.format("| \t\t- Putative reference sites not confidently called:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites))); + out.add(String.format("| \t\t- Putative reference sites not confidently called SNP:\t%d\t(%d%%)", table[TRUTH_UNKNOWN][NO_CALL], divideToPercent(table[TRUTH_UNKNOWN][NO_CALL], nUnknownHapmapSites))); out.add("| Data on Hapmap Variant Sites"); out.add(String.format("| \t+ Number of Hapmap SNP Sites:\t\t\t%d", nHapmapSNPSites)); out.add(String.format("| \t\t- SNP sites incorrectly called ref:\t%d\t(%d%%)", nSNPsCalledAsRef, divideToPercent(nSNPsCalledAsRef, nHapmapSNPSites))); @@ -355,6 +373,8 @@ class PooledConcordanceTable { } public int getLargestOutputAlleleFrequencyIndex() { + // TODO -- this code may be bugged -- is the first index at which no hapmap sites appear + // TODO -- also the index above which no hapmap sites are guaranteed to appear with said frequency int nHapmapSitesAtFreq = 1; int freqIndex = -1; while ( nHapmapSitesAtFreq > 0 && freqIndex < calculateNumFrequencyIndeces(poolSize) ) {