From 3b1fabeff070fa9289c85401824e285aae2d2ea6 Mon Sep 17 00:00:00 2001 From: chartl Date: Tue, 20 Oct 2009 14:58:04 +0000 Subject: [PATCH] Major code refactoring: @ Pooled utils & power - Removed two of the power walkers leaving only PowerBelowFrequency, added some additional flags on PowerBelowFrequency to give it some of the behavior that PowerAndCoverage had - Removed a number of PoolUtils variables and methods that were used in those walkers or simply not used - Removed AnalyzePowerWalker (un-necessary) - Changed the location of Quad/Squad/ReadOffsetQuad into poolseq @NQS - Deleted all walkers but the minimum NQS walker, refactored not to use LocalMapType @ BaseTransitionTable - Added a slew of new integration tests for different flaggable and integral parameters - (Scala) just a System.out that was added and commented out (no actual code change) - (Java) changed a < to <= and a boolean formula Chris git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1887 348d0f76-0448-11de-a6fe-93d51630548a --- ...seTransitionTableCalculatorJavaWalker.java | 3 +- .../walkers/Recalibration/LocalMapType.java | 34 --- .../Recalibration/MinimumNQSWalker.java | 142 ++++++++++ .../NQSClusteredZScoreWalker.java | 155 ----------- .../NQSCovariantByCountsWalker.java | 139 ---------- .../NQSExtendedGroupsCovariantWalker.java | 153 ---------- .../NQSMismatchCovariantWalker.java | 188 ------------- .../Recalibration/NQSRecalibratorWalker.java | 160 ----------- .../NQSTabularDistributionWalker.java | 181 ------------ .../QualityScoreDistributionWalker.java | 77 ----- .../walkers/poolseq/AnalyzePowerWalker.java | 218 --------------- .../poolseq/CoverageAndPowerWalker.java | 232 ---------------- .../poolseq/PowerAndCoverageWalker.java | 262 ------------------ .../poolseq/PowerBelowFrequencyWalker.java | 16 ++ .../{utils => gatk/walkers/poolseq}/Quad.java | 2 +- .../walkers/poolseq}/ReadOffsetQuad.java | 2 +- .../walkers/poolseq}/SQuad.java | 2 +- .../sting/playground/utils/PoolUtils.java | 188 +------------ .../sting/utils/ReadBackedPileup.java | 2 + ...ionTableCalculatorJavaIntegrationTest.java | 54 +++- scala/src/BaseTransitionTableCalculator.scala | 14 +- 21 files changed, 232 insertions(+), 1992 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/LocalMapType.java create mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSMismatchCovariantWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSRecalibratorWalker.java delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerAndCoverageWalker.java rename java/src/org/broadinstitute/sting/playground/{utils => gatk/walkers/poolseq}/Quad.java (97%) rename java/src/org/broadinstitute/sting/playground/{utils => gatk/walkers/poolseq}/ReadOffsetQuad.java (97%) rename java/src/org/broadinstitute/sting/playground/{utils => gatk/walkers/poolseq}/SQuad.java (88%) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java index 9dd056388..fb3080fee 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaWalker.java @@ -201,7 +201,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker>,List>>, int[][][]> { + private static int MM_OFFSET = 1; + private static int MATCH_OFFSET = 0; + private static int QSCORE_MAX = 1 + QualityUtils.MAX_REASONABLE_Q_SCORE; + @Argument(fullName="windowSize", shortName="ws", doc="Size of the window (in one direction)", required=true) + private int winSide = 4; + + public void initialize() { + out.printf("%s%n", makeHeader()); + } + + public int[][][] reduceInit() { + int[][][] counts = new int[QSCORE_MAX][QSCORE_MAX][2]; + for ( int i = 0; i < QSCORE_MAX; i ++ ) { + for ( int j = 0; j < QSCORE_MAX; j ++ ) { + counts[i][j][1]=0; + counts[i][j][0]=0; + } + } + return counts; + } + + public int[][][] reduce(Pair>,List>> map, int[][][] prevReduce) { + if ( map != null ) { + List> matchingQualityNQSPairs = map.getFirst(); + List> mismatchingQualityNQSPairs = map.getSecond(); + for ( Pair p : matchingQualityNQSPairs ) { + prevReduce[p.getFirst()][p.getSecond()][MATCH_OFFSET] ++; + } + + for ( Pair p : mismatchingQualityNQSPairs ) { + prevReduce[p.getFirst()][p.getSecond()][MM_OFFSET] ++; + } + } + + return prevReduce; + } + + public Pair>,List>> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + ArrayList> matchingQualityNQSPairs = new ArrayList>(); + ArrayList> mismatchingQualityNQSPairs = new ArrayList>(); + if ( (Variation) tracker.lookup("dbsnp",null) == null ) { + for ( int r = 0; r < context.numReads(); r ++ ) { + SAMRecord read = context.getReads().get(r); + int offset = context.getOffsets().get(r); + int quality = read.getBaseQualities()[offset]; + int NQS = getNghdQ(offset, read); + Pair qualityNQSPair = new Pair (quality,NQS); + if ( BaseUtils.basesAreEqual(read.getReadBases()[offset], (byte) ref.getBase()) ) { + matchingQualityNQSPairs.add(qualityNQSPair); + } else { + mismatchingQualityNQSPairs.add(qualityNQSPair); + } + } + + return new Pair>,List>>(matchingQualityNQSPairs,mismatchingQualityNQSPairs); + } else { + return null; + } + } + + + public void onTraversalDone( int[][][] reduce ) { + for ( int qc = 0; qc < QSCORE_MAX; qc ++ ) { + for ( int qn = 0; qn < QSCORE_MAX; qn ++ ) { + out.printf("%s%n", formatData(reduce[qc][qn],qc,qn)); + } + } + } + + public int getNghdQ(int off, SAMRecord read) { + // System.out.println("getNghdQ"); + byte minQ = Byte.MAX_VALUE; + byte[] quals = read.getBaseQualities(); + int rdlnth = read.getReadLength(); + int start; + int end; + if ( off - winSide < 0 ) { + start = 0; + } else { + start = off - winSide; + } + + if ( off + winSide > rdlnth ) { + end = rdlnth; + } else { + end = off + winSide; + } + + for ( int i = start; i < end; i ++ ) { + if ( i != off ) { + byte q = quals[i]; + if ( q < minQ ) { + minQ = q; + } + } + } + + return minQ; + } + + private String makeHeader() { + return String.format("%s\t%s\t%s\t%s\t%s\t%s","Reported_Q","Min_Nghd_Q","N_observations","Mm_rate","Empirical_Q","Q_diff"); + } + + private String formatData( int[] mmArray, int qCenter, int qNghd ) { + int counts = mmArray[MM_OFFSET]+mmArray[MATCH_OFFSET]; + double mismatch = ((double)mmArray[MM_OFFSET]/counts); + byte qEmp = QualityUtils.probToQual(1-mismatch); + return String.format("%d\t%d\t%d\t%f\t%d\t%d", qCenter, qNghd, counts, mismatch, qEmp,qEmp-qCenter); + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java deleted file mode 100755 index 97b1e146b..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java +++ /dev/null @@ -1,155 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.playground.gatk.walkers.Recalibration.LocalMapType; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 29, 2009 - * Time: 1:13:25 PM - * To change this template use File | Settings | File Templates. - */ -public class NQSClusteredZScoreWalker extends LocusWalker { - static final int WIN_SIDE_SIZE = 5; - static final int Z_SCORE_MIN = -6; - static final int Z_SCORE_MAX = 6; - static final int Z_SCORE_MULTIPLIER = 20; // bins are Z_SCORE * (this) rounded to the nearst int - static final int MM_OFFSET = 1; - static final int MATCH_OFFSET = 0; - static final int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE; - - protected int WINDOW; - protected int Z_SCORE_RANGE; - - public void initialize() { - WINDOW = 2*WIN_SIDE_SIZE+1; - Z_SCORE_RANGE = Z_SCORE_MAX - Z_SCORE_MIN; - } - - public int[][][] reduceInit() { - int[][][] q = new int[Z_SCORE_RANGE*Z_SCORE_MULTIPLIER+1][MAX_Q_SCORE][2]; - for ( int i = 0; i < Z_SCORE_RANGE*Z_SCORE_MULTIPLIER+1; i ++ ) { - for ( int j = 0; j < MAX_Q_SCORE; j ++ ) { - for ( int k = 0; k < 2; k ++ ) { - q[i][j][k] = 0; - } - } - } - - return q; - } - - - public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - return new LocalMapType(context, ref, tracker); - } - - public int[][][] reduce(LocalMapType map, int[][][] qScores) { - for ( int i = 0; i < map.numReads(); i ++ ) { - SAMRecord read = map.context.getReads().get(i); - int offset = map.context.getOffsets().get(i); - if ( isMismatch(read, offset, map.ref) ) { - qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MM_OFFSET]++; - } else { - qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MATCH_OFFSET]++; - } - } - - return qScores; - } - - public void onTraversalDone( int[][][] zScoreBins ) { - out.print( header() ); - for ( int i = 0; i < Z_SCORE_RANGE*Z_SCORE_MULTIPLIER; i ++ ) { - for ( int j = 0; j < MAX_Q_SCORE; j ++ ) { - out.print( formatData(zScoreBins[i][j], i, j) ); - } - } - } - - public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) { - return (Character.toUpperCase((char) read.getReadBases()[offset]) != ref.getBase()); - } - - public int getQScoreAsInt( SAMRecord read, int offset ) { - return (int) read.getBaseQualities()[offset]; - } - - public int calcZScoreBin( SAMRecord read, int offset ) { - Pair meanVar = calcWindowMeanVariance(read, offset); - double rawZ = (getQScoreAsInt(read, offset) - meanVar.first)/Math.sqrt(meanVar.second); - int zBin; - if ( rawZ >= Z_SCORE_MAX ) { - zBin = (int) Math.floor(Z_SCORE_RANGE*Z_SCORE_MULTIPLIER); - } else if ( rawZ <= Z_SCORE_MIN ) { - zBin = 0; - } else if ( rawZ > 0 ) { - zBin = (int) Math.floor(rawZ*Z_SCORE_MULTIPLIER) - (int) Math.floor(Z_SCORE_MIN*Z_SCORE_MULTIPLIER); - } else { - zBin = (int) Math.floor(-Z_SCORE_MIN*Z_SCORE_MULTIPLIER) + (int) Math.floor(rawZ*Z_SCORE_MULTIPLIER); - } - - return zBin; - } - - public Pair calcWindowMeanVariance( SAMRecord read, int offset ) { - int start,end; - if ( offset - WIN_SIDE_SIZE < 0 ) { - start = 0; - } else { - start = offset - WIN_SIDE_SIZE; - } - - if ( offset + WIN_SIDE_SIZE > read.getReadLength() ) { - end = read.getReadLength(); - } else { - end = offset + WIN_SIDE_SIZE; - } - - double mean = 0.0; - int n = end-start; - byte[] quals = read.getBaseQualities(); - - for ( int i = start; i < end; i ++ ) { - if ( i != offset ) { - mean += ((double) quals[i])/n; - } - } - - double var = 0.0; - - for ( int i = start; i < end; i ++ ) { - if ( i != offset ) { - var += Math.pow(((double) quals[i] - mean),2)/n; - } - } - - return new Pair(mean, var); - } - - public String header() { - String format = "%s\t%s\t%s\t%s\t%s\t%s%n"; - return String.format(format, "ZScore", "N_obs", "Expected_Mismatch", "Empirical_Mismatch", "Expected_MM_As_Q", "Empirical_MM_As_Q"); - } - - public String formatData ( int[] matchMismatch, int zScoreBin, int q ) { - String format = "%f\t%d\t%f\t%f\t%d\t%d%n"; - - double zScore = ( (double) zScoreBin )/Z_SCORE_MULTIPLIER; - int counts = matchMismatch[MATCH_OFFSET] + matchMismatch[MM_OFFSET]; - double empMMR = (((double)matchMismatch[MM_OFFSET])/counts); - double expMMR = QualityUtils.qualToErrorProb((byte) q); - int empMMRAsQ = QualityUtils.probToQual(1-empMMR); - - return String.format(format, zScore, counts, expMMR, empMMR, q, empMMRAsQ); - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java deleted file mode 100755 index 4e73f496a..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java +++ /dev/null @@ -1,139 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.playground.gatk.walkers.Recalibration.LocalMapType; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 29, 2009 - * Time: 10:54:18 AM - * To change this template use File | Settings | File Templates. - */ -public class NQSCovariantByCountsWalker extends LocusWalker { - - final static int SMALL_DEVIATION = 5; - final static int LARGE_DEVIATION = 8; - final static int WIN_SIZE_SIDE = 6; - final static int MATCH_OFFSET = 1; - final static int MM_OFFSET = 0; - final static int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE; - final static String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f\t%f\t%d\t%d%n"; - final static String HEADER_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n"; - - int NGHD_SIZE; - - public void initialize() { - NGHD_SIZE = 2*WIN_SIZE_SIDE+1; - } - - public int[][][][] reduceInit() { - int[][][][] q = new int[2*WIN_SIZE_SIDE][2*WIN_SIZE_SIDE][MAX_Q_SCORE][2]; - - for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) { - for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) { - for ( int k = 0; k < MAX_Q_SCORE; k ++ ) { - for ( int h = 0; h < 2; h ++ ) { - q[i][j][k][h] = 0; - } - } - } - } - - return q; - } - - public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - return new LocalMapType(context, ref, tracker); - } - - public int [][][][] reduce( LocalMapType map, int [][][][] quals ) { - for ( int i = 0; i < map.numReads(); i ++ ) { - SAMRecord read = map.context.getReads().get(i); - int offset = map.context.getOffsets().get(i); - Pair deviantCounts = countNumDeviantBases(read,offset); - - if ( isMismatch(read,offset,map.ref) ) { - quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MM_OFFSET]++; - } else { - quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MATCH_OFFSET]++; - } - } - - return quals; - } - - public void onTraversalDone( int[][][][] counts ) { - out.print( header() ); - - for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) { - for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) { - out.print( formatData( counts[i][j], i, j ) ); - } - } - } - - public Pair countNumDeviantBases(SAMRecord read, int offset) { - int start,end; - if ( offset - WIN_SIZE_SIDE < 0 ) { - start = 0; - } else { - start = offset - WIN_SIZE_SIDE; - } - - if ( offset + WIN_SIZE_SIDE > read.getReadLength() ) { - end = read.getReadLength(); - } else { - end = offset + WIN_SIZE_SIDE; - } - - int largeCounts = 0; - int smallCounts = 0; - byte[] quals = read.getBaseQualities(); - for ( int i = start; i < end; i ++ ) { - if ( Math.abs(quals[i] - quals[offset]) >= LARGE_DEVIATION ) { - largeCounts ++; - } else if ( Math.abs( quals[i] - quals[offset] ) >= SMALL_DEVIATION ) { - smallCounts ++; - } - } - - return new Pair(smallCounts,largeCounts); - } - - public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) { - return ( Character.toUpperCase((char)read.getReadBases()[offset]) != ref.getBase() ); - } - - public String header() { - return String.format(HEADER_FORMAT, "N_SMALL_DEV", "N_LARGE_DEV", "MATCH", "MISMATCH", "EXPECTED_ERR", "EMPIRICAL_ERR", "EXP_AS_Q", "EMP_AS_Q"); - } - - public String formatData( int[][] qScores, int smDev, int lgDev ) { - int match = 0; - int mismatch = 0; - double expErr = 0.0; - - for ( int i = 0; i < MAX_Q_SCORE; i ++ ) { - match += qScores[i][MATCH_OFFSET]; - mismatch += qScores[i][MM_OFFSET]; - expErr += QualityUtils.qualToErrorProb((byte)i)*qScores[i][MATCH_OFFSET]; - expErr += QualityUtils.qualToErrorProb((byte)i)*qScores[i][MM_OFFSET]; - } - - expErr = expErr/(match + mismatch); - int expAsQ = QualityUtils.probToQual(1-expErr); - - double empErr = ((double)mismatch)/(match+mismatch); - int empAsQ = QualityUtils.probToQual(1-empErr); - - return String.format(DATA_FORMAT, smDev, lgDev, match, mismatch, expErr, empErr, expAsQ, empAsQ); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java deleted file mode 100755 index b5289f28f..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSExtendedGroupsCovariantWalker.java +++ /dev/null @@ -1,153 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.playground.gatk.walkers.Recalibration.LocalMapType; - - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 23, 2009 - * Time: 11:26:52 AM - * To change this template use File | Settings | File Templates. - */ -public class NQSExtendedGroupsCovariantWalker extends LocusWalker { - - final int NEIGHBORHOOD_SIZE = 5; - final int MM_OFFSET = 1; - final int COUNT_OFFSET = 0; - final int MAX_QSCORE = QualityUtils.MAX_REASONABLE_Q_SCORE+1; - int NQS_GROUPS; - - public static final String DATA_FORMAT = "%d\t%d\t%d\t%d\t%d%n"; - public static final String TEXT_FORMAT = "%s\t%s\t%s\t%s\t%s%n"; - - - // Walker methods - - public void initialize() { - NQS_GROUPS = (MAX_QSCORE)*(MAX_QSCORE+1)/2 + MAX_QSCORE; - // this is from solving the recurrent equation - // a_{n+1} = a_n + n + 1 - } - - public long[][][] reduceInit() { - long[][][] covariantCounts = new long[NQS_GROUPS][2][MAX_QSCORE]; - - for ( int i = 0; i < NQS_GROUPS; i ++ ) { - for ( int j = 0; j < 2; j++ ) { - for ( int k = 0; k < MAX_QSCORE; k ++) { - covariantCounts[i][j][k] = 0; - } - } - } - - return covariantCounts; - } - - public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return new LocalMapType(context, ref, tracker); - } - - public long[][][] reduce( LocalMapType map, long[][][] cumulativeBins ) { - - for ( int i = 0; i < map.numReads(); i ++ ) { - int groupNumber = calcGroupNumber(i, map.context); - int qualityAsInt = getBaseQualityAsInt(map.context,i); - cumulativeBins[groupNumber][COUNT_OFFSET][qualityAsInt] ++; - if ( ! isRef(i, map.context, map.ref ) ) { - cumulativeBins[groupNumber][MM_OFFSET][qualityAsInt] ++; - } - } - - return cumulativeBins; - } - - public void onTraversalDone(long[][][] cumulativeBins) { - printNQSMismatchTable(cumulativeBins); - } - - // Map-Reduce helper methods - - public int calcGroupNumber( int read, AlignmentContext context ) { - return calcGroupNumberFromQualities(getBaseQualityAsInt(context, read), getMinNeighborhoodBaseQualityAsInt(context, read)); - } - - /* - * calculates group number as - * #: nghd left base nghd right - * 0: 000 0 000 - * 1: 000 1 000 - * 2: 111 1 111 - * 3: 000 2 000 - * 4: 111 2 111 - * 5: 222 2 222 - * 6: 000 3 000 - * 7: 111 3 111 - * etc - */ - public int calcGroupNumberFromQualities( int baseQuality, int minNeighborhoodQuality ) { - int groupNumber = (baseQuality)*(baseQuality+1)/2; - if ( minNeighborhoodQuality <= baseQuality ) { - groupNumber += minNeighborhoodQuality; - } else { - groupNumber += baseQuality; // yes we want to do this - } - - return groupNumber; - } - - public int getBaseQualityAsInt( AlignmentContext context, int read ) { - return (int) context.getReads().get(read).getBaseQualities()[context.getOffsets().get(read)]; - } - - public int getMinNeighborhoodBaseQualityAsInt( AlignmentContext context, int read ) { - byte[] qualities = context.getReads().get(read).getBaseQualities(); - int offset = context.getOffsets().get(read); - int readLength = qualities.length; - int start = (offset - NEIGHBORHOOD_SIZE < readLength) ? 0 : offset - NEIGHBORHOOD_SIZE; - int end = (offset + NEIGHBORHOOD_SIZE > readLength) ? readLength : offset+NEIGHBORHOOD_SIZE; - byte minQuality = Byte.MAX_VALUE; - - for ( int i = start; i < end; i ++ ) { - if ( i != offset ) { - if ( qualities[i] < minQuality ) { - minQuality = qualities[i]; - } - } - } - - return (int) minQuality; - } - - public boolean isRef(int read, AlignmentContext context, ReferenceContext ref ) { - return (context.getReads().get(read).getReadBases()[context.getOffsets().get(read)] == ref.getBase()); - } - - // printing helper methods - - public void printNQSMismatchTable(long[][][] cumulativeBins) { - out.print(createHeader()); - for ( int qscore = 0; qscore < MAX_QSCORE; qscore ++ ) { - for ( int group = 0; group < NQS_GROUPS; group ++ ) { - out.print(formatNQSMismatchCountString(qscore,group,cumulativeBins)); - } - } - } - - public String createHeader() { - return String.format(TEXT_FORMAT, "Qscore_Reported", "NQS_Group", "Mismatches", "Total", "Empirical_Qscore"); - } - - public String formatNQSMismatchCountString(int qscore, int group, long[][][] cumulativeBins) { - long mm = cumulativeBins[group][MM_OFFSET][qscore]; - long ct = cumulativeBins[group][COUNT_OFFSET][qscore]; - double mmr = (ct > 0) ? mm/ct : -1; - - return String.format(DATA_FORMAT, qscore, group, mm, ct, QualityUtils.probToQual(1-mmr)); - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSMismatchCovariantWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSMismatchCovariantWalker.java deleted file mode 100755 index 5a2724b01..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSMismatchCovariantWalker.java +++ /dev/null @@ -1,188 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.playground.gatk.walkers.Recalibration.LocalMapType; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.QualityUtils; -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 14, 2009 - * Time: 1:37:44 PM - * To change this template use File | Settings | File Templates. - */ -public class NQSMismatchCovariantWalker extends LocusWalker { - public static final int NQS_GROUPS = 35; - public static final int NQS_RESOLUTION = 1; - public static final int NQS_DIFFERENCE = 5; - public static final int NEIGHBORHOOD_SIZE = 5; - public static final int NQS_QSCORE_START = 5; - public static final int MM_OFFSET = 0; - public static final int COUNT_OFFSET = 1; - public static final int FAIL_OFFSET = 2; - public static final String DATA_FORMAT = "%d %d %d %d %d %d%n"; - public static final String HEADER_FORMAT = "%s %s %s %s %s %s%n"; - - public long[][][] reduceInit() { - long[][][] mismatchesByNQS = new long[(int) QualityUtils.MAX_QUAL_SCORE+1][this.numNQSGroups()+1][2]; - for ( int qualityScore = 0; qualityScore <= (int) QualityUtils.MAX_QUAL_SCORE; qualityScore ++) { - for ( int nqsGroup = 0; nqsGroup <= numNQSGroups(); nqsGroup++ ) { - mismatchesByNQS[qualityScore][nqsGroup][MM_OFFSET] = 0; - mismatchesByNQS[qualityScore][nqsGroup][COUNT_OFFSET] = 0; - } - } - return mismatchesByNQS; - } - - public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - return new LocalMapType(context,ref,tracker); - } - - public long[][][] reduce(LocalMapType rawMapData, long[][][] cumMismatchArray) { - if ( ! isDBSNP(rawMapData.tracker) ) { - for( int read = 0; read < rawMapData.numReads(); read ++ ) { - int bestNQSRelativeToStart = getBestNQSRelativeToStart(rawMapData.context, read, nqsDifference(), - nqsResolution(), centerBaseQualityStart(), windowSize()); - if( isValidRelativeNQS(bestNQSRelativeToStart) ) { - cumMismatchArray[rawMapData.qScore(read)][bestNQSRelativeToStart][COUNT_OFFSET]++; - if( isMismatch(read, rawMapData.context, rawMapData.ref) ) { - cumMismatchArray[rawMapData.qScore(read)][bestNQSRelativeToStart][MM_OFFSET]++; - } - } - } - } - - return cumMismatchArray; - } - - public void onTraversalDone(long[][][] cumulativeCounts) { - printNQSMismatchTable(cumulativeCounts); - } - - public int getBestNQSRelativeToStart(AlignmentContext context, int readNum, int ctrNghdNQSDif, int nqsStepSize, - int ctrBaseQStart, int windSize) { - int minNeighborhoodQualityScore = findMinQScoreInWindow(context, readNum, windSize); - int centerQScore = getQualityScore(context,readNum); - int stepsToCutoffNeighborhood = (int) Math.floor((minNeighborhoodQualityScore - (ctrBaseQStart - ctrNghdNQSDif))/nqsStepSize); - int stepsToCutoffCenter = (int) Math.floor((centerQScore-ctrBaseQStart )/nqsStepSize); - - if(stepsToCutoffNeighborhood < stepsToCutoffCenter) { - return stepsToCutoffNeighborhood; - } else { - return stepsToCutoffCenter; - } - } - - protected void printNQSMismatchTable(long[][][] cumulativeCounts) { - out.printf("%s", createHeader() ); - for ( int qscore = 0; qscore <= QualityUtils.MAX_QUAL_SCORE; qscore ++ ) { - for( int nqsGroup = 0; nqsGroup <= numNQSGroups(); nqsGroup ++ ) { - out.printf("%s", formatNQSMismatchCountString(cumulativeCounts, nqsGroup, qscore)); - } - } - } - - protected String createHeader() { - return String.format(HEADER_FORMAT,"Qscore Reported","Qscore Threshold at Locus", "Minimum Neighborhood Quality", "NQS Group", - "# Non-dbSNP Mismatches", "Total Non-dbSNP Counts"); - } - - protected String formatNQSMismatchCountString(long[][][] counts, int nqsGroup, int qscore) { - return String.format(DATA_FORMAT,qscore,getCenterThreshold(nqsGroup),getNeighborhoodThreshold(nqsGroup),nqsGroup, - counts[qscore][nqsGroup][MM_OFFSET], counts[qscore][nqsGroup][COUNT_OFFSET]); - } - - protected int getCenterThreshold(int nqsGroup) { - return centerBaseQualityStart() + nqsGroup*nqsResolution(); - } - - protected int getNeighborhoodThreshold(int nqsGroup) { - return getCenterThreshold(nqsGroup)-nqsDifference(); - } - - protected int numNQSGroups() { - return NQS_GROUPS; - } - - protected int nqsResolution() { - return NQS_RESOLUTION; - } - - protected int nqsDifference() { - return NQS_DIFFERENCE; - } - - protected boolean isValidRelativeNQS(int relNQS) { - return relNQS >= 0; - } - - protected boolean isDBSNP(RefMetaDataTracker tracker) { - - return false; - // return ( tracker.lookup("dbSNP",null) != null ); - } - - protected boolean isMismatch(int readNum, AlignmentContext context, ReferenceContext ref) { - return ( (char) getBases(context,readNum)[ getOffset(context,readNum) ] ) != ref.getBase(); - } - - protected byte[] getBases(AlignmentContext context, int offset) { - return context.getReads().get(offset).getReadBases(); - } - - protected int getOffset(AlignmentContext context, int read) { - return context.getOffsets().get(read); - } - - protected int windowSize() { - return NEIGHBORHOOD_SIZE; - } - - protected int centerBaseQualityStart() { - return NQS_QSCORE_START; - } - - protected int getQualityScore(AlignmentContext context, int readNo) { - return getQualityScore(context.getReads().get(readNo), context.getOffsets().get(readNo)); - } - - protected int getQualityScore(SAMRecord read, int offset) { - return (int) read.getBaseQualities()[offset]; - } - - protected int findMinQScoreInWindow(AlignmentContext context, int readNo, int windowSize) { - SAMRecord read = context.getReads().get(readNo); - int offset = context.getOffsets().get(readNo); - int start; - int end; - - if ( offset - windowSize < 0) { - start = 0; - } else { - start = offset - windowSize; - } - - if ( offset + windowSize > read.getReadLength() ) { - end = read.getReadLength(); - } else { - end = offset + windowSize; - } - - int minQualityScore = Integer.MAX_VALUE; - for (int positionInRead = start; positionInRead < end; positionInRead++ ) { - int qScoreAtPosition = getQualityScore(read,positionInRead); - if ( qScoreAtPosition < minQualityScore ) { - minQualityScore = qScoreAtPosition; - } - } - - return minQualityScore; - } - -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSRecalibratorWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSRecalibratorWalker.java deleted file mode 100755 index f6fe59207..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSRecalibratorWalker.java +++ /dev/null @@ -1,160 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Pair; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.StringTokenizer; - -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: Oct 11, 2009 - * Time: 11:04:07 AM - * To change this template use File | Settings | File Templates. - */ -public class NQSRecalibratorWalker extends ReadWalker { - @Argument(fullName="recalibrationTable", shortName="rt", doc="Table detailing NQS recalibration by reported, minimum, and maximum", required=true) - String recalFile = null; - @Argument(fullName="outputBamFile", shortName="outputBAM", doc="output BAM file", required=false) - public SAMFileWriter outBam = null; - - final static int MIN_RECALIBRATION_OBSERVATIONS = 10000; - final static int WIN_SIDE = 11; - final static int QMAX = 3 + QualityUtils.MAX_REASONABLE_Q_SCORE; - - protected byte[][][] RECALIBRATION_TABLE; - - public void initialize() { - // initialize the table - RECALIBRATION_TABLE = initializeQualityRecalibrationTable(); - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(recalFile)); - } catch (FileNotFoundException e) { - throw new StingException("File "+recalFile+" not found",e); - } - try { - String header = reader.readLine(); - logger.debug(header); - while( reader.ready() ) { - StringTokenizer tok = new StringTokenizer(reader.readLine()); - int repQ = Integer.valueOf(tok.nextToken()); - int minQ = Integer.valueOf(tok.nextToken()); - int maxQ = Integer.valueOf(tok.nextToken()); - int nObserv = Integer.valueOf(tok.nextToken()); - tok.nextToken(); // skip one - empirical mismatch rate - int empQ = Integer.valueOf(tok.nextToken()); - if ( nObserv > MIN_RECALIBRATION_OBSERVATIONS ) { - RECALIBRATION_TABLE[repQ][minQ][maxQ] = (byte) empQ; - } - // System.out.println(repQ); - } - } catch (IOException e) { - throw new StingException("File "+recalFile+" could not be read",e); - } - } - - public SAMFileWriter reduceInit() { - return outBam; - } - - public SAMRecord map(char[] ref, SAMRecord read) { - byte[] initialQualities = read.getBaseQualities(); - byte[] finalQualities = getNewQualitiesByNQS(initialQualities); - return recalibrateRead(read, finalQualities); - } - - public byte[] getNewQualitiesByNQS( byte [] initQuals ) { - byte [] newQuals = new byte[initQuals.length]; - for ( int offset = 0; offset < initQuals.length; offset ++ ) { - newQuals[offset] = qualityByNQS(initQuals,offset); - } - - return newQuals; - } - - public SAMFileWriter reduce( SAMRecord newRead, SAMFileWriter outwriter ) { - if ( outwriter == null ) { - out.println(newRead); - } else { - outwriter.addAlignment(newRead); - } - - return outwriter; - } - - public byte qualityByNQS( byte [] quals, int offset ) { - Pair minMaxQuality = getMinMaxQuality(quals,offset); - return nqsLookup( quals[offset] , minMaxQuality.getFirst() , minMaxQuality.getSecond() ); - } - - public Pair getMinMaxQuality( byte [] quals, int offset ) { - int start; - int end; - if ( offset-WIN_SIDE < 0 ) { - start = 0; - } else { - start = offset - WIN_SIDE; - } - - if ( offset + WIN_SIDE > quals.length ) { - end = quals.length; - } else { - end = offset + WIN_SIDE; - } - - byte minQuality = Byte.MAX_VALUE; - byte maxQuality = Byte.MIN_VALUE; - - for ( int i = start; i < end; i ++ ) { - if ( i != offset ) { - if ( quals[i] < minQuality ) { - minQuality = quals[i]; - } - if ( quals[i] > maxQuality ) { - maxQuality = quals[i]; - } - } - } - - return new Pair(minQuality,maxQuality); - } - - public byte nqsLookup( byte repQ, byte minQ, byte maxQ ) { - return RECALIBRATION_TABLE[(int) repQ][(int) minQ][(int) maxQ]; - } - - public SAMRecord recalibrateRead( SAMRecord read, byte[] newQualities ) { - read.setBaseQualities(newQualities); - return read; - } - - private byte[][][] initializeQualityRecalibrationTable() { - byte [][][] table = new byte[QMAX][QMAX][QMAX]; - for ( int qrep = 0; qrep < QMAX; qrep ++ ) { - for ( int qmin = 0; qmin < QMAX; qmin ++ ) { - for ( int qmax = 0; qmax < QMAX; qmax ++ ) { - table[qrep][qmin][qmax] = (byte) qrep; - // default value is the reported q-score - } - } - } - - return table; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java deleted file mode 100644 index 30aba3804..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java +++ /dev/null @@ -1,181 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Pair; - -import java.util.LinkedList; -import java.util.List; -import java.util.ListIterator; -import org.apache.log4j.Logger; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: Sep 24, 2009 - * Time: 10:40:29 AM - * To change this template use File | Settings | File Templates. - */ -public class NQSTabularDistributionWalker extends LocusWalker { - - final int WINDOW_SIZE_SIDE = 3; - final int QSCORE_BIN_SIZE = 4; - final String ROW_FORMAT = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d%n"; - final String HEAD_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n"; - - int qBins; - int WINDOW_SIZE; - - public void initialize() { - qBins = 2 + (int) Math.ceil(((double)QualityUtils.MAX_REASONABLE_Q_SCORE)/QSCORE_BIN_SIZE); - WINDOW_SIZE = 2*WINDOW_SIZE_SIDE+1; - } - - public NQSDistributionTable reduceInit() { - return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE, logger); - } - - public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return new LocalMapType(context, ref, tracker); - } - - public NQSDistributionTable reduce( LocalMapType curMap, NQSDistributionTable prevReduce ) { - // iterate through reads - for ( int i = 0; i < curMap.numReads(); i ++ ) { - prevReduce.update(curMap.context.getReads().get(i), curMap.context.getOffsets().get(i), WINDOW_SIZE_SIDE, QSCORE_BIN_SIZE, curMap.ref); - } - - return prevReduce; - } - - public void onTraversalDone(NQSDistributionTable finalTable) { - out.print( makeHeader() ); - for ( int a = 0; a < qBins; a ++ ) { - for ( int b = 0; b < qBins; b ++ ) { - for ( int c = 0; c < qBins; c ++ ) { - for ( int d = 0; d < qBins; d ++ ) { - for ( int e = 0; e < qBins; e ++ ) { - for ( int f = 0; f < qBins; f ++ ) { - for ( int g = 0; g < qBins; g ++ ) { - Pair mm = finalTable.getPair(a,b,c,d,e,f,g); - out.print( formatOutput(a,b,c,d,e,f,g,mm.first,mm.second) ); - } - } - } - } - } - } - } - } - - public String makeHeader() { - return String.format(HEAD_FORMAT,"Q1","Q2","Q3","Q4","Q5","Q6","Q7","Mismatch","Match"); - } - - public String formatOutput( int a, int b, int c, int d, int e, int f, int g, int h, int i ) { - return String.format(ROW_FORMAT, a, b, c, d, e, f, g, h, i); - } -} - - -class NQSDistributionTable { - - - public final int MM_OFFSET = 0; - public final int MATCH_OFFSET = 1; - - protected int [][][][][][][][] table; - protected Logger logger; - - protected int OFF_END_OFFSET; - protected int OFF_START_OFFSET; - - public NQSDistributionTable (int winSize, int qBins, int qStep, Logger logger) { - if ( 7 != winSize ) { - throw new StingException("Size positied in tabular distribution is not the size of the distribution table"); - } - this.logger = logger; - table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2]; - // yes, this is absolutely positively horrendous brute-force code. - // ... Knuth would be proud. These could be lists for a decrease - // in memory but a slowdown in building the distribution. - for ( int a = 0; a < qBins; a ++ ) { - for ( int b = 0; b < qBins; b ++ ) { - for ( int c = 0; c < qBins; c ++ ) { - for ( int d = 0; d < qBins; d ++ ) { - for ( int e = 0; e < qBins; e ++ ) { - for ( int f = 0; f < qBins; f ++ ) { - for ( int g = 0; g < qBins; g ++ ) { - for ( int h = 0; h < 2; h ++ ) { - table[a][b][c][d][e][f][g][h] = 0; - } - } - } - } - } - } - } - } - - OFF_END_OFFSET = qBins-2; - OFF_START_OFFSET = qBins-1; - } - - public void update( SAMRecord read, int offset, int winSize, int binSize, ReferenceContext ref ) { - // note: there are two extra bins, one for "No quality: off the end of read" - // and one for "No quality: off the beginning of read" - List coordinates = getQscoreCoordinates(read,offset,winSize, binSize); - if ( isMatch(read, offset, ref) ) { - coordinates.add(MATCH_OFFSET); - } else { - coordinates.add(MM_OFFSET); - } - ListIterator o = coordinates.listIterator(); - table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++; - // i'm sure i could use recursion for this but my brain is too tired after swimming to figure - // out, in 20 seconds, how to do it - } - - public List getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) { - LinkedList coords = new LinkedList(); - for ( int i = offset - win; i <= offset + win ; i ++ ) { - coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize)); - } - logger.debug(Integer.toString(coords.size())); - return coords; - } - - public int getQScoreAsInt( SAMRecord read, int offset ) { - int qscore; - if ( offset < 0 ) { - qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET; - } else if ( offset >= read.getReadLength() ) { - qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET; - } else { - qscore = (int) read.getBaseQualities()[offset]; - } - - return qscore; - } - - public int getQscoreBin( int qscore, int binSize ) { - return (int) Math.floor(((double)qscore)/binSize); - } - - public Pair getPair( int a, int b, int c, int d, int e, int f, int g ) { - int mm = table[a][b][c][d][e][f][g][MM_OFFSET]; - int ma = table[a][b][c][d][e][f][g][MATCH_OFFSET]; - - return new Pair(mm,ma); - } - - public boolean isMatch( SAMRecord read, int offset, ReferenceContext ref ) { - return ( Character.toUpperCase( (char) read.getReadBases()[offset] ) == ref.getBase() || ref.getBase() == 'N'); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java deleted file mode 100755 index b711bf997..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; - -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Pair; - -import java.util.List; -import java.util.LinkedList; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Sep 28, 2009 - * Time: 12:02:33 PM - * To change this template use File | Settings | File Templates. - */ -public class QualityScoreDistributionWalker extends LocusWalker>, int[][]> { - @Argument(fullName="maxReadLength", shortName="rl", doc="Maximum read length in the bam file", required=true) - protected int MAX_READ_LENGTH = 125; - - - protected final int MAX_REASONABLE_Q_SCORE = 3 + QualityUtils.MAX_REASONABLE_Q_SCORE; - public void initialize() { - - } - - public int[][] reduceInit() { - int [][] qualityCounts = new int[MAX_READ_LENGTH][MAX_REASONABLE_Q_SCORE]; - for ( int i = 0; i < MAX_READ_LENGTH; i ++ ) { - for ( int j = 0; j < MAX_REASONABLE_Q_SCORE; j ++ ) { - qualityCounts[i][j] = 0; - } - } - - return qualityCounts; - } - - public List> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - LinkedList> qualByPos = new LinkedList>(); - for ( int r = 0; r < context.numReads(); r ++ ) { - if( context.getReads().get(r).getReadNegativeStrandFlag() ) { - int of = context.getReads().get(r).getReadLength() - context.getOffsets().get(r); - int qu = context.getReads().get(r).getBaseQualities()[context.getOffsets().get(r)]; - qualByPos.add(new Pair(of, qu)); - } - else { - int of = context.getOffsets().get(r); - int qu = context.getReads().get(r).getBaseQualities()[of]; - qualByPos.add(new Pair(of, qu)); - } - } - - return qualByPos; - } - - public int[][] reduce( List> qualByPos, int[][] counts ) { - for ( Pair qPosPair : qualByPos ) { - counts[qPosPair.getFirst()][qPosPair.getSecond()] ++; - } - - return counts; - } - - public void onTraversalDone( int[][] counts ) { - for ( int pos = 0; pos < MAX_READ_LENGTH; pos ++ ) { - String outStr = Integer.toString(pos); - for ( int qual = 0; qual < MAX_REASONABLE_Q_SCORE; qual ++) { - outStr = outStr + "\t" + Integer.toString(counts[pos][qual]); - } - out.printf("%s%n", outStr); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java deleted file mode 100755 index f972d49fb..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/AnalyzePowerWalker.java +++ /dev/null @@ -1,218 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.playground.utils.PoolUtils; -import org.broadinstitute.sting.playground.utils.ReadOffsetQuad; - -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.BufferedReader; -import java.io.IOException; -import java.util.StringTokenizer; -import java.util.NoSuchElementException; -import java.util.List; -import java.rmi.NoSuchObjectException; - -import net.sf.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Aug 25, 2009 - * Time: 3:47:47 PM - * To change this template use File | Settings | File Templates. - */ -@By(DataSource.REFERENCE) -public class AnalyzePowerWalker extends CoverageAndPowerWalker{ - // runs CoverageAndPowerWalker except compares to Syzygy outputs in a file - - @Argument(fullName = "SyzygyOutputFile", shortName = "sof", doc="Syzygy output file to compare to", required = true) - String pathToSyzygyFile = null; - @Argument(fullName = "ColumnOffset", shortName = "co", doc = "Offset of column containing the power in the pf", required = true) - int colOffset = 0; - - BufferedReader syzyFileReader; - final String pfFileDelimiter = " "; - boolean outOfLinesInSyzyFile = false; - private double[][] syzyPowerTable; - private String[][] syzySanityCheckTable; - - @Override - public void initialize() - { - super.initialize(); - syzyPowerTable = new double[8][1500000]; - syzySanityCheckTable = new String[8][1500000]; - try { - syzyFileReader = new BufferedReader(new FileReader(pathToSyzygyFile)); - logger.error("generatePowerTable called"); - generatePowerTable(syzyFileReader); - } catch (FileNotFoundException e) { - String newErrMsg = "Syzygy input file " + pathToSyzygyFile + " could be incorrect. File not found."; - throw new StingException(newErrMsg,e); - } catch (IOException e) { - String newErrMsg = "Syzygy input file error: could not read first line of "+pathToSyzygyFile; - throw new StingException(newErrMsg,e); - } - - } - - @Override - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext rawContext) - { - AlignmentContext context; - if (super.getMinQualityScore() <= 0) { - context = rawContext; - } else { - Pair,List> readsFilteredByQuality = filterByQuality(rawContext.getReads(),rawContext.getOffsets(), super.getMinQualityScore()); - context = new AlignmentContext(rawContext.getLocation(),readsFilteredByQuality.getFirst(),readsFilteredByQuality.getSecond()); - } - ReadOffsetQuad readsByDirection = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); - Pair, List>,Pair,List>> splitReads = new Pair(new Pair(readsByDirection.getFirstReads(),readsByDirection.getSecondReads()),new Pair(readsByDirection.getFirstOffsets(),readsByDirection.getSecondOffsets())); - if ( !super.suppress_printing ) - { - Pair powPair = super.calculatePower(splitReads,false,context); - - - - int tabIndexChrom = getTableChromIndex(context.getLocation().toString()); - int tabIndexLoc = getTablePosIndex(context.getLocation().toString()); - double syzyPow = 0; - String syzySanity = "NoLoc"; - if(tabIndexChrom >= 0 && tabIndexLoc >= 0) { - syzyPow += syzyPowerTable[tabIndexChrom][tabIndexLoc]; - syzySanity = "s " + syzySanityCheckTable[tabIndexChrom][tabIndexLoc]; - - } - out.printf("%s|%s: %d %d %d %d %d %d %f %f %f %f %n", context.getLocation(), syzySanity, splitReads.getFirst().getFirst().size(), splitReads.getFirst().getSecond().size(), - context.getReads().size(), powPair.getSecond()[0], powPair.getSecond()[1], powPair.getSecond()[2], - powPair.getFirst()[0], powPair.getFirst()[1], powPair.getFirst()[2], syzyPow); - - } - - return new Pair(splitReads.getFirst().getFirst().size(), splitReads.getFirst().getFirst().size()); - } - - public Pair getSyzyPowFromFile() { - String thisLine = null; - try { - thisLine = syzyFileReader.readLine(); - } catch(IOException e) { - String newErrMsg = "Ran out of lines in the syzyfile; further output of Syzygy power will be suppressed."; - outOfLinesInSyzyFile=true; - logger.warn(newErrMsg + " " + e.toString()); - return new Pair(-1.1, "Printing Stops Here"); - } - - String chromPosTar = null; - StringTokenizer lineTokenizer = new StringTokenizer(thisLine, pfFileDelimiter); - try { - chromPosTar = lineTokenizer.nextToken(); - for(int j = 1; j < colOffset; j++) { - lineTokenizer.nextToken(); - } - String chromPos = (new StringTokenizer(chromPosTar, "\t")).nextToken(); - return new Pair((Double.valueOf(lineTokenizer.nextToken())/100.0),chromPos); - } catch (NoSuchElementException e) { - String errMsg = "The given column offset for the pool, " + colOffset + " exceeded the number of entries in the file " + pathToSyzygyFile; - throw new StingException(errMsg); - } - } - - public String createHeaderString() { - return (super.createHeaderString() + " PowSyz"); - } - - //TODO: rewrite this so it's not a complete hack!!! - - public void generatePowerTable(BufferedReader syzFile) { - - System.out.println("generatePowerTable entered"); - int numreads = 0; - - try{ - while(syzFile.ready()) { - String line = syzFile.readLine(); - if(line == null || line.equals("")) { - break; - } - StringTokenizer tok = new StringTokenizer(line, " :\t"); - String chrmNoStrWithChr = tok.nextToken(); - String chrmNoStr = chrmNoStrWithChr.substring(3); - String locStr = tok.nextToken(); - for(int j = colOffset - 2; j > 0; j --) { - tok.nextToken(); - } - numreads++; - String syzyPowStr = tok.nextToken(); - if ( chrmNoStr == null || locStr == null || chrmNoStr.equals("") || locStr.equals("")) { - break; - } - int chrIndex = getTableChromIndex(chrmNoStr,locStr); - int posIndex = getTablePosIndex(chrmNoStr,locStr); - syzyPowerTable[chrIndex][posIndex] = Double.valueOf(syzyPowStr)/100.0; - syzySanityCheckTable[chrIndex][posIndex] = "chrm" + chrmNoStr +":" + locStr; - if( (numreads % 1000) == 0){ - System.out.println(numreads + " reads from Syzygy file"); - } - } - } catch (IOException e) { - // do nothing - System.out.println("IOException caught"); - } - System.out.println("Table generated."); - - } - - public int getTableChromIndex(String chromNo, String locNo) { - switch (Integer.valueOf(chromNo)) { - case 1: return 0; - case 2: return 1; - case 3: - if ( Integer.valueOf(locNo) < 63000000 ) - return 2; - else - return 3; - case 7: return 4; - case 10: return 5; - case 11: return 6; - case 12: return 7; - default: - System.out.println(chromNo + " " + locNo); - return -1; - } - } - - public int getTableChromIndex(String chromAndPos) { - StringTokenizer tok = new StringTokenizer(chromAndPos,":"); - return getTableChromIndex(tok.nextToken().substring(3),tok.nextToken()); - } - - public int getTablePosIndex(String chromAndPos) { - StringTokenizer tok = new StringTokenizer(chromAndPos,":"); - return getTablePosIndex(tok.nextToken().substring(3),tok.nextToken()); - } - - public int getTablePosIndex(String chromNo, String locNo) { - switch ( getTableChromIndex(chromNo, locNo) ) { - case 0: return Integer.valueOf(locNo) - 120065274; - case 1: return Integer.valueOf(locNo) - 43311321; - case 2: return Integer.valueOf(locNo) - 12157091; - case 3: return Integer.valueOf(locNo) - 64000000; - case 4: return Integer.valueOf(locNo) - 27838907; - case 5: return Integer.valueOf(locNo) - 12003199; - case 6: return Integer.valueOf(locNo) - 92342389; - case 7: return Integer.valueOf(locNo) - 69809219; - default: - System.out.println(chromNo + " " + locNo); - return -1; - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java deleted file mode 100755 index 9d35f0297..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/CoverageAndPowerWalker.java +++ /dev/null @@ -1,232 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.ListUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.playground.utils.PoolUtils; -import org.broadinstitute.sting.playground.utils.ReadOffsetQuad; -import net.sf.samtools.SAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Aug 18, 2009 - * Time: 11:57:07 AM - * To change this template use File | Settings | File Templates. - */ - -@By(DataSource.REFERENCE) -public class CoverageAndPowerWalker extends LocusWalker, Pair> { - @Argument(fullName="suppressLocusPrinting",doc="Suppress printing",required=false) - public boolean suppress_printing = false; - - @Argument(fullName="poolSize", shortName="ps", doc="Number of individuals in pool", required=true) - public int num_individuals = 0; - - @Argument(fullName="bootStrap", shortName="bs", doc="Use a bootstrap method", required=false) - public boolean useBootstrap = false; - - @Argument(fullName="lodThreshold", shortName="lt", doc="Threshold for LOD score for calls", required=false) - public double lodThreshold = 3.0; - - @Argument(fullName="minQScore", shortName="qm", doc="Threshold for the minimum quality score, filter out bases below",required=false) - public byte minQScore = 0; - - @Argument(fullName="outputUnthresholdedCoverage", shortName = "uc", doc="Print out the total coverage as well as the coverage above the quality threshold", required = false) - public boolean outputUnthreshCvg = false; - - private static final int BOOTSTRAP_ITERATIONS = 300; - - public void initialize() { - - if(num_individuals <= 0) - throw new IllegalArgumentException("Positive nonzero parameter expected for poolSize"); - } - - public Pair reduceInit() { - if ( ! suppress_printing ) { // print header - out.printf("%s%n",createHeaderString()); - } - return new Pair(0l,0l); - } - - public Pair reduce(Pair newCvg, Pair prevCvg) { - return new Pair(prevCvg.first + newCvg.first, prevCvg.second + newCvg.second); - } - - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - AlignmentContext filteredContext; - if (this.getMinQualityScore() <= 0) { - filteredContext = context; - } else { - Pair,List> readsFilteredByQuality = filterByQuality(context.getReads(),context.getOffsets(), this.getMinQualityScore()); - filteredContext = new AlignmentContext(context.getLocation(),readsFilteredByQuality.getFirst(),readsFilteredByQuality.getSecond()); - } - ReadOffsetQuad splitReads = PoolUtils.splitReadsByReadDirection(filteredContext.getReads(), filteredContext.getOffsets()); - Pair,List>,Pair,List>> readsByDirection = new Pair(new Pair(splitReads.getFirstReads(),splitReads.getSecondReads()),new Pair(splitReads.getFirstOffsets(),splitReads.getSecondOffsets())); - if ( ! suppress_printing && ! outputUnthreshCvg ) { - Pair powers = calculatePower(readsByDirection, useBootstrap, filteredContext); - out.printf("%s %d %d %d %d %d %d %f %f %f%n", filteredContext.getLocation(), readsByDirection.getFirst().getFirst().size(), readsByDirection.getFirst().getSecond().size(), - filteredContext.getReads().size(), powers.getSecond()[0], powers.getSecond()[1], powers.getSecond()[2], - powers.getFirst()[0], powers.getFirst()[1], powers.getFirst()[2]); - } else if (! suppress_printing && outputUnthreshCvg) { - Pair powers = calculatePower(readsByDirection, useBootstrap, filteredContext); - ReadOffsetQuad readsByDir = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); - Pair,List>,Pair,List>> ufReadsByDirection = new Pair(new Pair(readsByDir.getFirstReads(), readsByDir.getSecondReads()), new Pair(readsByDir.getFirstOffsets(), readsByDir.getSecondOffsets())); - out.printf("%s %d %d %d %d %d %d %d %d %d %f %f %f%n", filteredContext.getLocation(), ufReadsByDirection.getFirst().getFirst().size(), - ufReadsByDirection.getFirst().getSecond().size(), context.getReads().size(), - readsByDirection.getFirst().getFirst().size(), readsByDirection.getFirst().getSecond().size(), - filteredContext.getReads().size(), powers.getSecond()[0], powers.getSecond()[1], powers.getSecond()[2], - powers.getFirst()[0], powers.getFirst()[1], powers.getFirst()[2]); - } - return new Pair(readsByDirection.getFirst().getFirst().size(),readsByDirection.getFirst().getSecond().size()); - } - - // helper methods - - public Pair calculatePower(Pair,List>,Pair,List>> dirReads, boolean bootStrap, AlignmentContext context) { - Pair powerAndMedianQ; - if( bootStrap ) { - powerAndMedianQ = calculateDirectionalPowersByBootstrap(dirReads,lodThreshold,context); - } else { - powerAndMedianQ = calculateDirectionalPowersTheoretical(dirReads,lodThreshold,context); - } - - return powerAndMedianQ; - } - - public Pair calculateDirectionalPowersTheoretical(Pair,List>,Pair,List>> dirReads, double thresh, AlignmentContext context) { - byte[] medQs = getMedianQScores(dirReads, context); - double powerForward = calculatePowerTheoretical(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),medQs[0],thresh); - double powerReverse = calculatePowerTheoretical(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond(),medQs[1],thresh); - double powerCombined = calculatePowerTheoretical(context.getReads(),context.getOffsets(),medQs[2],thresh); - double [] powers = {powerForward,powerReverse,powerCombined}; - return new Pair(powers, medQs); - } - - public byte[] getMedianQScores(Pair,List>,Pair,List>> dirReads, AlignmentContext context) { - byte medQForward = getMedianQualityScore(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst()); - byte medQReverse = getMedianQualityScore(dirReads.getFirst().getSecond(), dirReads.getSecond().getSecond()); - byte medQCombined = getMedianQualityScore(context.getReads(),context.getOffsets()); - byte [] medQs = {medQForward, medQReverse, medQCombined}; - return medQs; - } - - public byte getMedianQualityScore(List reads, List offsets) { - return ListUtils.getQScoreMedian(reads,offsets); - } - - public double calculatePowerTheoretical(List reads, List offsets, byte medQ, double thresh) { - double p_error = QualityUtils.qualToErrorProb(medQ); - int depth = reads.size(); - if(depth <= 0) { - return 0; - } - double snpProp = this.getSNPProportion(1); - - // variable names from here on out come from the mathematics of a log likelihood test - // with binomial probabilities, of observing k bases consistent with the same SNP - // given that SNP occurs in one of the alleles, versus it does not occur. - // Numerator and denominator will each have a term raised to the kth power - // and a term raised to the (depth - kth) (or d-kth) power. Thus the names. - // Taking a log then brings those powers out as coefficients, where they can be solved for. - - - double kterm_num = Math.log10( snpProp * (1 - p_error) + (1 - snpProp) * (p_error/3) ); - double kterm_denom = Math.log10( p_error/3 ); - double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) ); - double dkterm_denom = Math.log10( 1 - p_error); - - int kaccept = (int) Math.ceil( ( thresh - ( (double) depth ) * ( dkterm_num - dkterm_denom ) )/( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) ); - - // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs - // we can optimize this by checking to see which sum is smaller - - double pow = 0; - - if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth] - return MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp); - } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept] - return 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp); - } - } - - public double getSNPProportion(int numSNPAlleles) { - return ((double) numSNPAlleles / (2.0 * num_individuals)); - } - - public Pair calculateDirectionalPowersByBootstrap(Pair,List>,Pair,List>> dirReads, double thresh, AlignmentContext context) { - double powerForward = bootstrapPower(dirReads.getFirst().getFirst(),dirReads.getSecond().getFirst(),thresh); - double powerReverse = bootstrapPower(dirReads.getFirst().getSecond(),dirReads.getSecond().getSecond(),thresh); - double powerCombined = bootstrapPower(context.getReads(),context.getOffsets(),thresh); - double[] powers = {powerForward, powerReverse, powerCombined}; - return new Pair(powers,getMedianQScores(dirReads,context)); - } - - public double bootstrapPower(List reads, List offsets, double thresh) { - double power = 0.0; - for ( int boot = 0; boot < BOOTSTRAP_ITERATIONS; boot++) { - Pair,List>,Pair,List>> snpReadsAndRefReads = coinTossPartition(reads,offsets,this.getSNPProportion(1)); - if( PoolUtils.calculateLogLikelihoodOfSample(snpReadsAndRefReads, num_individuals) > thresh) { - power += 1.0/BOOTSTRAP_ITERATIONS; - } - } - - return power; - } - - public String createHeaderString() { - String headString; - if(! outputUnthreshCvg ) { - headString = "Chrom:Pos ForwardCoverage ReverseCoverage CombinedCoverage ForwardMedianQ ReverseMedianQ CombinedMedianQ PowerForward PowerReverse PowerCombined"; - } else { - headString = "Chrom:Pos ForwardCoverage ReverseCoverage CombinedCoverage ForwardCoverageAboveThreshold ReverseCoverageAboveThreshold CombinedCoverageAboveThreshold ForwardMedianQ ReverseMedianQ CombinedMedianQ PowerForward PowerReverse PowerCombined"; - } - return headString; - } - - public byte getMinQualityScore() { - return minQScore; - } - - public Pair,List> filterByQuality(List reads, List offsets, byte qMin) { - return PoolUtils.thresholdReadsByQuality(reads,offsets,qMin); - } - - // class methods - - public static Pair,List>,Pair,List>> coinTossPartition(List reads, List offsets, double snpProb) { - Pair,List>,Pair,List>> partitionedReads; - if( reads == null || offsets == null ) { - partitionedReads = null; - } else { - ArrayList snpReads = new ArrayList(); - ArrayList refReads = new ArrayList(); - ArrayList snpOff = new ArrayList(); - ArrayList refOff = new ArrayList(); - for(int readNo = 0; readNo < reads.size(); readNo++) { - if ( Math.random() < snpProb) { - snpReads.add(reads.get(readNo)); - snpOff.add(offsets.get(readNo)); - } else { - refReads.add(reads.get(readNo)); - refOff.add(offsets.get(readNo)); - } - } - partitionedReads= new Pair(new Pair(snpReads,refReads), new Pair(snpOff, refOff)); - } - - return partitionedReads; - } - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerAndCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerAndCoverageWalker.java deleted file mode 100644 index 8e23de3e1..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerAndCoverageWalker.java +++ /dev/null @@ -1,262 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.poolseq; - -import org.broadinstitute.sting.playground.utils.SQuad; -import org.broadinstitute.sting.playground.utils.ReadOffsetQuad; -import org.broadinstitute.sting.playground.utils.PoolUtils; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.*; -import net.sf.samtools.SAMRecord; - -import java.io.PrintStream; -import java.io.FileNotFoundException; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: Ghost - * Date: Sep 12, 2009 - * Time: 12:20:42 PM - * To change this template use File | Settings | File Templates. - */ -@By(DataSource.REFERENCE) -public class PowerAndCoverageWalker extends LocusWalker, SQuad> { - - @Argument(fullName="suppressLocusPrinting", doc="Walker does not output locus data to any file. Only total coverage information is given at the end", required=false) - public boolean suppressPrinting = false; - - @Argument(fullName="outputFile", shortName="of", doc="Locus information is printed to this file instead of standard out", required = false) - public String outputFile = null; - - @Argument(fullName="bootStrapIterations", shortName="bs", doc="Use a bootstrap method with this many iterations", required=false) - public int bootstrapIterations = 0; - - @Argument(fullName="lodThreshold", shortName="lod", doc="Threshold for log likelihood ratio to be called a SNP. Defaults to 3.0", required = false) - public double lodThresh = 3.0; - - @Argument(fullName="minimumQScore", shortName="qm", doc="Use bases whose phred (Q) score meets or exceeds this number. Defaults to 0", required = false) - public byte minQ = 0; - - @Argument(fullName="aboveQScoreOutputOnly", doc="Output only the information for reads that exceed the q-score threshold", required=false) - public boolean aboveQScoreOutputOnly = false; - - @Argument(fullName="outputRawStatistics", shortName="rs", doc="Walker outputs the median quality score and power for the un-thresholded reads as well as those that are thresholded", required=false) - public boolean outputRawStatistics = false; - - @Argument(fullName="poolSize", shortName="ps", doc="Number of individuals in the pool", required = true) - public int numIndividuals = 0; - - protected PrintStream outputWriter = null; - - public void initialize() { - if(numIndividuals <= 0) { - throw new StingException("Pool size must be greater than 1. You input "+numIndividuals); - } - - if( outputFile != null ) { - try { - outputWriter = new PrintStream(outputFile); - } catch (FileNotFoundException e) { - String errMsg = "The filepath input as an argument to -of, "+outputFile+" was a directory or could not be found or created."; - throw new StingException(errMsg, e); - } - outputWriter.printf("%s%n", generateHeaderString()); - } else if( ! suppressPrinting ) { - out.printf("%s%n", generateHeaderString()); - } - } - - public SQuad reduceInit() { - return new SQuad(0l,0l,0l,0l); - } - - /* REDUCE - * Add up filtered and unfiltered coverage in the forward - * and reverse directions. - */ - public SQuad reduce( SQuad newCvg, SQuad oldCvg) { - return new SQuad(newCvg.getFirst() + oldCvg.getFirst(), newCvg.getSecond() + oldCvg.getSecond(), - newCvg.getThird() + oldCvg.getThird(), newCvg.getFourth() + oldCvg.getFourth()); - } - /* MAP - * Get the coverage in each direction above and below threshold and calculate the estimated power - * if printing is not suppressed print this information to the file - * pass coverage information on to Reduce - */ - public SQuad map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - ReadOffsetQuad readsByDirection = splitReadsByDirection(context.getReads(),context.getOffsets()); - ReadOffsetQuad readsByDirectionAboveThresh; - - if(minQ > 0) { - readsByDirectionAboveThresh = thresholdReadsByQuality(readsByDirection); - } else { - readsByDirectionAboveThresh = readsByDirection; - } - - - SQuad medianQScoresAboveThresh = getMedianQScores(readsByDirectionAboveThresh); - SQuad powerAboveThresh = calculatePower(readsByDirectionAboveThresh, medianQScoresAboveThresh); - SQuad medianQScoresUnthresh; - SQuad powerUnthresh; - - if(! outputRawStatistics) { - medianQScoresUnthresh = null; - powerUnthresh = null; - } else { - medianQScoresUnthresh = getMedianQScores(readsByDirection); - powerUnthresh= calculatePower(readsByDirection, medianQScoresUnthresh); - } - - if ( ! suppressPrinting && outputWriter == null) { - out.printf("%s%n",outputString(readsByDirection,readsByDirectionAboveThresh,powerAboveThresh, powerUnthresh, - medianQScoresAboveThresh, medianQScoresUnthresh, context.getLocation())); - } else if (! suppressPrinting && outputWriter != null) { - outputWriter.printf("%s%n",outputString(readsByDirection,readsByDirectionAboveThresh,powerAboveThresh, powerUnthresh, - medianQScoresAboveThresh, medianQScoresUnthresh, context.getLocation())); - } - - return new SQuad(readsByDirection.numReadsFirst(), readsByDirection.numReadsSecond(), - readsByDirectionAboveThresh.numReadsFirst(), readsByDirectionAboveThresh.numReadsSecond()); - - } - - // private helper methods - - /* - * Make different headers for the output file depending on printing-related command line arguments - */ - private String generateHeaderString() { - String header; - if (minQ <= 0) { - header = "Chrom:Pos CoverageFwd CoverageRev CoverageCom MedianQFwd MedianQRev MedianQCom PowerFwd PowerRev PowerCom"; - } else if (! aboveQScoreOutputOnly && ! outputRawStatistics){ - String m = Byte.toString(minQ); - header = "Chrom:Pos CoverageFwd CoverageRev CoverageCom CoverageAboveQ"+m+"Fwd CoverageAboveQ"+m+"Rev CoverageAboveQ"+m+"Com MedianQFwd MedianQRev MedianQCom PowerFwd PowerRev PowerCom"; - } else if (! outputRawStatistics){ - header = "Chrom:Pos CoverageFwd CoverageRev CoverageCom MedianQFwd MedianQRev MedianQCom PowerFwd PowerRev PowerCom"; - } else { - String m = Byte.toString(minQ); - header = "Chrom:Pos CoverageFwd CoverageRev CoverageCom CoverageAboveQ"+m+"Fwd CoverageAboveQ"+m+"Rev "+ - "CoverageAboveQ"+m+"Com RawMedianQFwd RawMedianQRev RawMedianQCom MedianQAboveQ"+m+"Fwd "+ - "MedianQAboveQ"+m+"Rev MedianQAboveQ"+m+"Com RawPowerFwd RawPowerRev RawPowerCom "+ - "PowerAboveQ"+m+"Fwd PowerAboveQ"+m+"Rev PowerAboveQ"+m+"Com"; - } - - return header; - } - /* - * Make different output strings depending on printing-related command-line options - */ - private String outputString(ReadOffsetQuad reads, ReadOffsetQuad threshReads, SQuad pow, SQuad rawPow, SQuad medQ, SQuad rawQ, GenomeLoc loc) { - String outputString; - if(minQ <= 0) { - outputString = String.format("%s %d %d %d %d %d %d %f %f %f", loc.toString(), reads.numReadsFirst(), reads.numReadsSecond(), reads.numReads(), - medQ.getFirst(), medQ.getSecond(), medQ.getThird(), pow.getFirst(), pow.getSecond(), pow.getThird()); - } else if (! aboveQScoreOutputOnly && ! outputRawStatistics ){ - outputString = String.format("%s %d %d %d %d %d %d %d %d %d %f %f %f", loc.toString(), reads.numReadsFirst(), reads.numReadsSecond(), reads.numReads(), - threshReads.numReadsFirst(), threshReads.numReadsSecond(), threshReads.numReads(), medQ.getFirst(), - medQ.getSecond(), medQ.getThird(), pow.getFirst(), pow.getSecond(), pow.getThird()); - } else if (! outputRawStatistics ){ - outputString = String.format("%s %d %d %d %d %d %d %f %f %f", loc.toString(), threshReads.numReadsFirst(), threshReads.numReadsSecond(), - threshReads.numReads(), medQ.getFirst(), medQ.getSecond(), medQ.getThird(), pow.getFirst(), - pow.getSecond(), pow.getThird()); - } else { - outputString = String.format("%s %d %d %d %d %d %d %d %d %d %d %d %d %f %f %f %f %f %f", loc.toString(), reads.numReadsFirst(), - reads.numReadsSecond(), reads.numReads(), threshReads.numReadsFirst(), threshReads.numReadsSecond(), - threshReads.numReads(), rawQ.getFirst(), rawQ.getSecond(), rawQ.getThird(), medQ.getFirst(), - medQ.getSecond(), medQ.getThird(), rawPow.getFirst(), rawPow.getSecond(), rawPow.getThird(), - pow.getFirst(), pow.getSecond(), pow.getThird()); - } - return outputString; - } - - private ReadOffsetQuad splitReadsByDirection(List reads, List offsets) { - return PoolUtils.splitReadsByReadDirection(reads,offsets); - } - - private ReadOffsetQuad thresholdReadsByQuality(ReadOffsetQuad readQuad) { - return new ReadOffsetQuad(PoolUtils.thresholdReadsByQuality(readQuad.getFirstReadOffsetPair(), minQ), - PoolUtils.thresholdReadsByQuality(readQuad.getSecondReadOffsetPair(), minQ)); - } - - private SQuad getMedianQScores(ReadOffsetQuad reads) { - return new SQuad(ListUtils.getQScoreMedian(reads.getFirstReads(), reads.getFirstOffsets()), - ListUtils.getQScoreMedian(reads.getSecondReads(), reads.getSecondOffsets()), - ListUtils.getQScoreMedian(reads.getReadsCombined(), reads.getOffsetsCombined()), - (byte) 0); - } - - private SQuad calculatePower(ReadOffsetQuad readQuad, SQuad medianQs) { - SQuad power; - if( bootstrapIterations > 0 ) { - power = bootstrapPowerByDirection(readQuad); - } else { - power = theoreticalPowerByDirection(readQuad,medianQs); - } - return power; - } - - private SQuad theoreticalPowerByDirection(ReadOffsetQuad readQuad, SQuad medianQs) { - return new SQuad( theoreticalPower(readQuad.numReadsFirst(), medianQs.getFirst()), - theoreticalPower(readQuad.numReadsSecond(), medianQs.getSecond()), - theoreticalPower(readQuad.numReads(), medianQs.getThird()), - 0.0 ); - } - - private SQuad bootstrapPowerByDirection(ReadOffsetQuad readQuad) { - return new SQuad ( bootstrapPower(readQuad.getFirstReads(), readQuad.getFirstOffsets()), - bootstrapPower(readQuad.getSecondReads(), readQuad.getSecondOffsets()), - bootstrapPower(readQuad.getReadsCombined(), readQuad.getOffsetsCombined()), - 0.0 ); - } - - private double theoreticalPower(int depth, byte q) { - double power; - if( depth <= 0 ) { - power = 0.0; - } else { - double p_error = QualityUtils.qualToErrorProb(q); - double snpProp = getSNPProportion(); - double kterm_num = Math.log10( snpProp * (1 - p_error) + (1 - snpProp) * (p_error/3) ); - double kterm_denom = Math.log10( p_error/3 ); - double dkterm_num = Math.log10( snpProp * (p_error/3) + (1 - snpProp) * (1 - p_error) ); - double dkterm_denom = Math.log10( 1 - p_error); - - int kaccept = (int) Math.ceil( ( lodThresh - ( (double) depth ) * ( dkterm_num - dkterm_denom ) ) / - ( kterm_num - kterm_denom- dkterm_num + dkterm_denom ) ); - - // we will reject the null hypothesis if we see kaccept or more SNPs, the power is the probability that this occurs - // we can optimize this by checking to see which sum is smaller - - if ( depth - kaccept < kaccept ) {// kaccept > depth/2 - calculate power as P[hits between kaccept and depth] - power = MathUtils.cumBinomialProbLog(kaccept, depth, depth, snpProp); - } else { // kaccept < depth/2 - calculate power as 1-P[hits between 0 and kaccept] - power = 1-MathUtils.cumBinomialProbLog(0,kaccept,depth,snpProp); - } - } - - return power; - } - - private double bootstrapPower(List reads, List offsets) { - double power = 0.0; - double snpProp = getSNPProportion(); - for ( int boot = 0; boot < bootstrapIterations; boot ++) { - ReadOffsetQuad partition = PoolUtils.coinTossPartition(reads, offsets, getSNPProportion()); - if ( PoolUtils.calculateLogLikelihoodOfSample(partition, snpProp) >= lodThresh ) { - power += 1.0/bootstrapIterations; - } - } - return power; - } - - private double getSNPProportion() { - return 1/(2.0*numIndividuals); - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerBelowFrequencyWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerBelowFrequencyWalker.java index 651117cf6..72dd8b0fe 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerBelowFrequencyWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/PowerBelowFrequencyWalker.java @@ -44,6 +44,12 @@ public class PowerBelowFrequencyWalker extends LocusWalker { @Argument(fullName="minimumMappingQuality", shortName="mmq", doc="Only use reads above this mapping quality in the power calculation", required=false) int minMappingQuality = -1; + @Argument(fullName="ignoreForwardReads",doc="Use the forward reads at a site. Defaults to true.", required = false) + boolean ignoreForwardReads = true; + + @Argument(fullName="ignoreReverseReads",doc="Use the reverse reads at a site. Defaults to true.", required = false) + boolean ignoreReverseReads = true; + public void initialize() { if ( alleleFreq < 1 ) { String err = "Allele frequency (-af) must be greater than or equal to one."; @@ -64,6 +70,16 @@ public class PowerBelowFrequencyWalker extends LocusWalker { String output = String.format("%s", context.getLocation().toString()); // threshold reads if necessary + if ( ignoreForwardReads && ignoreReverseReads) { + throw new StingException("User has elected to ignore both forward and reverse reads. Power is zero."); + } + else if ( ! ignoreForwardReads && ignoreReverseReads ) { + org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad rq = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); + context = new AlignmentContext(context.getLocation(),rq.getFirstReads(),rq.getFirstOffsets()); + } else if ( ignoreForwardReads && ! ignoreReverseReads ) { + org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad rq = PoolUtils.splitReadsByReadDirection(context.getReads(),context.getOffsets()); + context = new AlignmentContext(context.getLocation(),rq.getSecondReads(),rq.getSecondOffsets()); + } if ( minQ > 0 ) { Pair, List> thresh = PoolUtils.thresholdReadsByQuality(context.getReads(),context.getOffsets(),minQ); diff --git a/java/src/org/broadinstitute/sting/playground/utils/Quad.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/Quad.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/utils/Quad.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/Quad.java index 95780ecad..8ed3b4c30 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/Quad.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/Quad.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.utils; +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; import org.broadinstitute.sting.utils.Pair; diff --git a/java/src/org/broadinstitute/sting/playground/utils/ReadOffsetQuad.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ReadOffsetQuad.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/utils/ReadOffsetQuad.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ReadOffsetQuad.java index cb003e7cf..d554683b9 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/ReadOffsetQuad.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/ReadOffsetQuad.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.utils; +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/java/src/org/broadinstitute/sting/playground/utils/SQuad.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/SQuad.java similarity index 88% rename from java/src/org/broadinstitute/sting/playground/utils/SQuad.java rename to java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/SQuad.java index 110955ade..112fab200 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/SQuad.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/poolseq/SQuad.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.utils; +package org.broadinstitute.sting.playground.gatk.walkers.poolseq; /** * Created by IntelliJ IDEA. diff --git a/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java index 569621d13..5269d35ff 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java +++ b/java/src/org/broadinstitute/sting/playground/utils/PoolUtils.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.playground.gatk.walkers.poolseq.ReadOffsetQuad; /** * Created by IntelliJ IDEA. @@ -21,39 +22,6 @@ public class PoolUtils { private PoolUtils() { } - public static final int BASE_A_OFFSET = 0; - public static final int BASE_C_OFFSET = 1; - public static final int BASE_G_OFFSET = 2; - public static final int BASE_T_OFFSET = 3; - public static final int BASE_INDEXED_ARRAY_SIZE = 4; - - public static Pair, List> splitReadsByIndels( List reads, List offsets, boolean returnBases ) { - - List baseReads = new ArrayList(); - List indelReads = new ArrayList(); - List baseOffsets = new ArrayList(); - List indelOffsets = new ArrayList(); - - - for ( int r = 0; r < reads.size(); r ++ ) { - SAMRecord read = reads.get(r); - int offset = offsets.get(r); - if (BaseUtils.isRegularBase( (char) read.getReadBases()[offset] ) ) { - baseReads.add(read); - baseOffsets.add(offset); - } else { - indelReads.add(read); - indelOffsets.add(offset); - } - } - - if (returnBases) { - return new Pair,List>(baseReads,baseOffsets); - } else { - return new Pair,List>(indelReads,indelOffsets); - } - } - public static ReadOffsetQuad splitReadsByReadDirection(List reads, List offsets) { ArrayList forwardReads; ArrayList reverseReads; @@ -66,10 +34,10 @@ public class PoolUtils { forwardOffsets = null; reverseOffsets = null; } else { - forwardReads = new ArrayList(); - reverseReads = new ArrayList(); - forwardOffsets = new ArrayList(); - reverseOffsets = new ArrayList(); + forwardReads = new ArrayList(); + reverseReads = new ArrayList(); + forwardOffsets = new ArrayList(); + reverseOffsets = new ArrayList(); for (int readNo = 0; readNo < reads.size(); readNo++) { if (reads.get(readNo).getReadNegativeStrandFlag()) { @@ -85,44 +53,6 @@ public class PoolUtils { return new ReadOffsetQuad(forwardReads,forwardOffsets,reverseReads,reverseOffsets); } - public static Pair[], List[]> splitReadsByBase(List reads, List offsets) { - ArrayList[] readsByBase; - ArrayList[] offsetsByBase; - if (reads == null) { - readsByBase = null; - offsetsByBase = null; - } else { - readsByBase = new ArrayList[4]; - offsetsByBase = new ArrayList[4]; - for (int readNum = 0; readNum < reads.size(); readNum++) { - switch (reads.get(readNum).getReadBases()[offsets.get(readNum)]) { - case 'A': - case 'a': - readsByBase[BASE_A_OFFSET].add(reads.get(readNum)); - offsetsByBase[BASE_A_OFFSET].add(offsets.get(readNum)); - break; - case 'C': - case 'c': - readsByBase[BASE_C_OFFSET].add(reads.get(readNum)); - offsetsByBase[BASE_C_OFFSET].add(offsets.get(readNum)); - break; - case 'G': - case 'g': - readsByBase[BASE_G_OFFSET].add(reads.get(readNum)); - offsetsByBase[BASE_G_OFFSET].add(offsets.get(readNum)); - break; - case 'T': - case 't': - readsByBase[BASE_T_OFFSET].add(reads.get(readNum)); - offsetsByBase[BASE_T_OFFSET].add(offsets.get(readNum)); - break; - default: - break; // TODO: INDEL AWARENESS - } - } - } - return new Pair(readsByBase, offsetsByBase); - } public static Pair, List> thresholdReadsByQuality(List reads, List offsets, byte qThresh) { List threshReads; @@ -134,8 +64,8 @@ public class PoolUtils { threshReads = reads; threshOffsets = offsets; } else { - threshReads = new ArrayList(); - threshOffsets = new ArrayList(); + threshReads = new ArrayList(); + threshOffsets = new ArrayList(); for (int readNo = 0; readNo < reads.size(); readNo++) { if (reads.get(readNo).getBaseQualities()[offsets.get(readNo)] >= qThresh) { @@ -172,109 +102,5 @@ public class PoolUtils { return new Pair,List>(goodMapReads,goodMapOffsets); } - public static Pair,List> thresholdReadsByQuality(Pair,List> readPair, byte qThresh) { - return thresholdReadsByQuality(readPair.getFirst(),readPair.getSecond(),qThresh); - } - - public static int getBaseOffset(char base) { - switch (base) { - case 'A': - case 'a': - return getBaseAOffset(); - case 'C': - case 'c': - return getBaseCOffset(); - case 'G': - case 'g': - return getBaseGOffset(); - case 'T': - case 't': - return getBaseTOffset(); - default: - return -1; - } - //TODO: indel offsets - } - - public static int getBaseAOffset() { - return BASE_A_OFFSET; - } - - public static int getBaseCOffset() { - return BASE_C_OFFSET; - } - - public static int getBaseGOffset() { - return BASE_G_OFFSET; - } - - public static int getBaseTOffset() { - return BASE_T_OFFSET; - } - - public static List getReadBaseQualities(List reads, List offsets) { - List qualities = new ArrayList(reads.size()); - for (int readNo = 0; readNo < reads.size(); readNo++) { - qualities.add(reads.get(readNo).getBaseQualities()[offsets.get(readNo)]); - } - - return qualities; - } - - public static double calculateLogLikelihoodOfSample(ReadOffsetQuad readsBySupport, double alleleFreq) { - List qSNP = getReadBaseQualities(readsBySupport.getFirstReads(), readsBySupport.getFirstOffsets()); - List qRef = getReadBaseQualities(readsBySupport.getSecondReads(), readsBySupport.getSecondOffsets()); - Pair logsumSNP = qListToSumLogProbabilities(true, qSNP, 1/alleleFreq); - Pair logsumRef = qListToSumLogProbabilities(false, qRef, 1/alleleFreq); - return 0.0 - logsumSNP.getFirst() - logsumRef.getFirst() + logsumSNP.getSecond() + logsumRef.getSecond(); - } - - public static double calculateLogLikelihoodOfSample(Pair,List>,Pair,List>> snpReadsRefReads, int nIndivids) { - List qListSnps = getReadBaseQualities(snpReadsRefReads.getFirst().getFirst(),snpReadsRefReads.getSecond().getFirst()); - List qListRefs = getReadBaseQualities(snpReadsRefReads.getFirst().getSecond(),snpReadsRefReads.getSecond().getSecond()); - Pair logsumSNP = qListToSumLogProbabilities(true,qListSnps, 2.0*nIndivids); - Pair logsumRef = qListToSumLogProbabilities(false,qListRefs, 2.0*nIndivids); - - return 0.0 - logsumSNP.first - logsumRef.first + logsumSNP.second + logsumRef.second; - } - - - public static Pair qListToSumLogProbabilities(boolean listRepresentsSNPObservations, List qList, double denom) - { - double logProbObserveXAndSNPTrue = 0; // note "error" for SNP is observing a ref - double logProbObserveXAndRefTrue = 0;// and "error" for ref is observing a SNP - - for (byte qual : qList) { - double p_err = QualityUtils.qualToErrorProb(qual); - if (listRepresentsSNPObservations) { - logProbObserveXAndSNPTrue += Math.log10((1 - p_err) / denom +((denom - 1)*p_err) / denom); - logProbObserveXAndRefTrue += Math.log10(p_err); - } else { - logProbObserveXAndSNPTrue += Math.log10((denom - 1) * (1 - p_err)/denom + p_err/denom); - logProbObserveXAndRefTrue+= Math.log10(1 -p_err); - } - } - - return new Pair(logProbObserveXAndSNPTrue,logProbObserveXAndRefTrue); - } - - public static ReadOffsetQuad coinTossPartition(List reads, List offsets, double alleleFreq) { - ArrayList snpReads = new ArrayList(); - ArrayList snpOffsets = new ArrayList(); - ArrayList refReads = new ArrayList(); - ArrayList refOffsets = new ArrayList(); - - for( int i = 0; i < reads.size(); i ++ ) { - if ( Math.random() < alleleFreq ) { - snpReads.add(reads.get(i)); - snpOffsets.add(offsets.get(i)); - } else { - refReads.add(reads.get(i)); - refOffsets.add(offsets.get(i)); - } - } - - return new ReadOffsetQuad(snpReads,snpOffsets,refReads,refOffsets); - } } diff --git a/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java index 36ea48f89..ed56588b1 100755 --- a/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java @@ -33,6 +33,8 @@ public class ReadBackedPileup extends BasicPileup { this.offsets = offsets; } + + public int size() { return reads.size(); } public List getReads() { return reads; } public List getOffsets() { return offsets; } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaIntegrationTest.java index 382262dc3..25a52554f 100644 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/BaseTransitionTableCalculatorJavaIntegrationTest.java @@ -13,8 +13,14 @@ import java.util.Arrays; * To change this template use File | Settings | File Templates. */ public class BaseTransitionTableCalculatorJavaIntegrationTest extends WalkerTest{ - - public static final String OUTPUT_MD5 = "856a3124fb078b4d820f935369ec53cc"; + // MD5s last computed 10/20 at revision 1877 + public static final String OUTPUT_MD5_STANDARD = "856a3124fb078b4d820f935369ec53cc"; + public static final String OUTPUT_MD5_3MISMATCHES = "847375e3d40415d237c627a56d163760"; + public static final String OUTPUT_MD5_LOWMAPPINGQUALITY = "d7e490168dfef8eb4185b596f55bfaa8"; + public static final String OUTPUT_MD5_LOWQUALITYSCORE = "e3f3c3875936b0df7a8899c2ef5d1dd2"; + public static final String OUTPUT_MD5_LOWCONFIDENTREFTHRESHOLD = "521a9fa7716ed22550c2ba3fe3409070"; + public static final String OUTPUT_MD5_HIGHCONFIDENTREFTHRESHOLD = "8ab6d389fc494881736e9a58126c2f1b"; + public static final String OUTPUT_MD5_ALLARGUMENTS = "f45481946d7a5c70078d432b0baff083"; public static final String LOCUS = "1:10,000,000-10,200,000"; public static final String BAM_FILE = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam"; public static final String REFERENCE = "/broad/1KG/reference/human_b36_both.fasta"; @@ -22,7 +28,49 @@ public class BaseTransitionTableCalculatorJavaIntegrationTest extends WalkerTest @Test public void testBaseTransitionTableCalculatorJava() { String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5)); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_STANDARD)); executeTest("testBaseTransitionTableCalculatorJava", spec); } + + @Test + public void testBaseTransitionTableCalculatorJavaAdditionalMismatches() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+" --maxNumMismatches 3"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_3MISMATCHES)); + executeTest("BaseTransitionTableCalculatorJava: Additional Mismatches",spec); + } + + @Test + public void testBaseTransitionCalculatorJavaLowMapingQuality() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+ " --minMappingQuality 5"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_LOWMAPPINGQUALITY)); + executeTest("BaseTransitionTableCalculatorJava: Low Mapping Quality",spec); + } + + @Test + public void testBaseTransitionCalculatorJavaLowQualityScore() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+ " --minQualityScore 5"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_LOWQUALITYSCORE)); + executeTest("BaseTransitionTableCalculatorJava: Low Quality Score",spec); + } + + @Test + public void testBaseTransitionCalculatorJavaLowConfidentRefThreshold() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+ " --confidentRefThreshold 2"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_LOWCONFIDENTREFTHRESHOLD)); + executeTest("BaseTransitionTableCalculatorJava: Low Ref Threshold",spec); + } + + @Test + public void testBaseTransitionCalculatorJavaHighConfidentRefThreshold() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+ " --confidentRefThreshold 8"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_HIGHCONFIDENTREFTHRESHOLD)); + executeTest("BaseTransitionTableCalculatorJava: Low Ref Threshold",spec); + } + + @Test + public void testBaseTransitionCalculatorJavaAllArguments() { + String args = "-T BaseTransitionTableCalculatorJava -o %s -I "+BAM_FILE+" -L "+LOCUS+" -R "+REFERENCE+ " --confidentRefThreshold 2 --minQualityScore 5 --minMappingQuality 5 --maxNumMismatches 3"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(args,1,Arrays.asList(OUTPUT_MD5_ALLARGUMENTS)); + executeTest("BaseTransitionTableCalculatorJava: Low Ref Threshold, Low Q Score, Low Mapping Quality, Additional Mismatches",spec); + } } diff --git a/scala/src/BaseTransitionTableCalculator.scala b/scala/src/BaseTransitionTableCalculator.scala index 500f3832b..b07c539ce 100755 --- a/scala/src/BaseTransitionTableCalculator.scala +++ b/scala/src/BaseTransitionTableCalculator.scala @@ -41,8 +41,9 @@ class TransitionTable() { } def header(): String = { - return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray) - } + //return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray) + return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray) + } override def toString(): String = { return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray) @@ -106,7 +107,10 @@ class BaseTransitionTableCalculator extends LocusWalker[Unit,Int] { nonRefBases.length match { case 1 => - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1) + //if ( goodRead(nonRefRead,offsets.head) ) { + //printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) + //} + addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1) addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD ) case 2 => addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2) @@ -128,7 +132,7 @@ class BaseTransitionTableCalculator extends LocusWalker[Unit,Int] { def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = { if ( goodRead(nonRefRead, offset) ) { - //printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", call, nonRefBases, refBases, nonRefRead.format, offsets.head) + //printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) //println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality) if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) { // it's on the reverse strand @@ -180,7 +184,7 @@ class BaseTransitionTableCalculator extends LocusWalker[Unit,Int] { if ( call == null ) return false; - return ! call.getFirst().get(0).isVariant() && call.getFirst().get(0).getNegLog10PError() > MIN_LOD + return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD } def reduceInit(): Int = {