diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java new file mode 100755 index 000000000..fc0c4c69b --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSClusteredZScoreWalker.java @@ -0,0 +1,159 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.List; +import java.util.LinkedList; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Sep 29, 2009 + * Time: 1:13:25 PM + * To change this template use File | Settings | File Templates. + */ +public class NQSClusteredZScoreWalker extends LocusWalker { + static final int WIN_SIDE_SIZE = 4; + static final int Z_SCORE_MAX = 4; + static final int Z_SCORE_MULTIPLIER = 3; // bins are Z_SCORE * (this) rounded to the nearst int + static final int MM_OFFSET = 1; + static final int MATCH_OFFSET = 0; + static final int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE; + + protected int WINDOW; + + public void initialize() { + WINDOW = 2*WIN_SIDE_SIZE+1; + } + + public int[][][] reduceInit() { + int[][][] q = new int[Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1][MAX_Q_SCORE][2]; + for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1; i ++ ) { + for ( int j = 0; j < MAX_Q_SCORE; j ++ ) { + for ( int k = 0; k < 2; k ++ ) { + q[i][j][k] = 0; + } + } + } + + return q; + } + + + public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + return new LocalMapType(context, ref, tracker); + } + + public int[][][] reduce(LocalMapType map, int[][][] qScores) { + for ( int i = 0; i < map.numReads(); i ++ ) { + SAMRecord read = map.context.getReads().get(i); + int offset = map.context.getOffsets().get(i); + if ( isMismatch(read, offset, map.ref) ) { + qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MM_OFFSET]++; + } else { + qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MATCH_OFFSET]++; + } + } + + return qScores; + } + + public void onTraversalDone( int[][][] zScoreBins ) { + out.print( header() ); + for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER; i ++ ) { + out.print( formatData(zScoreBins[i],i) ); + } + } + + public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) { + return ((char) read.getReadBases()[offset]) == ref.getBase(); + } + + public int getQScoreAsInt( SAMRecord read, int offset ) { + return (int) read.getBaseQualities()[offset]; + } + + public int calcZScoreBin( SAMRecord read, int offset ) { + Pair meanVar = calcWindowMeanVariance(read, offset); + double rawZ = Math.abs((getQScoreAsInt(read, offset) - meanVar.first)/Math.sqrt(meanVar.second)); + int zBin; + if ( rawZ < Z_SCORE_MAX ) { + zBin = (int) Math.floor(rawZ*Z_SCORE_MULTIPLIER); + } else { + zBin = Z_SCORE_MULTIPLIER*Z_SCORE_MAX; + } + + return zBin; + } + + public Pair calcWindowMeanVariance( SAMRecord read, int offset ) { + int start,end; + if ( offset - WIN_SIDE_SIZE < 0 ) { + start = 0; + } else { + start = offset - WIN_SIDE_SIZE; + } + + if ( offset + WIN_SIDE_SIZE > read.getReadLength() ) { + end = read.getReadLength(); + } else { + end = offset + WIN_SIDE_SIZE; + } + + double mean = 0.0; + int n = end-start; + byte[] quals = read.getBaseQualities(); + + for ( int i = start; i < end; i ++ ) { + if ( i != offset ) { + mean += ((double) quals[i])/n; + } + } + + double var = 0.0; + + for ( int i = start; i < end; i ++ ) { + if ( i != offset ) { + var += Math.pow(((double) quals[i] - mean),2)/n; + } + } + + return new Pair(mean, var); + } + + public String header() { + String format = "%s\t%s\t%s\t%s\t%s%n"; + return String.format(format, "ZScore", "Expected_Mismatch", "Empirical_Mismatch", "Expected_MM_As_Q", "Empirical_MM_As_Q"); + } + + public String formatData ( int[][] matchMismatchQ, int zScoreBin ) { + double zScore = ( (double) zScoreBin )/Z_SCORE_MULTIPLIER; + int match = 0; + int mismatch = 0; + double expMMR = 0.0; + + for ( int i = 0; i < MAX_Q_SCORE; i ++ ) { + match += matchMismatchQ[i][MATCH_OFFSET]; + mismatch += matchMismatchQ[i][MM_OFFSET]; + expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MATCH_OFFSET]; + expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MM_OFFSET]; + } + + expMMR = (expMMR / ( match + mismatch )); + double empMMR = ((double) mismatch)/(match + mismatch); + int expMMRAsQ = QualityUtils.probToQual(expMMR); + int empMMRAsQ = QualityUtils.probToQual(empMMR); + + String format = "%f\t%f\t%f\t%d\t%d%n"; + + return String.format(format, zScore, expMMR, empMMR, expMMRAsQ, empMMRAsQ); + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java new file mode 100755 index 000000000..4ba0e279c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSCovariantByCountsWalker.java @@ -0,0 +1,141 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.List; +import java.util.LinkedList; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Sep 29, 2009 + * Time: 10:54:18 AM + * To change this template use File | Settings | File Templates. + */ +public class NQSCovariantByCountsWalker extends LocusWalker< LocalMapType, int[][][][] > { + + final static int SMALL_DEVIATION = 5; + final static int LARGE_DEVIATION = 8; + final static int WIN_SIZE_SIDE = 4; + final static int MATCH_OFFSET = 1; + final static int MM_OFFSET = 0; + final static int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE; + final static String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f\t%f\t%d\t%d%n"; + final static String HEADER_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n"; + + int NGHD_SIZE; + + public void initialize() { + NGHD_SIZE = 2*WIN_SIZE_SIDE+1; + } + + public int[][][][] reduceInit() { + int[][][][] q = new int[2*WIN_SIZE_SIDE][2*WIN_SIZE_SIDE][MAX_Q_SCORE][2]; + + for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) { + for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) { + for ( int k = 0; k < MAX_Q_SCORE; k ++ ) { + for ( int h = 0; h < 2; h ++ ) { + q[i][j][k][h] = 0; + } + } + } + } + + return q; + } + + public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + return new LocalMapType(context, ref, tracker); + } + + public int [][][][] reduce( LocalMapType map, int [][][][] quals ) { + for ( int i = 0; i < map.numReads(); i ++ ) { + SAMRecord read = map.context.getReads().get(i); + int offset = map.context.getOffsets().get(i); + Pair deviantCounts = countNumDeviantBases(read,offset); + + if ( isMismatch(read,offset,map.ref) ) { + quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MM_OFFSET]++; + } else { + quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MATCH_OFFSET]++; + } + } + + return quals; + } + + public void onTraversalDone( int[][][][] counts ) { + out.print( header() ); + + for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) { + for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) { + out.print( formatData( counts[i][j], i, j ) ); + } + } + } + + public Pair countNumDeviantBases(SAMRecord read, int offset) { + int start,end; + if ( offset - WIN_SIZE_SIDE < 0 ) { + start = 0; + } else { + start = offset - WIN_SIZE_SIDE; + } + + if ( offset + WIN_SIZE_SIDE > read.getReadLength() ) { + end = read.getReadLength(); + } else { + end = offset + WIN_SIZE_SIDE; + } + + int largeCounts = 0; + int smallCounts = 0; + byte[] quals = read.getBaseQualities(); + for ( int i = start; i < end; i ++ ) { + if ( Math.abs(quals[i] - quals[offset]) >= LARGE_DEVIATION ) { + largeCounts ++; + } else if ( Math.abs( quals[i] - quals[offset] ) >= SMALL_DEVIATION ) { + smallCounts ++; + } + } + + return new Pair(smallCounts,largeCounts); + } + + public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) { + return ( ((char)read.getReadBases()[offset]) == ref.getBase() ); + } + + public String header() { + return String.format(HEADER_FORMAT, "N_SMALL_DEV", "N_LARGE_DEV", "MATCH", "MISMATCH", "EXPECTED_ERR", "EMPIRICAL_ERR", "EXP_AS_Q", "EMP_AS_Q"); + } + + public String formatData( int[][] qScores, int smDev, int lgDev ) { + int match = 0; + int mismatch = 0; + double expErr = 0.0; + + for ( int i = 0; i < MAX_Q_SCORE; i ++ ) { + match += qScores[i][MATCH_OFFSET]; + mismatch += qScores[i][MM_OFFSET]; + expErr += QualityUtils.qualToProb(i)*qScores[i][MATCH_OFFSET]; + expErr += QualityUtils.qualToProb(i)*qScores[i][MM_OFFSET]; + } + + expErr = expErr/(match + mismatch); + int expAsQ = QualityUtils.probToQual(expErr); + + double empErr = ((double)mismatch)/(match+mismatch); + int empAsQ = QualityUtils.probToQual(empErr); + + return String.format(DATA_FORMAT, smDev, lgDev, match, mismatch, expErr, empErr, expAsQ, empAsQ); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java index 587ee12ef..dee2231f0 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NQSTabularDistributionWalker.java @@ -12,6 +12,7 @@ import java.util.LinkedList; import java.util.ArrayList; import java.util.List; import java.util.ListIterator; +import org.apache.log4j.Logger; import net.sf.samtools.SAMRecord; @@ -38,7 +39,7 @@ public class NQSTabularDistributionWalker extends LocusWalker coordinates = getQscoreCoordinates(read,offset,winSize, binSize); + if ( isMatch(read, offset, ref) ) { + coordinates.add(MATCH_OFFSET); + } else { + coordinates.add(MM_OFFSET); + } ListIterator o = coordinates.listIterator(); table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++; // i'm sure i could use recursion for this but my brain is too tired after swimming to figure @@ -137,9 +144,10 @@ class NQSDistributionTable { public List getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) { LinkedList coords = new LinkedList(); - for ( int i = offset - win; i < offset + win ; i ++ ) { + for ( int i = offset - win; i <= offset + win ; i ++ ) { coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize)); } + logger.debug(Integer.toString(coords.size())); return coords; } @@ -147,7 +155,7 @@ class NQSDistributionTable { int qscore; if ( offset < 0 ) { qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET; - } else if ( offset > read.getReadLength() ) { + } else if ( offset >= read.getReadLength() ) { qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET; } else { qscore = (int) read.getBaseQualities()[offset]; @@ -166,4 +174,8 @@ class NQSDistributionTable { return new Pair(mm,ma); } + + public boolean isMatch( SAMRecord read, int offset, ReferenceContext ref ) { + return ( Character.toUpperCase( (char) read.getReadBases()[offset] ) == ref.getBase() || ref.getBase() == 'N'); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java new file mode 100755 index 000000000..b711bf997 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreDistributionWalker.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Pair; + +import java.util.List; +import java.util.LinkedList; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: Sep 28, 2009 + * Time: 12:02:33 PM + * To change this template use File | Settings | File Templates. + */ +public class QualityScoreDistributionWalker extends LocusWalker>, int[][]> { + @Argument(fullName="maxReadLength", shortName="rl", doc="Maximum read length in the bam file", required=true) + protected int MAX_READ_LENGTH = 125; + + + protected final int MAX_REASONABLE_Q_SCORE = 3 + QualityUtils.MAX_REASONABLE_Q_SCORE; + public void initialize() { + + } + + public int[][] reduceInit() { + int [][] qualityCounts = new int[MAX_READ_LENGTH][MAX_REASONABLE_Q_SCORE]; + for ( int i = 0; i < MAX_READ_LENGTH; i ++ ) { + for ( int j = 0; j < MAX_REASONABLE_Q_SCORE; j ++ ) { + qualityCounts[i][j] = 0; + } + } + + return qualityCounts; + } + + public List> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + LinkedList> qualByPos = new LinkedList>(); + for ( int r = 0; r < context.numReads(); r ++ ) { + if( context.getReads().get(r).getReadNegativeStrandFlag() ) { + int of = context.getReads().get(r).getReadLength() - context.getOffsets().get(r); + int qu = context.getReads().get(r).getBaseQualities()[context.getOffsets().get(r)]; + qualByPos.add(new Pair(of, qu)); + } + else { + int of = context.getOffsets().get(r); + int qu = context.getReads().get(r).getBaseQualities()[of]; + qualByPos.add(new Pair(of, qu)); + } + } + + return qualByPos; + } + + public int[][] reduce( List> qualByPos, int[][] counts ) { + for ( Pair qPosPair : qualByPos ) { + counts[qPosPair.getFirst()][qPosPair.getSecond()] ++; + } + + return counts; + } + + public void onTraversalDone( int[][] counts ) { + for ( int pos = 0; pos < MAX_READ_LENGTH; pos ++ ) { + String outStr = Integer.toString(pos); + for ( int qual = 0; qual < MAX_REASONABLE_Q_SCORE; qual ++) { + outStr = outStr + "\t" + Integer.toString(counts[pos][qual]); + } + out.printf("%s%n", outStr); + } + } +}