More NQS Related Walkers to play with

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1742 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2009-09-29 20:01:04 +00:00
parent 9ef80e3c3c
commit e28b45688c
4 changed files with 395 additions and 6 deletions

View File

@ -0,0 +1,159 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.List;
import java.util.LinkedList;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Sep 29, 2009
* Time: 1:13:25 PM
* To change this template use File | Settings | File Templates.
*/
public class NQSClusteredZScoreWalker extends LocusWalker<LocalMapType, int[][][]> {
static final int WIN_SIDE_SIZE = 4;
static final int Z_SCORE_MAX = 4;
static final int Z_SCORE_MULTIPLIER = 3; // bins are Z_SCORE * (this) rounded to the nearst int
static final int MM_OFFSET = 1;
static final int MATCH_OFFSET = 0;
static final int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE;
protected int WINDOW;
public void initialize() {
WINDOW = 2*WIN_SIDE_SIZE+1;
}
public int[][][] reduceInit() {
int[][][] q = new int[Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1][MAX_Q_SCORE][2];
for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1; i ++ ) {
for ( int j = 0; j < MAX_Q_SCORE; j ++ ) {
for ( int k = 0; k < 2; k ++ ) {
q[i][j][k] = 0;
}
}
}
return q;
}
public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
return new LocalMapType(context, ref, tracker);
}
public int[][][] reduce(LocalMapType map, int[][][] qScores) {
for ( int i = 0; i < map.numReads(); i ++ ) {
SAMRecord read = map.context.getReads().get(i);
int offset = map.context.getOffsets().get(i);
if ( isMismatch(read, offset, map.ref) ) {
qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MM_OFFSET]++;
} else {
qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MATCH_OFFSET]++;
}
}
return qScores;
}
public void onTraversalDone( int[][][] zScoreBins ) {
out.print( header() );
for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER; i ++ ) {
out.print( formatData(zScoreBins[i],i) );
}
}
public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) {
return ((char) read.getReadBases()[offset]) == ref.getBase();
}
public int getQScoreAsInt( SAMRecord read, int offset ) {
return (int) read.getBaseQualities()[offset];
}
public int calcZScoreBin( SAMRecord read, int offset ) {
Pair<Double,Double> meanVar = calcWindowMeanVariance(read, offset);
double rawZ = Math.abs((getQScoreAsInt(read, offset) - meanVar.first)/Math.sqrt(meanVar.second));
int zBin;
if ( rawZ < Z_SCORE_MAX ) {
zBin = (int) Math.floor(rawZ*Z_SCORE_MULTIPLIER);
} else {
zBin = Z_SCORE_MULTIPLIER*Z_SCORE_MAX;
}
return zBin;
}
public Pair<Double,Double> calcWindowMeanVariance( SAMRecord read, int offset ) {
int start,end;
if ( offset - WIN_SIDE_SIZE < 0 ) {
start = 0;
} else {
start = offset - WIN_SIDE_SIZE;
}
if ( offset + WIN_SIDE_SIZE > read.getReadLength() ) {
end = read.getReadLength();
} else {
end = offset + WIN_SIDE_SIZE;
}
double mean = 0.0;
int n = end-start;
byte[] quals = read.getBaseQualities();
for ( int i = start; i < end; i ++ ) {
if ( i != offset ) {
mean += ((double) quals[i])/n;
}
}
double var = 0.0;
for ( int i = start; i < end; i ++ ) {
if ( i != offset ) {
var += Math.pow(((double) quals[i] - mean),2)/n;
}
}
return new Pair<Double,Double>(mean, var);
}
public String header() {
String format = "%s\t%s\t%s\t%s\t%s%n";
return String.format(format, "ZScore", "Expected_Mismatch", "Empirical_Mismatch", "Expected_MM_As_Q", "Empirical_MM_As_Q");
}
public String formatData ( int[][] matchMismatchQ, int zScoreBin ) {
double zScore = ( (double) zScoreBin )/Z_SCORE_MULTIPLIER;
int match = 0;
int mismatch = 0;
double expMMR = 0.0;
for ( int i = 0; i < MAX_Q_SCORE; i ++ ) {
match += matchMismatchQ[i][MATCH_OFFSET];
mismatch += matchMismatchQ[i][MM_OFFSET];
expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MATCH_OFFSET];
expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MM_OFFSET];
}
expMMR = (expMMR / ( match + mismatch ));
double empMMR = ((double) mismatch)/(match + mismatch);
int expMMRAsQ = QualityUtils.probToQual(expMMR);
int empMMRAsQ = QualityUtils.probToQual(empMMR);
String format = "%f\t%f\t%f\t%d\t%d%n";
return String.format(format, zScore, expMMR, empMMR, expMMRAsQ, empMMRAsQ);
}
}

View File

@ -0,0 +1,141 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.List;
import java.util.LinkedList;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Sep 29, 2009
* Time: 10:54:18 AM
* To change this template use File | Settings | File Templates.
*/
public class NQSCovariantByCountsWalker extends LocusWalker< LocalMapType, int[][][][] > {
final static int SMALL_DEVIATION = 5;
final static int LARGE_DEVIATION = 8;
final static int WIN_SIZE_SIDE = 4;
final static int MATCH_OFFSET = 1;
final static int MM_OFFSET = 0;
final static int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE;
final static String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f\t%f\t%d\t%d%n";
final static String HEADER_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
int NGHD_SIZE;
public void initialize() {
NGHD_SIZE = 2*WIN_SIZE_SIDE+1;
}
public int[][][][] reduceInit() {
int[][][][] q = new int[2*WIN_SIZE_SIDE][2*WIN_SIZE_SIDE][MAX_Q_SCORE][2];
for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) {
for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) {
for ( int k = 0; k < MAX_Q_SCORE; k ++ ) {
for ( int h = 0; h < 2; h ++ ) {
q[i][j][k][h] = 0;
}
}
}
}
return q;
}
public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
return new LocalMapType(context, ref, tracker);
}
public int [][][][] reduce( LocalMapType map, int [][][][] quals ) {
for ( int i = 0; i < map.numReads(); i ++ ) {
SAMRecord read = map.context.getReads().get(i);
int offset = map.context.getOffsets().get(i);
Pair<Integer,Integer> deviantCounts = countNumDeviantBases(read,offset);
if ( isMismatch(read,offset,map.ref) ) {
quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MM_OFFSET]++;
} else {
quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MATCH_OFFSET]++;
}
}
return quals;
}
public void onTraversalDone( int[][][][] counts ) {
out.print( header() );
for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) {
for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) {
out.print( formatData( counts[i][j], i, j ) );
}
}
}
public Pair<Integer,Integer> countNumDeviantBases(SAMRecord read, int offset) {
int start,end;
if ( offset - WIN_SIZE_SIDE < 0 ) {
start = 0;
} else {
start = offset - WIN_SIZE_SIDE;
}
if ( offset + WIN_SIZE_SIDE > read.getReadLength() ) {
end = read.getReadLength();
} else {
end = offset + WIN_SIZE_SIDE;
}
int largeCounts = 0;
int smallCounts = 0;
byte[] quals = read.getBaseQualities();
for ( int i = start; i < end; i ++ ) {
if ( Math.abs(quals[i] - quals[offset]) >= LARGE_DEVIATION ) {
largeCounts ++;
} else if ( Math.abs( quals[i] - quals[offset] ) >= SMALL_DEVIATION ) {
smallCounts ++;
}
}
return new Pair<Integer,Integer>(smallCounts,largeCounts);
}
public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) {
return ( ((char)read.getReadBases()[offset]) == ref.getBase() );
}
public String header() {
return String.format(HEADER_FORMAT, "N_SMALL_DEV", "N_LARGE_DEV", "MATCH", "MISMATCH", "EXPECTED_ERR", "EMPIRICAL_ERR", "EXP_AS_Q", "EMP_AS_Q");
}
public String formatData( int[][] qScores, int smDev, int lgDev ) {
int match = 0;
int mismatch = 0;
double expErr = 0.0;
for ( int i = 0; i < MAX_Q_SCORE; i ++ ) {
match += qScores[i][MATCH_OFFSET];
mismatch += qScores[i][MM_OFFSET];
expErr += QualityUtils.qualToProb(i)*qScores[i][MATCH_OFFSET];
expErr += QualityUtils.qualToProb(i)*qScores[i][MM_OFFSET];
}
expErr = expErr/(match + mismatch);
int expAsQ = QualityUtils.probToQual(expErr);
double empErr = ((double)mismatch)/(match+mismatch);
int empAsQ = QualityUtils.probToQual(empErr);
return String.format(DATA_FORMAT, smDev, lgDev, match, mismatch, expErr, empErr, expAsQ, empAsQ);
}
}

View File

@ -12,6 +12,7 @@ import java.util.LinkedList;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import org.apache.log4j.Logger;
import net.sf.samtools.SAMRecord;
@ -38,7 +39,7 @@ public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSD
}
public NQSDistributionTable reduceInit() {
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE, logger);
}
public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -90,15 +91,16 @@ class NQSDistributionTable {
public final int MATCH_OFFSET = 1;
protected int [][][][][][][][] table;
protected Logger logger;
protected int OFF_END_OFFSET;
protected int OFF_START_OFFSET;
public NQSDistributionTable (int winSize, int qBins, int qStep) {
if ( table.length != winSize ) {
public NQSDistributionTable (int winSize, int qBins, int qStep, Logger logger) {
if ( 7 != winSize ) {
throw new StingException("Size positied in tabular distribution is not the size of the distribution table");
}
this.logger = logger;
table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
// yes, this is absolutely positively horrendous brute-force code.
// ... Knuth would be proud. These could be lists for a decrease
@ -129,6 +131,11 @@ class NQSDistributionTable {
// note: there are two extra bins, one for "No quality: off the end of read"
// and one for "No quality: off the beginning of read"
List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
if ( isMatch(read, offset, ref) ) {
coordinates.add(MATCH_OFFSET);
} else {
coordinates.add(MM_OFFSET);
}
ListIterator<Integer> o = coordinates.listIterator();
table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++;
// i'm sure i could use recursion for this but my brain is too tired after swimming to figure
@ -137,9 +144,10 @@ class NQSDistributionTable {
public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
LinkedList<Integer> coords = new LinkedList<Integer>();
for ( int i = offset - win; i < offset + win ; i ++ ) {
for ( int i = offset - win; i <= offset + win ; i ++ ) {
coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
}
logger.debug(Integer.toString(coords.size()));
return coords;
}
@ -147,7 +155,7 @@ class NQSDistributionTable {
int qscore;
if ( offset < 0 ) {
qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
} else if ( offset > read.getReadLength() ) {
} else if ( offset >= read.getReadLength() ) {
qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
} else {
qscore = (int) read.getBaseQualities()[offset];
@ -166,4 +174,8 @@ class NQSDistributionTable {
return new Pair<Integer,Integer>(mm,ma);
}
public boolean isMatch( SAMRecord read, int offset, ReferenceContext ref ) {
return ( Character.toUpperCase( (char) read.getReadBases()[offset] ) == ref.getBase() || ref.getBase() == 'N');
}
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Pair;
import java.util.List;
import java.util.LinkedList;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Sep 28, 2009
* Time: 12:02:33 PM
* To change this template use File | Settings | File Templates.
*/
public class QualityScoreDistributionWalker extends LocusWalker<List<Pair<Integer,Integer>>, int[][]> {
@Argument(fullName="maxReadLength", shortName="rl", doc="Maximum read length in the bam file", required=true)
protected int MAX_READ_LENGTH = 125;
protected final int MAX_REASONABLE_Q_SCORE = 3 + QualityUtils.MAX_REASONABLE_Q_SCORE;
public void initialize() {
}
public int[][] reduceInit() {
int [][] qualityCounts = new int[MAX_READ_LENGTH][MAX_REASONABLE_Q_SCORE];
for ( int i = 0; i < MAX_READ_LENGTH; i ++ ) {
for ( int j = 0; j < MAX_REASONABLE_Q_SCORE; j ++ ) {
qualityCounts[i][j] = 0;
}
}
return qualityCounts;
}
public List<Pair<Integer,Integer>> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
LinkedList<Pair<Integer,Integer>> qualByPos = new LinkedList<Pair<Integer,Integer>>();
for ( int r = 0; r < context.numReads(); r ++ ) {
if( context.getReads().get(r).getReadNegativeStrandFlag() ) {
int of = context.getReads().get(r).getReadLength() - context.getOffsets().get(r);
int qu = context.getReads().get(r).getBaseQualities()[context.getOffsets().get(r)];
qualByPos.add(new Pair<Integer,Integer>(of, qu));
}
else {
int of = context.getOffsets().get(r);
int qu = context.getReads().get(r).getBaseQualities()[of];
qualByPos.add(new Pair<Integer,Integer>(of, qu));
}
}
return qualByPos;
}
public int[][] reduce( List<Pair<Integer,Integer>> qualByPos, int[][] counts ) {
for ( Pair<Integer,Integer> qPosPair : qualByPos ) {
counts[qPosPair.getFirst()][qPosPair.getSecond()] ++;
}
return counts;
}
public void onTraversalDone( int[][] counts ) {
for ( int pos = 0; pos < MAX_READ_LENGTH; pos ++ ) {
String outStr = Integer.toString(pos);
for ( int qual = 0; qual < MAX_REASONABLE_Q_SCORE; qual ++) {
outStr = outStr + "\t" + Integer.toString(counts[pos][qual]);
}
out.printf("%s%n", outStr);
}
}
}