More NQS Related Walkers to play with
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1742 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9ef80e3c3c
commit
e28b45688c
|
|
@ -0,0 +1,159 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 29, 2009
|
||||
* Time: 1:13:25 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class NQSClusteredZScoreWalker extends LocusWalker<LocalMapType, int[][][]> {
|
||||
static final int WIN_SIDE_SIZE = 4;
|
||||
static final int Z_SCORE_MAX = 4;
|
||||
static final int Z_SCORE_MULTIPLIER = 3; // bins are Z_SCORE * (this) rounded to the nearst int
|
||||
static final int MM_OFFSET = 1;
|
||||
static final int MATCH_OFFSET = 0;
|
||||
static final int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
|
||||
protected int WINDOW;
|
||||
|
||||
public void initialize() {
|
||||
WINDOW = 2*WIN_SIDE_SIZE+1;
|
||||
}
|
||||
|
||||
public int[][][] reduceInit() {
|
||||
int[][][] q = new int[Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1][MAX_Q_SCORE][2];
|
||||
for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER+1; i ++ ) {
|
||||
for ( int j = 0; j < MAX_Q_SCORE; j ++ ) {
|
||||
for ( int k = 0; k < 2; k ++ ) {
|
||||
q[i][j][k] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
|
||||
public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
return new LocalMapType(context, ref, tracker);
|
||||
}
|
||||
|
||||
public int[][][] reduce(LocalMapType map, int[][][] qScores) {
|
||||
for ( int i = 0; i < map.numReads(); i ++ ) {
|
||||
SAMRecord read = map.context.getReads().get(i);
|
||||
int offset = map.context.getOffsets().get(i);
|
||||
if ( isMismatch(read, offset, map.ref) ) {
|
||||
qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MM_OFFSET]++;
|
||||
} else {
|
||||
qScores[calcZScoreBin(read,offset)][getQScoreAsInt(read,offset)][MATCH_OFFSET]++;
|
||||
}
|
||||
}
|
||||
|
||||
return qScores;
|
||||
}
|
||||
|
||||
public void onTraversalDone( int[][][] zScoreBins ) {
|
||||
out.print( header() );
|
||||
for ( int i = 0; i < Z_SCORE_MAX*Z_SCORE_MULTIPLIER; i ++ ) {
|
||||
out.print( formatData(zScoreBins[i],i) );
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||
return ((char) read.getReadBases()[offset]) == ref.getBase();
|
||||
}
|
||||
|
||||
public int getQScoreAsInt( SAMRecord read, int offset ) {
|
||||
return (int) read.getBaseQualities()[offset];
|
||||
}
|
||||
|
||||
public int calcZScoreBin( SAMRecord read, int offset ) {
|
||||
Pair<Double,Double> meanVar = calcWindowMeanVariance(read, offset);
|
||||
double rawZ = Math.abs((getQScoreAsInt(read, offset) - meanVar.first)/Math.sqrt(meanVar.second));
|
||||
int zBin;
|
||||
if ( rawZ < Z_SCORE_MAX ) {
|
||||
zBin = (int) Math.floor(rawZ*Z_SCORE_MULTIPLIER);
|
||||
} else {
|
||||
zBin = Z_SCORE_MULTIPLIER*Z_SCORE_MAX;
|
||||
}
|
||||
|
||||
return zBin;
|
||||
}
|
||||
|
||||
public Pair<Double,Double> calcWindowMeanVariance( SAMRecord read, int offset ) {
|
||||
int start,end;
|
||||
if ( offset - WIN_SIDE_SIZE < 0 ) {
|
||||
start = 0;
|
||||
} else {
|
||||
start = offset - WIN_SIDE_SIZE;
|
||||
}
|
||||
|
||||
if ( offset + WIN_SIDE_SIZE > read.getReadLength() ) {
|
||||
end = read.getReadLength();
|
||||
} else {
|
||||
end = offset + WIN_SIDE_SIZE;
|
||||
}
|
||||
|
||||
double mean = 0.0;
|
||||
int n = end-start;
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
||||
for ( int i = start; i < end; i ++ ) {
|
||||
if ( i != offset ) {
|
||||
mean += ((double) quals[i])/n;
|
||||
}
|
||||
}
|
||||
|
||||
double var = 0.0;
|
||||
|
||||
for ( int i = start; i < end; i ++ ) {
|
||||
if ( i != offset ) {
|
||||
var += Math.pow(((double) quals[i] - mean),2)/n;
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<Double,Double>(mean, var);
|
||||
}
|
||||
|
||||
public String header() {
|
||||
String format = "%s\t%s\t%s\t%s\t%s%n";
|
||||
return String.format(format, "ZScore", "Expected_Mismatch", "Empirical_Mismatch", "Expected_MM_As_Q", "Empirical_MM_As_Q");
|
||||
}
|
||||
|
||||
public String formatData ( int[][] matchMismatchQ, int zScoreBin ) {
|
||||
double zScore = ( (double) zScoreBin )/Z_SCORE_MULTIPLIER;
|
||||
int match = 0;
|
||||
int mismatch = 0;
|
||||
double expMMR = 0.0;
|
||||
|
||||
for ( int i = 0; i < MAX_Q_SCORE; i ++ ) {
|
||||
match += matchMismatchQ[i][MATCH_OFFSET];
|
||||
mismatch += matchMismatchQ[i][MM_OFFSET];
|
||||
expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MATCH_OFFSET];
|
||||
expMMR += QualityUtils.qualToProb(i)*matchMismatchQ[i][MM_OFFSET];
|
||||
}
|
||||
|
||||
expMMR = (expMMR / ( match + mismatch ));
|
||||
double empMMR = ((double) mismatch)/(match + mismatch);
|
||||
int expMMRAsQ = QualityUtils.probToQual(expMMR);
|
||||
int empMMRAsQ = QualityUtils.probToQual(empMMR);
|
||||
|
||||
String format = "%f\t%f\t%f\t%d\t%d%n";
|
||||
|
||||
return String.format(format, zScore, expMMR, empMMR, expMMRAsQ, empMMRAsQ);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 29, 2009
|
||||
* Time: 10:54:18 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class NQSCovariantByCountsWalker extends LocusWalker< LocalMapType, int[][][][] > {
|
||||
|
||||
final static int SMALL_DEVIATION = 5;
|
||||
final static int LARGE_DEVIATION = 8;
|
||||
final static int WIN_SIZE_SIDE = 4;
|
||||
final static int MATCH_OFFSET = 1;
|
||||
final static int MM_OFFSET = 0;
|
||||
final static int MAX_Q_SCORE = 2 + QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
final static String DATA_FORMAT = "%d\t%d\t%d\t%d\t%f\t%f\t%d\t%d%n";
|
||||
final static String HEADER_FORMAT = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%n";
|
||||
|
||||
int NGHD_SIZE;
|
||||
|
||||
public void initialize() {
|
||||
NGHD_SIZE = 2*WIN_SIZE_SIDE+1;
|
||||
}
|
||||
|
||||
public int[][][][] reduceInit() {
|
||||
int[][][][] q = new int[2*WIN_SIZE_SIDE][2*WIN_SIZE_SIDE][MAX_Q_SCORE][2];
|
||||
|
||||
for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) {
|
||||
for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) {
|
||||
for ( int k = 0; k < MAX_Q_SCORE; k ++ ) {
|
||||
for ( int h = 0; h < 2; h ++ ) {
|
||||
q[i][j][k][h] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
public LocalMapType map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
return new LocalMapType(context, ref, tracker);
|
||||
}
|
||||
|
||||
public int [][][][] reduce( LocalMapType map, int [][][][] quals ) {
|
||||
for ( int i = 0; i < map.numReads(); i ++ ) {
|
||||
SAMRecord read = map.context.getReads().get(i);
|
||||
int offset = map.context.getOffsets().get(i);
|
||||
Pair<Integer,Integer> deviantCounts = countNumDeviantBases(read,offset);
|
||||
|
||||
if ( isMismatch(read,offset,map.ref) ) {
|
||||
quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MM_OFFSET]++;
|
||||
} else {
|
||||
quals[deviantCounts.first][deviantCounts.second][(int) read.getBaseQualities()[offset]][MATCH_OFFSET]++;
|
||||
}
|
||||
}
|
||||
|
||||
return quals;
|
||||
}
|
||||
|
||||
public void onTraversalDone( int[][][][] counts ) {
|
||||
out.print( header() );
|
||||
|
||||
for ( int i = 0; i < 2*WIN_SIZE_SIDE; i ++ ) {
|
||||
for ( int j = 0; j < 2*WIN_SIZE_SIDE; j ++ ) {
|
||||
out.print( formatData( counts[i][j], i, j ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Pair<Integer,Integer> countNumDeviantBases(SAMRecord read, int offset) {
|
||||
int start,end;
|
||||
if ( offset - WIN_SIZE_SIDE < 0 ) {
|
||||
start = 0;
|
||||
} else {
|
||||
start = offset - WIN_SIZE_SIDE;
|
||||
}
|
||||
|
||||
if ( offset + WIN_SIZE_SIDE > read.getReadLength() ) {
|
||||
end = read.getReadLength();
|
||||
} else {
|
||||
end = offset + WIN_SIZE_SIDE;
|
||||
}
|
||||
|
||||
int largeCounts = 0;
|
||||
int smallCounts = 0;
|
||||
byte[] quals = read.getBaseQualities();
|
||||
for ( int i = start; i < end; i ++ ) {
|
||||
if ( Math.abs(quals[i] - quals[offset]) >= LARGE_DEVIATION ) {
|
||||
largeCounts ++;
|
||||
} else if ( Math.abs( quals[i] - quals[offset] ) >= SMALL_DEVIATION ) {
|
||||
smallCounts ++;
|
||||
}
|
||||
}
|
||||
|
||||
return new Pair<Integer,Integer>(smallCounts,largeCounts);
|
||||
}
|
||||
|
||||
public boolean isMismatch( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||
return ( ((char)read.getReadBases()[offset]) == ref.getBase() );
|
||||
}
|
||||
|
||||
public String header() {
|
||||
return String.format(HEADER_FORMAT, "N_SMALL_DEV", "N_LARGE_DEV", "MATCH", "MISMATCH", "EXPECTED_ERR", "EMPIRICAL_ERR", "EXP_AS_Q", "EMP_AS_Q");
|
||||
}
|
||||
|
||||
public String formatData( int[][] qScores, int smDev, int lgDev ) {
|
||||
int match = 0;
|
||||
int mismatch = 0;
|
||||
double expErr = 0.0;
|
||||
|
||||
for ( int i = 0; i < MAX_Q_SCORE; i ++ ) {
|
||||
match += qScores[i][MATCH_OFFSET];
|
||||
mismatch += qScores[i][MM_OFFSET];
|
||||
expErr += QualityUtils.qualToProb(i)*qScores[i][MATCH_OFFSET];
|
||||
expErr += QualityUtils.qualToProb(i)*qScores[i][MM_OFFSET];
|
||||
}
|
||||
|
||||
expErr = expErr/(match + mismatch);
|
||||
int expAsQ = QualityUtils.probToQual(expErr);
|
||||
|
||||
double empErr = ((double)mismatch)/(match+mismatch);
|
||||
int empAsQ = QualityUtils.probToQual(empErr);
|
||||
|
||||
return String.format(DATA_FORMAT, smDev, lgDev, match, mismatch, expErr, empErr, expAsQ, empAsQ);
|
||||
}
|
||||
}
|
||||
|
|
@ -12,6 +12,7 @@ import java.util.LinkedList;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -38,7 +39,7 @@ public class NQSTabularDistributionWalker extends LocusWalker<LocalMapType, NQSD
|
|||
}
|
||||
|
||||
public NQSDistributionTable reduceInit() {
|
||||
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE);
|
||||
return new NQSDistributionTable(WINDOW_SIZE, qBins, QSCORE_BIN_SIZE, logger);
|
||||
}
|
||||
|
||||
public LocalMapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -90,15 +91,16 @@ class NQSDistributionTable {
|
|||
public final int MATCH_OFFSET = 1;
|
||||
|
||||
protected int [][][][][][][][] table;
|
||||
protected Logger logger;
|
||||
|
||||
protected int OFF_END_OFFSET;
|
||||
protected int OFF_START_OFFSET;
|
||||
|
||||
public NQSDistributionTable (int winSize, int qBins, int qStep) {
|
||||
if ( table.length != winSize ) {
|
||||
public NQSDistributionTable (int winSize, int qBins, int qStep, Logger logger) {
|
||||
if ( 7 != winSize ) {
|
||||
throw new StingException("Size positied in tabular distribution is not the size of the distribution table");
|
||||
}
|
||||
|
||||
this.logger = logger;
|
||||
table = new int[qBins][qBins][qBins][qBins][qBins][qBins][qBins][2];
|
||||
// yes, this is absolutely positively horrendous brute-force code.
|
||||
// ... Knuth would be proud. These could be lists for a decrease
|
||||
|
|
@ -129,6 +131,11 @@ class NQSDistributionTable {
|
|||
// note: there are two extra bins, one for "No quality: off the end of read"
|
||||
// and one for "No quality: off the beginning of read"
|
||||
List<Integer> coordinates = getQscoreCoordinates(read,offset,winSize, binSize);
|
||||
if ( isMatch(read, offset, ref) ) {
|
||||
coordinates.add(MATCH_OFFSET);
|
||||
} else {
|
||||
coordinates.add(MM_OFFSET);
|
||||
}
|
||||
ListIterator<Integer> o = coordinates.listIterator();
|
||||
table[o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()][o.next()] ++;
|
||||
// i'm sure i could use recursion for this but my brain is too tired after swimming to figure
|
||||
|
|
@ -137,9 +144,10 @@ class NQSDistributionTable {
|
|||
|
||||
public List<Integer> getQscoreCoordinates( SAMRecord read, int offset, int win, int binSize ) {
|
||||
LinkedList<Integer> coords = new LinkedList<Integer>();
|
||||
for ( int i = offset - win; i < offset + win ; i ++ ) {
|
||||
for ( int i = offset - win; i <= offset + win ; i ++ ) {
|
||||
coords.add(getQscoreBin(getQScoreAsInt(read,i), binSize));
|
||||
}
|
||||
logger.debug(Integer.toString(coords.size()));
|
||||
return coords;
|
||||
}
|
||||
|
||||
|
|
@ -147,7 +155,7 @@ class NQSDistributionTable {
|
|||
int qscore;
|
||||
if ( offset < 0 ) {
|
||||
qscore = read.getReadNegativeStrandFlag() ? OFF_END_OFFSET : OFF_START_OFFSET;
|
||||
} else if ( offset > read.getReadLength() ) {
|
||||
} else if ( offset >= read.getReadLength() ) {
|
||||
qscore = read.getReadNegativeStrandFlag() ? OFF_START_OFFSET : OFF_END_OFFSET;
|
||||
} else {
|
||||
qscore = (int) read.getBaseQualities()[offset];
|
||||
|
|
@ -166,4 +174,8 @@ class NQSDistributionTable {
|
|||
|
||||
return new Pair<Integer,Integer>(mm,ma);
|
||||
}
|
||||
|
||||
public boolean isMatch( SAMRecord read, int offset, ReferenceContext ref ) {
|
||||
return ( Character.toUpperCase( (char) read.getReadBases()[offset] ) == ref.getBase() || ref.getBase() == 'N');
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: Sep 28, 2009
|
||||
* Time: 12:02:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class QualityScoreDistributionWalker extends LocusWalker<List<Pair<Integer,Integer>>, int[][]> {
|
||||
@Argument(fullName="maxReadLength", shortName="rl", doc="Maximum read length in the bam file", required=true)
|
||||
protected int MAX_READ_LENGTH = 125;
|
||||
|
||||
|
||||
protected final int MAX_REASONABLE_Q_SCORE = 3 + QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
public void initialize() {
|
||||
|
||||
}
|
||||
|
||||
public int[][] reduceInit() {
|
||||
int [][] qualityCounts = new int[MAX_READ_LENGTH][MAX_REASONABLE_Q_SCORE];
|
||||
for ( int i = 0; i < MAX_READ_LENGTH; i ++ ) {
|
||||
for ( int j = 0; j < MAX_REASONABLE_Q_SCORE; j ++ ) {
|
||||
qualityCounts[i][j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return qualityCounts;
|
||||
}
|
||||
|
||||
public List<Pair<Integer,Integer>> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
LinkedList<Pair<Integer,Integer>> qualByPos = new LinkedList<Pair<Integer,Integer>>();
|
||||
for ( int r = 0; r < context.numReads(); r ++ ) {
|
||||
if( context.getReads().get(r).getReadNegativeStrandFlag() ) {
|
||||
int of = context.getReads().get(r).getReadLength() - context.getOffsets().get(r);
|
||||
int qu = context.getReads().get(r).getBaseQualities()[context.getOffsets().get(r)];
|
||||
qualByPos.add(new Pair<Integer,Integer>(of, qu));
|
||||
}
|
||||
else {
|
||||
int of = context.getOffsets().get(r);
|
||||
int qu = context.getReads().get(r).getBaseQualities()[of];
|
||||
qualByPos.add(new Pair<Integer,Integer>(of, qu));
|
||||
}
|
||||
}
|
||||
|
||||
return qualByPos;
|
||||
}
|
||||
|
||||
public int[][] reduce( List<Pair<Integer,Integer>> qualByPos, int[][] counts ) {
|
||||
for ( Pair<Integer,Integer> qPosPair : qualByPos ) {
|
||||
counts[qPosPair.getFirst()][qPosPair.getSecond()] ++;
|
||||
}
|
||||
|
||||
return counts;
|
||||
}
|
||||
|
||||
public void onTraversalDone( int[][] counts ) {
|
||||
for ( int pos = 0; pos < MAX_READ_LENGTH; pos ++ ) {
|
||||
String outStr = Integer.toString(pos);
|
||||
for ( int qual = 0; qual < MAX_REASONABLE_Q_SCORE; qual ++) {
|
||||
outStr = outStr + "\t" + Integer.toString(counts[pos][qual]);
|
||||
}
|
||||
out.printf("%s%n", outStr);
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue