QualityUtils: added reverse function to reverse an array of bytes (and not complement it), BaseUtils: split qualToProb into itself and qualToErrProb, CovariateCounterWalker and LogisticRecalibrationWalker: several changes including a properly acocunting (only partly complete) for reversing AND complementing bases that are negative strand, PrintReadsWalker: created option to output reads to a BAM file rather than just to the sceern (useful for creating a downsampled BAM file)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@770 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7e77c62b49
commit
0219d33e10
|
|
@ -1,17 +1,49 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers;
|
package org.broadinstitute.sting.gatk.walkers;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import net.sf.samtools.SAMFileWriter;
|
||||||
|
import net.sf.samtools.SAMFileWriterFactory;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
public class PrintReadsWalker extends ReadWalker<Integer, Integer> {
|
import java.io.PrintStream;
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||||
|
|
||||||
|
@Argument(fullName="outputBamFile", shortName="of", doc="Write output to this BAM filename instead of STDOUT", required=false)
|
||||||
|
String outputBamFile = null;
|
||||||
|
|
||||||
|
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMFileWriter reduceInit() {
|
||||||
|
if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) {
|
||||||
|
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||||
|
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
|
||||||
|
return fact.makeBAMWriter(header, true, new File(outputBamFile));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||||
|
if ( output != null ) {
|
||||||
|
output.addAlignment(read);
|
||||||
|
} else {
|
||||||
out.println(read.format());
|
out.println(read.format());
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() { return 0; }
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
public void onTraversalDone(SAMFileWriter output) {
|
||||||
return value + sum;
|
if ( output != null ) {
|
||||||
|
output.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -26,7 +27,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
public String OUTPUT_FILEROOT = "output";
|
public String OUTPUT_FILEROOT = "output";
|
||||||
|
|
||||||
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
||||||
public boolean CREATE_TRAINING_DATA = false;
|
public boolean CREATE_TRAINING_DATA = true;
|
||||||
|
|
||||||
@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
|
@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
|
||||||
public float DOWNSAMPLE_FRACTION=1.0f;
|
public float DOWNSAMPLE_FRACTION=1.0f;
|
||||||
|
|
@ -40,11 +41,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
||||||
//RecalData[][][] data = new RecalData;
|
//RecalData[][][] data = new RecalData;
|
||||||
ArrayList<RecalData> flattenData = new ArrayList();
|
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||||
|
|
||||||
static int nuc2num[];
|
static int nuc2num[];
|
||||||
static char num2nuc[];
|
static char num2nuc[];
|
||||||
|
|
||||||
|
long counted_sites = 0; // number of sites used to count covariates
|
||||||
|
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||||
|
|
||||||
String dinuc_root = "dinuc";
|
String dinuc_root = "dinuc";
|
||||||
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
ArrayList<PrintStream> dinuc_outs = new ArrayList<PrintStream>();
|
||||||
PrintStream covars_out = new PrintStream("covars.out");
|
PrintStream covars_out = new PrintStream("covars.out");
|
||||||
|
|
@ -100,6 +104,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
||||||
|
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
||||||
|
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
||||||
|
} catch (FileNotFoundException e){
|
||||||
|
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
|
@ -110,7 +124,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
|
|
||||||
|
|
||||||
if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
||||||
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
||||||
|
|
@ -118,13 +131,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
int numBases = read.getReadLength();
|
int numBases = read.getReadLength();
|
||||||
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||||
char base = (char)read.getReadBases()[offset];
|
|
||||||
int qual = (int)read.getBaseQualities()[offset];
|
int qual = (int)read.getBaseQualities()[offset];
|
||||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
|
||||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
|
||||||
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
|
||||||
int dinuc_index = bases2dinucIndex(prevBase,base);
|
|
||||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||||
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
|
char base = (char)read.getReadBases()[offset];
|
||||||
|
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||||
|
int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
|
||||||
|
//char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||||
|
//int dinuc_index = bases2dinucIndex(prevBase, base, read.getReadNegativeStrandFlag());
|
||||||
|
|
||||||
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
||||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
||||||
data[cycle][qual][dinuc_index].inc(base,ref);
|
data[cycle][qual][dinuc_index].inc(base,ref);
|
||||||
|
|
@ -132,6 +147,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
counted_sites += 1;
|
||||||
|
}else{
|
||||||
|
skipped_sites += 1;
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
@ -147,23 +165,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
qualityDiffVsCycle();
|
qualityDiffVsCycle();
|
||||||
qualityDiffVsDinucleotide();
|
qualityDiffVsDinucleotide();
|
||||||
|
|
||||||
|
out.printf("Counted sites: %d%n", counted_sites);
|
||||||
|
out.printf("Skipped sites: %d%n", skipped_sites);
|
||||||
|
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||||
|
|
||||||
// Close dinuc filehandles
|
// Close dinuc filehandles
|
||||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||||
|
/*if (CREATE_TRAINING_DATA)
|
||||||
|
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
||||||
|
dinuc_stream.close();*/
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void writeTrainingData() {
|
void writeTrainingData() {
|
||||||
for ( int i=0; i<NDINUCS; i++) {
|
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
|
||||||
PrintStream dinuc_out = null;
|
PrintStream dinuc_out = null;
|
||||||
try {
|
try {
|
||||||
dinuc_out = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
dinuc_out = new PrintStream( dinuc_root+"."+dinucIndex2bases(dinuc_index)+".csv");
|
||||||
dinuc_out.println("logitQ,pos,indicator,count");
|
dinuc_out.println("logitQ,pos,indicator,count");
|
||||||
|
|
||||||
for ( RecalData datum: flattenData ) {
|
for ( RecalData datum: flattenData ) {
|
||||||
|
if (string2dinucIndex(datum.dinuc) == dinuc_index) {
|
||||||
if ((datum.N - datum.B) > 0)
|
if ((datum.N - datum.B) > 0)
|
||||||
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B);
|
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||||
if (datum.B > 0)
|
if (datum.B > 0)
|
||||||
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.B);
|
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 1, datum.B);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||||
|
|
@ -177,14 +204,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
class MeanReportedQuality {
|
class MeanReportedQuality {
|
||||||
double Qn = 0;
|
double Qn = 0;
|
||||||
long n = 0;
|
long n = 0;
|
||||||
|
double sumErrors = 0; // Count of estimated number of errors
|
||||||
|
|
||||||
void inc(double Q, long n) {
|
void inc(double Q, long n) {
|
||||||
this.n += n;
|
this.n += n;
|
||||||
Qn += Q * n;
|
//Qn += Q * n; // wrong calculation that worked did not account for Qs being in log space
|
||||||
|
sumErrors += QualityUtils.qualToErrorProb((byte)Q) * n;
|
||||||
}
|
}
|
||||||
|
|
||||||
double result() {
|
double result() {
|
||||||
return Qn / n;
|
//return Qn / n;
|
||||||
|
return -10 * Math.log10(sumErrors / n);
|
||||||
|
//return QualityUtils.probToQual(1.0 - (sumErrors / n));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -192,6 +223,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
RecalData All = new RecalData(0,0,"");
|
||||||
|
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||||
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||||
ByCycle.add(new RecalData(c, -1, "-"));
|
ByCycle.add(new RecalData(c, -1, "-"));
|
||||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||||
|
|
@ -200,6 +233,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
for ( RecalData datum: flattenData ) {
|
for ( RecalData datum: flattenData ) {
|
||||||
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
||||||
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
||||||
|
All.inc(datum.N, datum.B);
|
||||||
|
AllReported.inc(datum.qual, datum.N);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
for (int c=0; c < MAX_READ_LENGTH+1; c++) {
|
||||||
|
|
@ -207,21 +242,27 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||||
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||||
}
|
}
|
||||||
|
System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||||
|
System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void qualityDiffVsDinucleotide() {
|
public void qualityDiffVsDinucleotide() {
|
||||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||||
|
RecalData All = new RecalData(0,0,"");
|
||||||
|
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||||
for (int c=0; c < NDINUCS; c++) {
|
for (int c=0; c < NDINUCS; c++) {
|
||||||
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
|
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
|
||||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( RecalData datum: flattenData ) {
|
for ( RecalData datum: flattenData ) {
|
||||||
int dinucIndex = bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1));
|
int dinucIndex = string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
|
||||||
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||||
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||||
|
All.inc(datum.N, datum.B);
|
||||||
|
AllReported.inc(datum.qual, datum.N);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int c=0; c < NDINUCS; c++) {
|
for (int c=0; c < NDINUCS; c++) {
|
||||||
|
|
@ -229,24 +270,37 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||||
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||||
}
|
}
|
||||||
|
System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||||
|
System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void qualityEmpiricalObserved() {
|
public void qualityEmpiricalObserved() {
|
||||||
|
|
||||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||||
ByQualFile.printf("Qrep,Qemp,B,N%n");
|
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
||||||
for (int q=0; q<MAX_QUAL_SCORE+1; q++)
|
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
||||||
|
RecalData All = new RecalData(0,0,"");
|
||||||
|
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||||
|
for (int q=0; q<MAX_QUAL_SCORE+1; q++) {
|
||||||
ByQ.add(new RecalData(-1,q,"-"));
|
ByQ.add(new RecalData(-1,q,"-"));
|
||||||
|
ByQReportedQ.add(new MeanReportedQuality());
|
||||||
|
}
|
||||||
|
|
||||||
for ( RecalData datum: flattenData ){
|
for ( RecalData datum: flattenData ){
|
||||||
ByQ.get(datum.qual).inc(datum.N, datum.B);
|
ByQ.get(datum.qual).inc(datum.N, datum.B);
|
||||||
|
ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
|
||||||
|
All.inc(datum.N, datum.B);
|
||||||
|
AllReported.inc(datum.qual, datum.N);
|
||||||
|
//out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
for (int q=0; q<MAX_QUAL_SCORE; q++) {
|
||||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||||
ByQualFile.printf("%d, %f, %d, %d%n", q, empiricalQual, ByQ.get(q).B, ByQ.get(q).N);
|
ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
|
||||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
|
||||||
}
|
}
|
||||||
|
System.out.printf("Emp-Obs: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||||
|
System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer reduceInit() {
|
public Integer reduceInit() {
|
||||||
|
|
@ -257,8 +311,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int bases2dinucIndex(char prevBase, char base) {
|
public int bases2dinucIndex(char prevBase, char base, boolean Complement) {
|
||||||
|
if (!Complement) {
|
||||||
return nuc2num[prevBase] * 4 + nuc2num[base];
|
return nuc2num[prevBase] * 4 + nuc2num[base];
|
||||||
|
}else{
|
||||||
|
return (3 - nuc2num[prevBase]) * 4 + (3 - nuc2num[base]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public String dinucIndex2bases(int index) {
|
public String dinucIndex2bases(int index) {
|
||||||
|
|
@ -266,6 +324,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
return new String( data );
|
return new String( data );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int string2dinucIndex(String s) {
|
||||||
|
return bases2dinucIndex(s.charAt(0), s.charAt(1), false);
|
||||||
|
}
|
||||||
|
|
||||||
static {
|
static {
|
||||||
nuc2num = new int[128];
|
nuc2num = new int[128];
|
||||||
nuc2num['A'] = 0;
|
nuc2num['A'] = 0;
|
||||||
|
|
@ -286,9 +348,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
Random random_genrator;
|
Random random_genrator;
|
||||||
// Print out data for regression
|
// Print out data for regression
|
||||||
public CovariateCounterWalker() throws FileNotFoundException {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
/*if (CREATE_TRAINING_DATA)
|
||||||
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
for ( int i=0; i<NDINUCS; i++) {
|
||||||
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||||
|
next_dinuc.println("logitQ,pos,indicator");
|
||||||
|
dinuc_outs.add(next_dinuc);
|
||||||
|
} */
|
||||||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -92,15 +92,20 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
byte[] quals = read.getBaseQualities();
|
byte[] quals = read.getBaseQualities();
|
||||||
byte[] recalQuals = new byte[quals.length];
|
byte[] recalQuals = new byte[quals.length];
|
||||||
|
|
||||||
|
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
|
||||||
|
if (read.getReadNegativeStrandFlag()) {
|
||||||
|
bases = BaseUtils.simpleReverseComplement(bases);
|
||||||
|
quals = BaseUtils.reverse(quals);
|
||||||
|
}
|
||||||
|
|
||||||
int numBases = read.getReadLength();
|
int numBases = read.getReadLength();
|
||||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||||
recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
|
//recalQuals[numBases-1] = quals[numBases-1]; // can't change last -- no dinuc
|
||||||
for ( int i = 1; i < numBases-1; i++ ) { // skip first and last base, qual already set because no dinuc
|
for ( int cycle = 1; cycle < numBases; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||||
// Take into account that previous base is the next base in terms of machine chemistry if this is a negative strand
|
// Take into account that previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
|
//int cycle = i; //read.getReadNegativeStrandFlag() ? numBases - i - 1 : i;
|
||||||
String dinuc = String.format("%c%c", bases[i + (read.getReadNegativeStrandFlag() ? 1 : -1)], bases[i]);
|
String dinuc = String.format("%c%c", bases[cycle - 1], bases[cycle]);
|
||||||
byte qual = quals[i];
|
byte qual = quals[cycle];
|
||||||
//System.out.printf("dinuc %c %c%n", bases[i-1], bases[i]);
|
|
||||||
LogisticRegressor regressor = regressors.get(dinuc);
|
LogisticRegressor regressor = regressors.get(dinuc);
|
||||||
byte newQual;
|
byte newQual;
|
||||||
|
|
||||||
|
|
@ -120,9 +125,11 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
newQual = qual;
|
newQual = qual;
|
||||||
}
|
}
|
||||||
|
|
||||||
recalQuals[i] = newQual;
|
recalQuals[cycle] = newQual;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (read.getReadNegativeStrandFlag())
|
||||||
|
recalQuals = BaseUtils.reverse(quals);
|
||||||
//System.out.printf("OLD: %s%n", read.format());
|
//System.out.printf("OLD: %s%n", read.format());
|
||||||
read.setBaseQualities(recalQuals);
|
read.setBaseQualities(recalQuals);
|
||||||
//System.out.printf("NEW: %s%n", read.format());
|
//System.out.printf("NEW: %s%n", read.format());
|
||||||
|
|
|
||||||
|
|
@ -125,4 +125,19 @@ public class BaseUtils {
|
||||||
|
|
||||||
return rcbases;
|
return rcbases;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverse a byte array of bases
|
||||||
|
* @param bases the byte array of bases
|
||||||
|
* @return the reverse of the base byte array
|
||||||
|
*/
|
||||||
|
static public byte[] reverse(byte[] bases) {
|
||||||
|
byte[] rcbases = new byte[bases.length];
|
||||||
|
|
||||||
|
for (int i = 0; i < bases.length; i++) {
|
||||||
|
rcbases[i] = bases[bases.length - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
return rcbases;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,18 @@ public class QualityUtils {
|
||||||
* @return a probability (0.0-1.0)
|
* @return a probability (0.0-1.0)
|
||||||
*/
|
*/
|
||||||
static public double qualToProb(byte qual) {
|
static public double qualToProb(byte qual) {
|
||||||
return 1.0 - Math.pow(10.0, ((double) qual)/-10.0);
|
return 1.0 - qualToErrorProb(qual);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert a quality score to a probability of error. This is the Phred-style
|
||||||
|
* conversion, *not* the Illumina-style conversion (though asymptotically, they're the same).
|
||||||
|
*
|
||||||
|
* @param qual a quality score (0-40)
|
||||||
|
* @return a probability (0.0-1.0)
|
||||||
|
*/
|
||||||
|
static public double qualToErrorProb(byte qual) {
|
||||||
|
return Math.pow(10.0, ((double) qual)/-10.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue