Updated CovariateCounterWalker to be read group aware

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@889 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-06-03 10:06:06 +00:00
parent 7755476d36
commit dfe464cd81
1 changed files with 128 additions and 116 deletions

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers; package org.broadinstitute.sting.playground.gatk.walkers;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
@ -12,6 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.HashMap;
import java.io.PrintStream; import java.io.PrintStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@ -39,9 +41,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public String READ_GROUP = "none"; public String READ_GROUP = "none";
int NDINUCS = 16; int NDINUCS = 16;
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
//RecalData[][][] data = new RecalData;
ArrayList<RecalData> flattenData = new ArrayList<RecalData>(); ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
//RecalData[][][] data;
static int nuc2num[]; static int nuc2num[];
static char num2nuc[]; static char num2nuc[];
@ -87,18 +89,19 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
} }
public void initialize() { public void initialize() {
for ( int i = 0; i < MAX_READ_LENGTH+1; i++) { for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) { data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
for ( int k = 0; k < NDINUCS; k++) { for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
String dinuc = dinucIndex2bases(k); for ( int j = 0; j < MAX_QUAL_SCORE+1; j++) {
RecalData datum = new RecalData(i, j, dinuc); for ( int k = 0; k < NDINUCS; k++) {
data[i][j][k] = datum; String dinuc = dinucIndex2bases(k);
flattenData.add(datum); RecalData datum = new RecalData(i, j, dinuc);
data.get(readGroup.getReadGroupId())[i][j][k] = datum;
flattenData.add(datum);
}
} }
} }
} }
} }
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
@ -108,8 +111,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
List<Integer> offsets = context.getOffsets(); List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) { for (int i =0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i); SAMRecord read = reads.get(i);
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
if (//read.getHeader().getReadGroup((String)read.getAttribute("RG")).getAttribute("PL") == "ILLUMINA" && if ( readGroup.getAttribute("PL") == "ILLUMINA" &&
!read.getReadNegativeStrandFlag() && !read.getReadNegativeStrandFlag() &&
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) && (read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
@ -130,7 +133,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
// Convert offset into cycle position which means reversing the position of reads on the negative strand // Convert offset into cycle position which means reversing the position of reads on the negative strand
//int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; //int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
//data[cycle][qual][dinuc_index].inc(base,ref); //data[cycle][qual][dinuc_index].inc(base,ref);
data[offset][qual][dinuc_index].inc(base,ref); data.get(readGroup.getReadGroupId())[offset][qual][dinuc_index].inc(base,ref);
} }
} }
} }
@ -168,25 +171,28 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
} }
void writeTrainingData() { void writeTrainingData() {
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
PrintStream dinuc_out = null;
try {
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts."+dinucIndex2bases(dinuc_index)+".csv");
dinuc_out.println("logitQ,pos,indicator,count");
for ( RecalData datum: flattenData ) { for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
if (string2dinucIndex(datum.dinuc) == dinuc_index) { for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
if ((datum.N - datum.B) > 0) PrintStream dinuc_out = null;
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B); try {
if (datum.B > 0) dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.RG_"+readGroup.getReadGroupId()+"."+dinucIndex2bases(dinuc_index)+".csv");
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 1, datum.B); dinuc_out.println("logitQ,pos,indicator,count");
for ( RecalData datum: flattenData ) {
if (string2dinucIndex(datum.dinuc) == dinuc_index) {
if ((datum.N - datum.B) > 0)
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0)
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 1, datum.B);
}
} }
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException: " + e.getMessage());
} finally {
if (dinuc_out != null)
dinuc_out.close();
} }
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException: " + e.getMessage());
} finally {
if (dinuc_out != null)
dinuc_out.close();
} }
} }
} }
@ -210,107 +216,113 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
} }
public void qualityDiffVsCycle() { public void qualityDiffVsCycle() {
PrintStream ByCycleFile = null; for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
try { PrintStream ByCycleFile = null;
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv"); try {
} catch (FileNotFoundException e){ ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv");
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); } catch (FileNotFoundException e){
System.exit(1); System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
} System.exit(1);
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>(); }
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>(); ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n"); ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
RecalData All = new RecalData(0,0,""); ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
MeanReportedQuality AllReported = new MeanReportedQuality(); RecalData All = new RecalData(0,0,"");
for (int c=0; c < MAX_READ_LENGTH+1; c++) { MeanReportedQuality AllReported = new MeanReportedQuality();
ByCycle.add(new RecalData(c, -1, "-")); for (int c=0; c < MAX_READ_LENGTH+1; c++) {
ByCycleReportedQ.add(new MeanReportedQuality()); ByCycle.add(new RecalData(c, -1, "-"));
} ByCycleReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: flattenData ) { for ( RecalData datum: flattenData ) {
ByCycle.get(datum.pos).inc(datum.N, datum.B); ByCycle.get(datum.pos).inc(datum.N, datum.B);
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N); ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
All.inc(datum.N, datum.B); All.inc(datum.N, datum.B);
AllReported.inc(datum.qual, datum.N); AllReported.inc(datum.qual, datum.N);
} }
for (int c=0; c < MAX_READ_LENGTH+1; c++) { for (int c=0; c < MAX_READ_LENGTH+1; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result(); double reportedQual = ByCycleReportedQ.get(c).result();
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
}
} }
System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); //System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
System.out.printf("Qrep=%.1f%n", AllReported.result()); //System.out.printf("Qrep=%.1f%n", AllReported.result());
} }
public void qualityDiffVsDinucleotide() { public void qualityDiffVsDinucleotide() {
PrintStream ByDinucFile = null; for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
try { PrintStream ByDinucFile = null;
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv"); try {
} catch (FileNotFoundException e){ ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv");
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); } catch (FileNotFoundException e){
System.exit(1); System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
} System.exit(1);
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>(); }
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>(); ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
RecalData All = new RecalData(0,0,""); ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
MeanReportedQuality AllReported = new MeanReportedQuality(); RecalData All = new RecalData(0,0,"");
for (int c=0; c < NDINUCS; c++) { MeanReportedQuality AllReported = new MeanReportedQuality();
ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c))); for (int c=0; c < NDINUCS; c++) {
ByCycleReportedQ.add(new MeanReportedQuality()); ByCycle.add(new RecalData(-1, -1, dinucIndex2bases(c)));
} ByCycleReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: flattenData ) { for ( RecalData datum: flattenData ) {
int dinucIndex = string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); int dinucIndex = string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
ByCycle.get(dinucIndex).inc(datum.N, datum.B); ByCycle.get(dinucIndex).inc(datum.N, datum.B);
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
All.inc(datum.N, datum.B); All.inc(datum.N, datum.B);
AllReported.inc(datum.qual, datum.N); AllReported.inc(datum.qual, datum.N);
} }
for (int c=0; c < NDINUCS; c++) { for (int c=0; c < NDINUCS; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result(); double reportedQual = ByCycleReportedQ.get(c).result();
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
}
} }
System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); //System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
System.out.printf("Qrep=%.1f%n", AllReported.result()); //System.out.printf("Qrep=%.1f%n", AllReported.result());
} }
public void qualityEmpiricalObserved() { public void qualityEmpiricalObserved() {
PrintStream ByQualFile = null; for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
try { PrintStream ByQualFile = null;
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv"); try {
} catch (FileNotFoundException e){ ByQualFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv");
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT); } catch (FileNotFoundException e){
System.exit(1); System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
} System.exit(1);
ArrayList<RecalData> ByQ = new ArrayList<RecalData>(); }
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>(); ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n"); ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
RecalData All = new RecalData(0,0,""); ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
MeanReportedQuality AllReported = new MeanReportedQuality(); RecalData All = new RecalData(0,0,"");
for (int q=0; q<MAX_QUAL_SCORE+1; q++) { MeanReportedQuality AllReported = new MeanReportedQuality();
ByQ.add(new RecalData(-1,q,"-")); for (int q=0; q<MAX_QUAL_SCORE+1; q++) {
ByQReportedQ.add(new MeanReportedQuality()); ByQ.add(new RecalData(-1,q,"-"));
} ByQReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: flattenData ){ for ( RecalData datum: flattenData ){
ByQ.get(datum.qual).inc(datum.N, datum.B); ByQ.get(datum.qual).inc(datum.N, datum.B);
ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N); ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
All.inc(datum.N, datum.B); All.inc(datum.N, datum.B);
AllReported.inc(datum.qual, datum.N); AllReported.inc(datum.qual, datum.N);
//out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc); //out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
} }
for (int q=0; q<MAX_QUAL_SCORE; q++) { for (int q=0; q<MAX_QUAL_SCORE; q++) {
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N); double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N); ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n //out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
}
} }
System.out.printf("Emp-Obs: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N)); //System.out.printf("Emp-Obs: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
System.out.printf("Qrep=%.1f%n", AllReported.result()); //System.out.printf("Qrep=%.1f%n", AllReported.result());
} }
public Integer reduceInit() { public Integer reduceInit() {