Another temp checking for rearranging things

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1048 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-18 21:04:36 +00:00
parent 3c40db260d
commit ca8a3bd85e
5 changed files with 167 additions and 61 deletions

View File

@ -55,7 +55,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
int NDINUCS = 16;
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
@ -76,7 +75,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if( !isSupportedReadGroup(readGroup) )
continue;
String rg = readGroup.getReadGroupId();
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc );
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
data.put(rg, manager);
}
}
@ -142,7 +141,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int qual = quals[offset];
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
// previous base is the next base in terms of machine chemistry if this is a negative strand
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
// if ( qual == 2 )
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Adding neg?=%b b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c name=%s%n",
// read.getReadNegativeStrandFlag(), (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base,
// ref, (char)BaseUtils.simpleComplement(ref), read.getReadName());
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref);
return 1;
@ -174,7 +177,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if (CREATE_TRAINING_DATA) writeTrainingData();
}
void printInfo(PrintStream out) {
private void printInfo(PrintStream out) {
out.printf("# date %s%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
@ -185,16 +188,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
void writeTrainingData() {
private void writeTrainingData() {
PrintStream dinuc_out = null;
PrintStream table_out = null;
try {
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( int dinuc_index=0; dinuc_index<NDINUCS; dinuc_index++) {
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( RecalData.string2dinucIndex(datum.dinuc) == dinuc_index ) {
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
if ((datum.N - datum.B) > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0)
@ -207,7 +210,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if ( outputRawData ) {
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches");
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( datum.N > 0 )
@ -295,20 +298,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
MeanReportedQuality AllReported = new MeanReportedQuality();
for (int c=0; c < NDINUCS; c++) {
for (int c=0; c < RecalData.NDINUCS; c++) {
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c)));
ByCycleReportedQ.add(new MeanReportedQuality());
}
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
int dinucIndex = RecalData.string2dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
All.inc(datum.N, datum.B);
AllReported.inc(datum.qual, datum.N);
}
for (int c=0; c < NDINUCS; c++) {
for (int c=0; c < RecalData.NDINUCS; c++) {
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
double reportedQual = ByCycleReportedQ.get(c).result();
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
@ -347,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
ByQualFile.printf("%d, %f, %.0f, %d, %d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
ByQualFile.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
}
}

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.apache.log4j.Logger;
@ -12,6 +14,7 @@ import java.io.File;
import java.io.FileNotFoundException;
@WalkerName("LogisticRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="logisticParams", doc="logistic params file", required=true)
public String logisticParamsFile;
@ -185,7 +188,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
}
if (read.getReadNegativeStrandFlag())
recalQuals = BaseUtils.reverse(quals);
recalQuals = BaseUtils.reverse(recalQuals);
//System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//System.out.printf("NEW: %s%n", read.format());

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils;
public class RecalData {
@ -53,7 +52,7 @@ public class RecalData {
}
public String toCSVString(boolean collapsedPos) {
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B);
return String.format("%s,%s,%d,%s,%d,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B, empiricalQualByte());
}
public static RecalData fromCSVString(String s) {
@ -74,25 +73,26 @@ public class RecalData {
return datum;
}
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 )
return -1;
if (!Complement) {
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
}else {
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
}
public static int bases2dinucIndex(char prevBase, char base) {
int pbI = BaseUtils.simpleBaseToBaseIndex(prevBase);
int bI = BaseUtils.simpleBaseToBaseIndex(base);
return (pbI == -1 || bI == -1) ? -1 : pbI * 4 + bI;
}
public final static int NDINUCS = 16;
public static String dinucIndex2bases(int index) {
char data[] = {BaseUtils.baseIndexToSimpleBase(index / 4), BaseUtils.baseIndexToSimpleBase(index % 4)};
return new String( data );
}
public static int string2dinucIndex(String s) {
return bases2dinucIndex(s.charAt(0), s.charAt(1), false);
public static int dinucIndex(String s) {
return bases2dinucIndex(s.charAt(0), s.charAt(1));
}
public static int dinucIndex(byte prevBase, byte base) {
return bases2dinucIndex((char)prevBase, (char)base);
}
//
// private static int nuc2num[];
// private static char num2nuc[];

View File

@ -33,12 +33,21 @@ public class RecalDataManager {
return trackPos ? pos : 0;
}
public int canonicalPos(int cycle) {
return getPosIndex(cycle);
}
public int getDinucIndex(String dinuc) {
if ( trackDinuc ) {
return RecalData.string2dinucIndex(dinuc);
} else {
return 0;
}
return trackDinuc ? RecalData.dinucIndex(dinuc) : 0;
}
public int getDinucIndex(byte prevBase, byte base) {
return trackDinuc ? RecalData.dinucIndex(prevBase, base) : 0;
}
public String canonicalDinuc(String dinuc) {
return trackDinuc ? dinuc : "**";
}
public void addDatum(RecalData datum) {

View File

@ -3,16 +3,21 @@ package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.playground.gatk.walkers.recalibration.RecalData;
import org.apache.log4j.Logger;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
@WalkerName("TableRecalibration")
@Requires({DataSource.READS, DataSource.REFERENCE})
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
@Argument(shortName="params", doc="CountCovariates params file", required=true)
public String paramsFile;
@ -27,26 +32,52 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
@Argument(shortName="serial", doc="", required=false)
//@Argument(shortName="serial", doc="", required=false)
boolean serialRecalibration = false;
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
public void initialize() {
try {
System.out.printf("Reading data...%n");
List<RecalData> data = new ArrayList<RecalData>();
boolean collapsedPos = false;
boolean collapsedDinuc = false;
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : lines ) {
// rg,dn,logitQ,pos,indicator,count
// SRR007069,AA,28,1,0,2
data.add(RecalData.fromCSVString(line));
//System.out.printf("Reading line %s%n", line);
if ( HEADER_PATTERN.matcher(line).matches() )
continue;
if ( COMMENT_PATTERN.matcher(line).matches() ) {
collapsedPos = parseCommentLine(COLLAPSED_POS_PATTERN, line, collapsedPos);
collapsedDinuc = parseCommentLine(COLLAPSED_DINUC_PATTERN, line, collapsedDinuc);
//System.out.printf("Collapsed %b %b%n", collapsedPos, collapsedDinuc);
}
else {
data.add(RecalData.fromCSVString(line));
}
}
initializeCache(data);
initializeCache(data, collapsedPos, collapsedDinuc);
} catch ( FileNotFoundException e ) {
Utils.scareUser("Cannot read/find parameters file " + paramsFile);
}
}
private void initializeCache(List<RecalData> data) {
private boolean parseCommentLine(Pattern pat, String line, boolean flag) {
Matcher m = pat.matcher(line);
if ( m.matches() ) {
//System.out.printf("Parsing %s%n", m.group(1));
flag = Boolean.parseBoolean(m.group(1));
}
return flag;
}
private void initializeCache(List<RecalData> data, boolean collapsedPos, boolean collapsedDinuc ) {
Set<String> readGroups = new HashSet<String>();
Set<String> dinucs = new HashSet<String>();
int maxPos = -1;
@ -68,7 +99,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// initialize the data structure
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
for ( String readGroup : readGroups ) {
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), true, true);
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
managers.put(readGroup, manager);
}
@ -90,13 +121,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
public SAMRecord map(char[] ref, SAMRecord read) {
//if ( read.getReadLength() > maxReadLen ) {
// throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
//}
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] recalQuals = new byte[quals.length];
// Since we want machine direction reads not corrected positive strand reads, rev comp any negative strand reads
if (read.getReadNegativeStrandFlag()) {
@ -104,28 +130,34 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
quals = BaseUtils.reverse(quals);
}
String readGroup = read.getAttribute("RG").toString();
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
if (read.getReadNegativeStrandFlag())
recalQuals = BaseUtils.reverse(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("NEW: %s%n", read.format());
return read;
}
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
byte[] recalQuals = new byte[quals.length];
RecalMapping mapper = cache.get(readGroup);
int numBases = read.getReadLength();
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int cycle = 1; cycle < numBases; cycle++ ) { // skip first and last base, qual already set because no dinuc
// Take into account that previous base is the next base in terms of machine chemistry if
// this is a negative strand
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
byte qual = quals[cycle];
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
// System.out.printf("Processing cycle=%d qual=%d: neg?=%b => %d at %s%n",
// cycle, qual, read.getReadNegativeStrandFlag(), newQual, read.getReadName());
recalQuals[cycle] = newQual;
//System.out.printf("Mapping %d => %d%n", qual, newQual);
}
if (read.getReadNegativeStrandFlag())
recalQuals = BaseUtils.reverse(quals);
//System.out.printf("OLD: %s%n", read.format());
read.setBaseQualities(recalQuals);
//System.out.printf("NEW: %s%n", read.format());
return read;
return recalQuals;
}
public void onTraversalDone(SAMFileWriter output) {
@ -164,9 +196,58 @@ interface RecalMapping {
}
class CombinatorialRecalMapping implements RecalMapping {
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
ArrayList<byte[][]> cache;
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
this.manager = manager;
// initialize the data structure
cache = new ArrayList<byte[][]>(RecalData.NDINUCS);
for ( String dinuc : dinucs ) {
cache.add(new byte[maxPos+1][maxQReported+1]);
}
for ( RecalData datum : manager.getAll() ) {
//System.out.printf("Adding datum %s%n", datum);
byte [][] table = cache.get(this.manager.getDinucIndex(datum.dinuc));
if ( table[datum.pos][datum.qual] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
}
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
int pos = manager.canonicalPos(cycle);
int index = this.manager.getDinucIndex(prevBase, base);
byte[][] dataTable = index == -1 ? null : cache.get(index);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
}
}
/*class CombinatorialRecalMapping implements RecalMapping {
HashMap<String, byte[][]> cache = new HashMap<String, byte[][]>();
RecalDataManager manager;
public CombinatorialRecalMapping(RecalDataManager manager, Set<String> dinucs, int maxPos, int maxQReported ) {
this.manager = manager;
// initialize the data structure
for ( String dinuc : dinucs ) {
byte[][] table = new byte[maxPos+1][maxQReported+1];
@ -180,22 +261,32 @@ class CombinatorialRecalMapping implements RecalMapping {
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[datum.pos][datum.qual] = datum.empiricalQualByte();
// System.out.printf("Binding %d %d => %d%n", datum.pos, datum.qual, datum.empiricalQualByte());
}
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//System.out.printf("Lookup RG=%s prevBase=%c base=%c cycle=%d qual=%d%n", readGroup, prevBase, base, cycle, qual);
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
byte[] bp = {prevBase, base};
String dinuc = new String(bp);
String dinuc = manager.canonicalDinuc(new String(bp));
int pos = manager.canonicalPos(cycle);
byte[][] dataTable = cache.get(dinuc);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %s", readGroup, dinuc));
return dataTable != null && cycle < dataTable.length ? dataTable[cycle][qual] : qual;
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
}
}
}*/
class SerialRecalMapping implements RecalMapping {
// mapping from dinuc x Q => new Q