Finishing checking for building

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1027 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-17 14:12:40 +00:00
parent d1e25bfe88
commit 7d281296a7
5 changed files with 67 additions and 52 deletions

View File

@ -13,11 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.HashMap;
import java.util.Collections;
import java.util.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@ -53,13 +49,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
public boolean outputRawData = true;
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
public boolean collapsePos = false;
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
int NDINUCS = 16;
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
//RecalData[][][] data;
boolean trackPos = true;
boolean trackDinuc = true;
long counted_sites = 0; // number of sites used to count covariates
long counted_bases = 0; // number of bases used to count covariates
@ -76,14 +76,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if( !isSupportedReadGroup(readGroup) )
continue;
String rg = readGroup.getReadGroupId();
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, trackPos, trackDinuc );
//data.put(rg, new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][NDINUCS]);
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc );
data.put(rg, manager);
}
}
private RecalData getRecalData(String readGroup, int pos, int qual, int dinuc_index) {
return data.get(readGroup).expandingGetRecalData(pos, qual, dinuc_index, true);
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
byte[] cs = {(byte)prevBase, (byte)base};
String s = new String(cs);
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
}
private List<RecalData> getRecalData(String readGroup) {
@ -141,9 +142,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
int qual = quals[offset];
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
// previous base is the next base in terms of machine chemistry if this is a negative strand
int dinuc_index = RecalData.bases2dinucIndex(prevBase, base, false);
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
getRecalData(rg, cycle, qual, dinuc_index).inc(base,ref);
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref);
return 1;
} else {
return 0;
@ -168,14 +169,22 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
qualityDiffVsCycle();
qualityDiffVsDinucleotide();
out.printf("Counted sites: %d%n", counted_sites);
out.printf("Counted bases: %d%n", counted_bases);
out.printf("Skipped sites: %d%n", skipped_sites);
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
printInfo(out);
if (CREATE_TRAINING_DATA) writeTrainingData();
}
void printInfo(PrintStream out) {
out.printf("# date %s%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
out.printf("# counted_sites %d%n", counted_sites);
out.printf("# counted_bases %d%n", counted_bases);
out.printf("# skipped_sites %d%n", skipped_sites);
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
}
void writeTrainingData() {
PrintStream dinuc_out = null;
PrintStream table_out = null;
@ -197,10 +206,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
if ( outputRawData ) {
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
if ( datum.N > 0 )
table_out.format("%s%n", datum.toCSVString());
table_out.format("%s%n", datum.toCSVString(collapsePos));
}
}
}

View File

@ -24,9 +24,9 @@ public class RecalData {
B += incB;
}
public int getDinucIndex() {
return string2dinucIndex(this.dinuc);
}
//public int getDinucIndex() {
// return string2dinucIndex(this.dinuc);
//}
public void inc(char curBase, char ref) {
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
@ -52,8 +52,8 @@ public class RecalData {
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
}
public String toCSVString() {
return String.format("%s,%s,%d,%d,%d,%d", readGroup, dinuc, qual, pos, N, B);
public String toCSVString(boolean collapsedPos) {
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B);
}
public static RecalData fromCSVString(String s) {
@ -61,7 +61,8 @@ public class RecalData {
String rg = vals[0];
String dinuc = vals[1];
int qual = Integer.parseInt(vals[2]);
int pos = Integer.parseInt(vals[3]);
int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]);
int N = Integer.parseInt(vals[4]);
int B = Integer.parseInt(vals[5]);
RecalData datum = new RecalData(pos, qual, rg, dinuc);
@ -74,9 +75,12 @@ public class RecalData {
}
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 )
return -1;
if (!Complement) {
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
}else{
}else {
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
}
}

View File

@ -1,21 +1,8 @@
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
@ -34,7 +21,7 @@ public class RecalDataManager {
public RecalDataManager(String readGroup,
int maxReadLen, int maxQual, int nDinucs,
boolean trackPos, boolean trackDinuc) {
data = new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][nDinucs];
data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
this.readGroup = readGroup;
this.trackPos = trackPos;
this.trackDinuc = trackDinuc;
@ -46,8 +33,12 @@ public class RecalDataManager {
return trackPos ? pos : 0;
}
public int getDinucIndex(int dinuc) {
return trackDinuc ? dinuc : 0;
public int getDinucIndex(String dinuc) {
if ( trackDinuc ) {
return RecalData.string2dinucIndex(dinuc);
} else {
return 0;
}
}
public void addDatum(RecalData datum) {
@ -55,27 +46,33 @@ public class RecalDataManager {
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
}
if ( getRecalData(datum.pos, datum.qual, datum.getDinucIndex()) != null )
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.getDinucIndex()), datum));
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
int posIndex = getPosIndex(datum.pos);
int internalDinucIndex = getDinucIndex(datum.getDinucIndex());
int internalDinucIndex = getDinucIndex(datum.dinuc);
if ( internalDinucIndex == -1 ) return;
data[posIndex][datum.qual][internalDinucIndex] = datum;
flattenData.add(datum);
}
public RecalData getRecalData(int pos, int qual, int dinuc_index) {
return expandingGetRecalData(pos, qual, dinuc_index, false);
public RecalData getRecalData(int pos, int qual, String dinuc) {
return expandingGetRecalData(pos, qual, dinuc, false);
}
public RecalData expandingGetRecalData(int pos, int qual, int dinuc_index, boolean expandP) {
public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) {
int posIndex = getPosIndex(pos);
int internalDinucIndex = getDinucIndex(dinuc_index);
int internalDinucIndex = getDinucIndex(dinuc);
if ( internalDinucIndex == -1 ) return null;
RecalData datum = data[posIndex][qual][internalDinucIndex];
if ( datum == null && expandP ) {
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
datum = new RecalData(posIndex, qual, readGroup, RecalData.dinucIndex2bases(dinuc_index));
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
data[posIndex][qual][internalDinucIndex] = datum;
flattenData.add(datum);
}
@ -108,7 +105,7 @@ public class RecalDataManager {
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
RecalData datum = new RecalData(pos, qual, readGroup, "**");
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
if ( datum2 != null )
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
}
@ -128,7 +125,7 @@ public class RecalDataManager {
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
System.out.printf("Aggregating [%s]:%n", datum);
for ( int pos = 0; pos < data.length; pos++ ) {
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
if ( datum2 != null ) {
System.out.printf(" + [%s]:%n", datum2);
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);

View File

@ -54,6 +54,7 @@ public class BaseUtils {
*/
static public int simpleBaseToBaseIndex(char base) {
switch (base) {
case '*': // the wildcard character counts as an A
case 'A':
case 'a': return 0;

View File

@ -7,6 +7,8 @@ package org.broadinstitute.sting.utils;
* @author Kiran Garimella
*/
public class QualityUtils {
public final static byte MAX_QUAL_SCORE = 63;
/**
* Private constructor. No instantiating this class!
*/
@ -70,7 +72,7 @@ public class QualityUtils {
* @return the capped quality score
*/
static public byte boundQual(int qual) {
return (byte) Math.min(qual, 63);
return (byte) Math.min(qual, MAX_QUAL_SCORE);
}
/**