Finishing checking for building
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1027 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d1e25bfe88
commit
7d281296a7
|
|
@ -13,11 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.HashMap;
|
||||
import java.util.Collections;
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
|
|
@ -53,13 +49,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
|
||||
public boolean outputRawData = true;
|
||||
|
||||
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||
public boolean collapsePos = false;
|
||||
|
||||
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||
public boolean collapseDinuc = false;
|
||||
|
||||
int NDINUCS = 16;
|
||||
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
||||
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||
//RecalData[][][] data;
|
||||
boolean trackPos = true;
|
||||
boolean trackDinuc = true;
|
||||
|
||||
long counted_sites = 0; // number of sites used to count covariates
|
||||
long counted_bases = 0; // number of bases used to count covariates
|
||||
|
|
@ -76,14 +76,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
if( !isSupportedReadGroup(readGroup) )
|
||||
continue;
|
||||
String rg = readGroup.getReadGroupId();
|
||||
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, trackPos, trackDinuc );
|
||||
//data.put(rg, new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][NDINUCS]);
|
||||
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||
data.put(rg, manager);
|
||||
}
|
||||
}
|
||||
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, int dinuc_index) {
|
||||
return data.get(readGroup).expandingGetRecalData(pos, qual, dinuc_index, true);
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
byte[] cs = {(byte)prevBase, (byte)base};
|
||||
String s = new String(cs);
|
||||
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||
}
|
||||
|
||||
private List<RecalData> getRecalData(String readGroup) {
|
||||
|
|
@ -141,9 +142,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
int qual = quals[offset];
|
||||
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||
int dinuc_index = RecalData.bases2dinucIndex(prevBase, base, false);
|
||||
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
|
||||
getRecalData(rg, cycle, qual, dinuc_index).inc(base,ref);
|
||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||
if (datum != null) datum.inc(base,ref);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
|
|
@ -168,14 +169,22 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
qualityDiffVsCycle();
|
||||
qualityDiffVsDinucleotide();
|
||||
|
||||
out.printf("Counted sites: %d%n", counted_sites);
|
||||
out.printf("Counted bases: %d%n", counted_bases);
|
||||
out.printf("Skipped sites: %d%n", skipped_sites);
|
||||
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||
printInfo(out);
|
||||
|
||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||
}
|
||||
|
||||
void printInfo(PrintStream out) {
|
||||
out.printf("# date %s%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
out.printf("# counted_sites %d%n", counted_sites);
|
||||
out.printf("# counted_bases %d%n", counted_bases);
|
||||
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||
}
|
||||
|
||||
|
||||
void writeTrainingData() {
|
||||
PrintStream dinuc_out = null;
|
||||
PrintStream table_out = null;
|
||||
|
|
@ -197,10 +206,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
if ( outputRawData ) {
|
||||
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
||||
printInfo(table_out);
|
||||
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
if ( datum.N > 0 )
|
||||
table_out.format("%s%n", datum.toCSVString());
|
||||
table_out.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,9 +24,9 @@ public class RecalData {
|
|||
B += incB;
|
||||
}
|
||||
|
||||
public int getDinucIndex() {
|
||||
return string2dinucIndex(this.dinuc);
|
||||
}
|
||||
//public int getDinucIndex() {
|
||||
// return string2dinucIndex(this.dinuc);
|
||||
//}
|
||||
|
||||
public void inc(char curBase, char ref) {
|
||||
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
||||
|
|
@ -52,8 +52,8 @@ public class RecalData {
|
|||
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||
}
|
||||
|
||||
public String toCSVString() {
|
||||
return String.format("%s,%s,%d,%d,%d,%d", readGroup, dinuc, qual, pos, N, B);
|
||||
public String toCSVString(boolean collapsedPos) {
|
||||
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B);
|
||||
}
|
||||
|
||||
public static RecalData fromCSVString(String s) {
|
||||
|
|
@ -61,7 +61,8 @@ public class RecalData {
|
|||
String rg = vals[0];
|
||||
String dinuc = vals[1];
|
||||
int qual = Integer.parseInt(vals[2]);
|
||||
int pos = Integer.parseInt(vals[3]);
|
||||
|
||||
int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]);
|
||||
int N = Integer.parseInt(vals[4]);
|
||||
int B = Integer.parseInt(vals[5]);
|
||||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||
|
|
@ -74,9 +75,12 @@ public class RecalData {
|
|||
}
|
||||
|
||||
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
|
||||
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 )
|
||||
return -1;
|
||||
|
||||
if (!Complement) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
|
||||
}else{
|
||||
}else {
|
||||
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,21 +1,8 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -34,7 +21,7 @@ public class RecalDataManager {
|
|||
public RecalDataManager(String readGroup,
|
||||
int maxReadLen, int maxQual, int nDinucs,
|
||||
boolean trackPos, boolean trackDinuc) {
|
||||
data = new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][nDinucs];
|
||||
data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
|
||||
this.readGroup = readGroup;
|
||||
this.trackPos = trackPos;
|
||||
this.trackDinuc = trackDinuc;
|
||||
|
|
@ -46,8 +33,12 @@ public class RecalDataManager {
|
|||
return trackPos ? pos : 0;
|
||||
}
|
||||
|
||||
public int getDinucIndex(int dinuc) {
|
||||
return trackDinuc ? dinuc : 0;
|
||||
public int getDinucIndex(String dinuc) {
|
||||
if ( trackDinuc ) {
|
||||
return RecalData.string2dinucIndex(dinuc);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public void addDatum(RecalData datum) {
|
||||
|
|
@ -55,27 +46,33 @@ public class RecalDataManager {
|
|||
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
|
||||
}
|
||||
|
||||
if ( getRecalData(datum.pos, datum.qual, datum.getDinucIndex()) != null )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.getDinucIndex()), datum));
|
||||
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||
|
||||
|
||||
int posIndex = getPosIndex(datum.pos);
|
||||
int internalDinucIndex = getDinucIndex(datum.getDinucIndex());
|
||||
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||
|
||||
if ( internalDinucIndex == -1 ) return;
|
||||
|
||||
data[posIndex][datum.qual][internalDinucIndex] = datum;
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
||||
public RecalData getRecalData(int pos, int qual, int dinuc_index) {
|
||||
return expandingGetRecalData(pos, qual, dinuc_index, false);
|
||||
public RecalData getRecalData(int pos, int qual, String dinuc) {
|
||||
return expandingGetRecalData(pos, qual, dinuc, false);
|
||||
}
|
||||
|
||||
public RecalData expandingGetRecalData(int pos, int qual, int dinuc_index, boolean expandP) {
|
||||
public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) {
|
||||
int posIndex = getPosIndex(pos);
|
||||
int internalDinucIndex = getDinucIndex(dinuc_index);
|
||||
int internalDinucIndex = getDinucIndex(dinuc);
|
||||
|
||||
if ( internalDinucIndex == -1 ) return null;
|
||||
|
||||
RecalData datum = data[posIndex][qual][internalDinucIndex];
|
||||
if ( datum == null && expandP ) {
|
||||
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
|
||||
datum = new RecalData(posIndex, qual, readGroup, RecalData.dinucIndex2bases(dinuc_index));
|
||||
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
|
||||
data[posIndex][qual][internalDinucIndex] = datum;
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
|
@ -108,7 +105,7 @@ public class RecalDataManager {
|
|||
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
|
||||
RecalData datum = new RecalData(pos, qual, readGroup, "**");
|
||||
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
|
||||
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
|
||||
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||
if ( datum2 != null )
|
||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||
}
|
||||
|
|
@ -128,7 +125,7 @@ public class RecalDataManager {
|
|||
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
|
||||
System.out.printf("Aggregating [%s]:%n", datum);
|
||||
for ( int pos = 0; pos < data.length; pos++ ) {
|
||||
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
|
||||
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||
if ( datum2 != null ) {
|
||||
System.out.printf(" + [%s]:%n", datum2);
|
||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||
|
|
|
|||
|
|
@ -54,6 +54,7 @@ public class BaseUtils {
|
|||
*/
|
||||
static public int simpleBaseToBaseIndex(char base) {
|
||||
switch (base) {
|
||||
case '*': // the wildcard character counts as an A
|
||||
case 'A':
|
||||
case 'a': return 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,8 @@ package org.broadinstitute.sting.utils;
|
|||
* @author Kiran Garimella
|
||||
*/
|
||||
public class QualityUtils {
|
||||
public final static byte MAX_QUAL_SCORE = 63;
|
||||
|
||||
/**
|
||||
* Private constructor. No instantiating this class!
|
||||
*/
|
||||
|
|
@ -70,7 +72,7 @@ public class QualityUtils {
|
|||
* @return the capped quality score
|
||||
*/
|
||||
static public byte boundQual(int qual) {
|
||||
return (byte) Math.min(qual, 63);
|
||||
return (byte) Math.min(qual, MAX_QUAL_SCORE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue