Finishing checking for building
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1027 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d1e25bfe88
commit
7d281296a7
|
|
@ -13,11 +13,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
|
|
@ -53,13 +49,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
|
@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
|
||||||
public boolean outputRawData = true;
|
public boolean outputRawData = true;
|
||||||
|
|
||||||
|
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||||
|
public boolean collapsePos = false;
|
||||||
|
|
||||||
|
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||||
|
public boolean collapseDinuc = false;
|
||||||
|
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||||
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
|
||||||
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||||
//RecalData[][][] data;
|
//RecalData[][][] data;
|
||||||
boolean trackPos = true;
|
|
||||||
boolean trackDinuc = true;
|
|
||||||
|
|
||||||
long counted_sites = 0; // number of sites used to count covariates
|
long counted_sites = 0; // number of sites used to count covariates
|
||||||
long counted_bases = 0; // number of bases used to count covariates
|
long counted_bases = 0; // number of bases used to count covariates
|
||||||
|
|
@ -76,14 +76,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
if( !isSupportedReadGroup(readGroup) )
|
if( !isSupportedReadGroup(readGroup) )
|
||||||
continue;
|
continue;
|
||||||
String rg = readGroup.getReadGroupId();
|
String rg = readGroup.getReadGroupId();
|
||||||
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, trackPos, trackDinuc );
|
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||||
//data.put(rg, new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][NDINUCS]);
|
|
||||||
data.put(rg, manager);
|
data.put(rg, manager);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private RecalData getRecalData(String readGroup, int pos, int qual, int dinuc_index) {
|
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||||
return data.get(readGroup).expandingGetRecalData(pos, qual, dinuc_index, true);
|
byte[] cs = {(byte)prevBase, (byte)base};
|
||||||
|
String s = new String(cs);
|
||||||
|
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<RecalData> getRecalData(String readGroup) {
|
private List<RecalData> getRecalData(String readGroup) {
|
||||||
|
|
@ -141,9 +142,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
int qual = quals[offset];
|
int qual = quals[offset];
|
||||||
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
int dinuc_index = RecalData.bases2dinucIndex(prevBase, base, false);
|
|
||||||
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
|
//System.out.printf("Adding b_offset=%c offset=%d cycle=%d qual=%d dinuc=%c%c ref_match=%c comp=%c%n", (char)read.getReadBases()[offset], offset, cycle, qual, prevBase, base, ref, (char)BaseUtils.simpleComplement(ref));
|
||||||
getRecalData(rg, cycle, qual, dinuc_index).inc(base,ref);
|
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||||
|
if (datum != null) datum.inc(base,ref);
|
||||||
return 1;
|
return 1;
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -168,14 +169,22 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
qualityDiffVsCycle();
|
qualityDiffVsCycle();
|
||||||
qualityDiffVsDinucleotide();
|
qualityDiffVsDinucleotide();
|
||||||
|
|
||||||
out.printf("Counted sites: %d%n", counted_sites);
|
printInfo(out);
|
||||||
out.printf("Counted bases: %d%n", counted_bases);
|
|
||||||
out.printf("Skipped sites: %d%n", skipped_sites);
|
|
||||||
out.printf("Fraction skipped: 1/%.0f%n", (double)counted_sites / skipped_sites);
|
|
||||||
|
|
||||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void printInfo(PrintStream out) {
|
||||||
|
out.printf("# date %s%n", new Date());
|
||||||
|
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||||
|
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||||
|
out.printf("# counted_sites %d%n", counted_sites);
|
||||||
|
out.printf("# counted_bases %d%n", counted_bases);
|
||||||
|
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||||
|
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void writeTrainingData() {
|
void writeTrainingData() {
|
||||||
PrintStream dinuc_out = null;
|
PrintStream dinuc_out = null;
|
||||||
PrintStream table_out = null;
|
PrintStream table_out = null;
|
||||||
|
|
@ -197,10 +206,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
|
|
||||||
if ( outputRawData ) {
|
if ( outputRawData ) {
|
||||||
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
||||||
|
printInfo(table_out);
|
||||||
|
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches");
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||||
if ( datum.N > 0 )
|
if ( datum.N > 0 )
|
||||||
table_out.format("%s%n", datum.toCSVString());
|
table_out.format("%s%n", datum.toCSVString(collapsePos));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,9 +24,9 @@ public class RecalData {
|
||||||
B += incB;
|
B += incB;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getDinucIndex() {
|
//public int getDinucIndex() {
|
||||||
return string2dinucIndex(this.dinuc);
|
// return string2dinucIndex(this.dinuc);
|
||||||
}
|
//}
|
||||||
|
|
||||||
public void inc(char curBase, char ref) {
|
public void inc(char curBase, char ref) {
|
||||||
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
inc(1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1);
|
||||||
|
|
@ -52,8 +52,8 @@ public class RecalData {
|
||||||
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
return String.format("%3d,%s,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, readGroup, dinuc, qual, empiricalQual, qual-empiricalQual, N, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toCSVString() {
|
public String toCSVString(boolean collapsedPos) {
|
||||||
return String.format("%s,%s,%d,%d,%d,%d", readGroup, dinuc, qual, pos, N, B);
|
return String.format("%s,%s,%d,%s,%d,%d", readGroup, dinuc, qual, collapsedPos ? "*" : pos, N, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RecalData fromCSVString(String s) {
|
public static RecalData fromCSVString(String s) {
|
||||||
|
|
@ -61,7 +61,8 @@ public class RecalData {
|
||||||
String rg = vals[0];
|
String rg = vals[0];
|
||||||
String dinuc = vals[1];
|
String dinuc = vals[1];
|
||||||
int qual = Integer.parseInt(vals[2]);
|
int qual = Integer.parseInt(vals[2]);
|
||||||
int pos = Integer.parseInt(vals[3]);
|
|
||||||
|
int pos = vals[3].equals("*") ? 0 : Integer.parseInt(vals[3]);
|
||||||
int N = Integer.parseInt(vals[4]);
|
int N = Integer.parseInt(vals[4]);
|
||||||
int B = Integer.parseInt(vals[5]);
|
int B = Integer.parseInt(vals[5]);
|
||||||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||||
|
|
@ -74,9 +75,12 @@ public class RecalData {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
|
public static int bases2dinucIndex(char prevBase, char base, boolean Complement) {
|
||||||
|
if ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 || BaseUtils.simpleBaseToBaseIndex(base) == -1 )
|
||||||
|
return -1;
|
||||||
|
|
||||||
if (!Complement) {
|
if (!Complement) {
|
||||||
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
|
return BaseUtils.simpleBaseToBaseIndex(prevBase) * 4 + BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
}else{
|
}else {
|
||||||
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
|
return (3 - BaseUtils.simpleBaseToBaseIndex(prevBase)) * 4 + (3 - BaseUtils.simpleBaseToBaseIndex(base));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,21 +1,8 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
package org.broadinstitute.sting.playground.gatk.walkers.recalibration;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
|
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
|
||||||
import net.sf.samtools.SAMRecord;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
|
|
@ -34,7 +21,7 @@ public class RecalDataManager {
|
||||||
public RecalDataManager(String readGroup,
|
public RecalDataManager(String readGroup,
|
||||||
int maxReadLen, int maxQual, int nDinucs,
|
int maxReadLen, int maxQual, int nDinucs,
|
||||||
boolean trackPos, boolean trackDinuc) {
|
boolean trackPos, boolean trackDinuc) {
|
||||||
data = new RecalData[maxReadLen+1][QualityUtils.MAX_QUAL_SCORE+1][nDinucs];
|
data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
|
||||||
this.readGroup = readGroup;
|
this.readGroup = readGroup;
|
||||||
this.trackPos = trackPos;
|
this.trackPos = trackPos;
|
||||||
this.trackDinuc = trackDinuc;
|
this.trackDinuc = trackDinuc;
|
||||||
|
|
@ -46,8 +33,12 @@ public class RecalDataManager {
|
||||||
return trackPos ? pos : 0;
|
return trackPos ? pos : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getDinucIndex(int dinuc) {
|
public int getDinucIndex(String dinuc) {
|
||||||
return trackDinuc ? dinuc : 0;
|
if ( trackDinuc ) {
|
||||||
|
return RecalData.string2dinucIndex(dinuc);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addDatum(RecalData datum) {
|
public void addDatum(RecalData datum) {
|
||||||
|
|
@ -55,27 +46,33 @@ public class RecalDataManager {
|
||||||
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
|
throw new RuntimeException(String.format("BUG: adding incorrect read group datum %s to RecalDataManager for %s", datum.readGroup, this.readGroup));
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( getRecalData(datum.pos, datum.qual, datum.getDinucIndex()) != null )
|
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
|
||||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.getDinucIndex()), datum));
|
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||||
|
|
||||||
|
|
||||||
int posIndex = getPosIndex(datum.pos);
|
int posIndex = getPosIndex(datum.pos);
|
||||||
int internalDinucIndex = getDinucIndex(datum.getDinucIndex());
|
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||||
|
|
||||||
|
if ( internalDinucIndex == -1 ) return;
|
||||||
|
|
||||||
data[posIndex][datum.qual][internalDinucIndex] = datum;
|
data[posIndex][datum.qual][internalDinucIndex] = datum;
|
||||||
flattenData.add(datum);
|
flattenData.add(datum);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RecalData getRecalData(int pos, int qual, int dinuc_index) {
|
public RecalData getRecalData(int pos, int qual, String dinuc) {
|
||||||
return expandingGetRecalData(pos, qual, dinuc_index, false);
|
return expandingGetRecalData(pos, qual, dinuc, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RecalData expandingGetRecalData(int pos, int qual, int dinuc_index, boolean expandP) {
|
public RecalData expandingGetRecalData(int pos, int qual, String dinuc, boolean expandP) {
|
||||||
int posIndex = getPosIndex(pos);
|
int posIndex = getPosIndex(pos);
|
||||||
int internalDinucIndex = getDinucIndex(dinuc_index);
|
int internalDinucIndex = getDinucIndex(dinuc);
|
||||||
|
|
||||||
|
if ( internalDinucIndex == -1 ) return null;
|
||||||
|
|
||||||
RecalData datum = data[posIndex][qual][internalDinucIndex];
|
RecalData datum = data[posIndex][qual][internalDinucIndex];
|
||||||
if ( datum == null && expandP ) {
|
if ( datum == null && expandP ) {
|
||||||
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
|
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
|
||||||
datum = new RecalData(posIndex, qual, readGroup, RecalData.dinucIndex2bases(dinuc_index));
|
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
|
||||||
data[posIndex][qual][internalDinucIndex] = datum;
|
data[posIndex][qual][internalDinucIndex] = datum;
|
||||||
flattenData.add(datum);
|
flattenData.add(datum);
|
||||||
}
|
}
|
||||||
|
|
@ -108,7 +105,7 @@ public class RecalDataManager {
|
||||||
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
|
for ( int qual = 0; qual < QualityUtils.MAX_QUAL_SCORE+1; qual++ ) {
|
||||||
RecalData datum = new RecalData(pos, qual, readGroup, "**");
|
RecalData datum = new RecalData(pos, qual, readGroup, "**");
|
||||||
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
|
for ( int dinucIndex = 0; dinucIndex < nDinucs; dinucIndex++ ) {
|
||||||
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
|
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||||
if ( datum2 != null )
|
if ( datum2 != null )
|
||||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||||
}
|
}
|
||||||
|
|
@ -128,7 +125,7 @@ public class RecalDataManager {
|
||||||
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
|
RecalData datum = new RecalData(-1, qual, readGroup, RecalData.dinucIndex2bases(dinucIndex));
|
||||||
System.out.printf("Aggregating [%s]:%n", datum);
|
System.out.printf("Aggregating [%s]:%n", datum);
|
||||||
for ( int pos = 0; pos < data.length; pos++ ) {
|
for ( int pos = 0; pos < data.length; pos++ ) {
|
||||||
RecalData datum2 = getRecalData(pos, qual, dinucIndex);
|
RecalData datum2 = getRecalData(pos, qual, RecalData.dinucIndex2bases(dinucIndex));
|
||||||
if ( datum2 != null ) {
|
if ( datum2 != null ) {
|
||||||
System.out.printf(" + [%s]:%n", datum2);
|
System.out.printf(" + [%s]:%n", datum2);
|
||||||
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
datum.inc(data[pos][qual][dinucIndex].N, data[pos][qual][dinucIndex].B);
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,7 @@ public class BaseUtils {
|
||||||
*/
|
*/
|
||||||
static public int simpleBaseToBaseIndex(char base) {
|
static public int simpleBaseToBaseIndex(char base) {
|
||||||
switch (base) {
|
switch (base) {
|
||||||
|
case '*': // the wildcard character counts as an A
|
||||||
case 'A':
|
case 'A':
|
||||||
case 'a': return 0;
|
case 'a': return 0;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,8 @@ package org.broadinstitute.sting.utils;
|
||||||
* @author Kiran Garimella
|
* @author Kiran Garimella
|
||||||
*/
|
*/
|
||||||
public class QualityUtils {
|
public class QualityUtils {
|
||||||
|
public final static byte MAX_QUAL_SCORE = 63;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Private constructor. No instantiating this class!
|
* Private constructor. No instantiating this class!
|
||||||
*/
|
*/
|
||||||
|
|
@ -70,7 +72,7 @@ public class QualityUtils {
|
||||||
* @return the capped quality score
|
* @return the capped quality score
|
||||||
*/
|
*/
|
||||||
static public byte boundQual(int qual) {
|
static public byte boundQual(int qual) {
|
||||||
return (byte) Math.min(qual, 63);
|
return (byte) Math.min(qual, MAX_QUAL_SCORE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue