Updated version of the recalibration tool
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1060 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
aef519b427
commit
8ac40e8e2d
|
|
@ -19,8 +19,8 @@ import java.io.FileNotFoundException;
|
|||
|
||||
@WalkerName("CountCovariates")
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||
@Argument(fullName="maxReadLen", shortName="mrl", doc="max read length", required=false)
|
||||
public int maxReadLen = 101;
|
||||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||
public int buggyMaxReadLen = 100000;
|
||||
|
||||
//@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false)
|
||||
//public int MAX_QUAL_SCORE = 63;
|
||||
|
|
@ -28,11 +28,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
|
||||
public String OUTPUT_FILEROOT = "output";
|
||||
|
||||
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
||||
public boolean CREATE_TRAINING_DATA = true;
|
||||
//@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
||||
//public boolean CREATE_TRAINING_DATA = true;
|
||||
|
||||
@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
|
||||
public float DOWNSAMPLE_FRACTION=1.0f;
|
||||
//@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
|
||||
//public float DOWNSAMPLE_FRACTION=1.0f;
|
||||
|
||||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||
public int MIN_MAPPING_QUALITY = 0;
|
||||
|
|
@ -47,8 +47,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
public List<String> platforms = Collections.singletonList("*");
|
||||
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||
|
||||
@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
|
||||
public boolean outputRawData = true;
|
||||
//@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
|
||||
//public boolean outputRawData = true;
|
||||
|
||||
@Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations")
|
||||
public boolean writeDetailedAnalysisFiles = false;
|
||||
|
||||
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||
public boolean collapsePos = false;
|
||||
|
|
@ -76,9 +79,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
if( !isSupportedReadGroup(readGroup) )
|
||||
continue;
|
||||
String rg = readGroup.getReadGroupId();
|
||||
RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||
//RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||
RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc );
|
||||
data.put(rg, manager);
|
||||
}
|
||||
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
|
||||
}
|
||||
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
|
|
@ -100,7 +105,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
for (int i =0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
|
||||
if ( read.getReadLength() > maxReadLen ) {
|
||||
if ( read.getReadLength() > buggyMaxReadLen ) {
|
||||
throw new RuntimeException("Expectedly long read, please increase maxium read len with maxReadLen parameter: " + read.format());
|
||||
}
|
||||
|
||||
|
|
@ -140,7 +145,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
int qual = quals[offset];
|
||||
if ( qual > 0 && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||
if ( qual > 0 ) { // && qual <= QualityUtils.MAX_QUAL_SCORE ) {
|
||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||
// if ( qual == 2 )
|
||||
//if ( read.getReadName().equals("30PTAAAXX090126:5:14:132:764#0") )
|
||||
|
|
@ -156,6 +161,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
/*
|
||||
PrintStream covars_out;
|
||||
try {
|
||||
covars_out = new PrintStream(OUTPUT_FILEROOT+".covars.out");
|
||||
|
|
@ -168,14 +174,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
} catch (FileNotFoundException e) {
|
||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||
}
|
||||
|
||||
qualityEmpiricalObserved();
|
||||
qualityDiffVsCycle();
|
||||
qualityDiffVsDinucleotide();
|
||||
*/
|
||||
|
||||
printInfo(out);
|
||||
|
||||
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||
writeTrainingData();
|
||||
writeAnalysisData();
|
||||
}
|
||||
|
||||
private void printInfo(PrintStream out) {
|
||||
|
|
@ -190,8 +193,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
|
||||
|
||||
private void writeTrainingData() {
|
||||
out.printf("Writing raw recalibration data%n");
|
||||
writeRawTable();
|
||||
out.printf("...done%n");
|
||||
|
||||
out.printf("Writing logistic recalibration data%n");
|
||||
writeLogisticRecalibrationTable();
|
||||
out.printf("...done%n");
|
||||
}
|
||||
|
||||
private void writeLogisticRecalibrationTable() {
|
||||
PrintStream dinuc_out = null;
|
||||
PrintStream table_out = null;
|
||||
try {
|
||||
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
||||
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
||||
|
|
@ -207,28 +219,62 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( outputRawData ) {
|
||||
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
||||
printInfo(table_out);
|
||||
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
if ( datum.N > 0 )
|
||||
table_out.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (FileNotFoundException e) {
|
||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||
return;
|
||||
}
|
||||
finally {
|
||||
if (dinuc_out != null) dinuc_out.close();
|
||||
if (table_out != null) table_out.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeRawTable() {
|
||||
PrintStream table_out = null;
|
||||
try {
|
||||
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
|
||||
printInfo(table_out);
|
||||
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
if ( datum.N > 0 )
|
||||
table_out.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e ) {
|
||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||
}
|
||||
finally {
|
||||
if (table_out != null) table_out.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Analysis data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private void writeAnalysisData() {
|
||||
if ( writeDetailedAnalysisFiles ) {
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
PrintStream DiffVsCycleFile = null, DiffVsDinucFile = null, EmpObsFile = null;
|
||||
try {
|
||||
DiffVsCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv");
|
||||
DiffVsDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv");
|
||||
EmpObsFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv");
|
||||
|
||||
qualityEmpiricalObserved(EmpObsFile, readGroup.getReadGroupId());
|
||||
qualityDiffVsCycle(DiffVsCycleFile, readGroup.getReadGroupId());
|
||||
qualityDiffVsDinucleotide(DiffVsDinucFile, readGroup.getReadGroupId());
|
||||
} catch (FileNotFoundException e){
|
||||
throw new RuntimeException("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT, e);
|
||||
} finally {
|
||||
if ( DiffVsCycleFile != null) DiffVsCycleFile.close();
|
||||
if ( DiffVsDinucFile != null) DiffVsDinucFile.close();
|
||||
if ( EmpObsFile != null) EmpObsFile.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class MeanReportedQuality {
|
||||
|
|
@ -249,114 +295,83 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
public void qualityDiffVsCycle() {
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
PrintStream ByCycleFile = null;
|
||||
try {
|
||||
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_cycle.csv");
|
||||
} catch (FileNotFoundException e){
|
||||
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||
System.exit(1);
|
||||
}
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByCycleFile.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int c=0; c < maxReadLen+1; c++) {
|
||||
ByCycle.add(new RecalData(c, -1,readGroup.getReadGroupId(),"-"));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < maxReadLen+1; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
ByCycleFile.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
public void qualityDiffVsCycle(PrintStream file, final String readGroup) {
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
file.printf("cycle,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup,"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
RecalDataManager manager = data.get(readGroup);
|
||||
int maxReadLen = manager.getMaxReadLen();
|
||||
for (int c=0; c < maxReadLen+1; c++) {
|
||||
ByCycle.add(new RecalData(c, -1,readGroup,"-"));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup) ) {
|
||||
ByCycle.get(datum.pos).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(datum.pos).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < maxReadLen+1; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
file.printf("%d, %f, %f, %f, %d, %d%n", c, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
//System.out.printf("Cycle: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||
//System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||
}
|
||||
|
||||
public void qualityDiffVsDinucleotide() {
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
PrintStream ByDinucFile = null;
|
||||
try {
|
||||
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".quality_difference_v_dinucleotide.csv");
|
||||
} catch (FileNotFoundException e){
|
||||
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||
System.exit(1);
|
||||
}
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByDinucFile.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||
ByCycle.add(new RecalData(-1, -1,readGroup.getReadGroupId(),RecalData.dinucIndex2bases(c)));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
|
||||
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
ByDinucFile.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) {
|
||||
ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup,"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||
ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c)));
|
||||
ByCycleReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup) ) {
|
||||
int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
|
||||
ByCycle.get(dinucIndex).inc(datum.N, datum.B);
|
||||
ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
}
|
||||
|
||||
for (int c=0; c < RecalData.NDINUCS; c++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
|
||||
double reportedQual = ByCycleReportedQ.get(c).result();
|
||||
file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
|
||||
}
|
||||
//System.out.printf("Dinuc: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||
//System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||
}
|
||||
|
||||
public void qualityEmpiricalObserved() {
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
PrintStream ByQualFile = null;
|
||||
try {
|
||||
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".RG_"+readGroup.getReadGroupId()+".empirical_v_reported_quality.csv");
|
||||
} catch (FileNotFoundException e){
|
||||
System.out.println("Could not open output files based on OUTPUT_FILEROOT option: " + OUTPUT_FILEROOT);
|
||||
System.exit(1);
|
||||
}
|
||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
ByQualFile.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup.getReadGroupId(),"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE+1; q++) {
|
||||
ByQ.add(new RecalData(-1,q,readGroup.getReadGroupId(),"-"));
|
||||
ByQReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ){
|
||||
ByQ.get(datum.qual).inc(datum.N, datum.B);
|
||||
ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
//out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
|
||||
}
|
||||
|
||||
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||
ByQualFile.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
|
||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
|
||||
}
|
||||
public void qualityEmpiricalObserved(PrintStream file, final String readGroup) {
|
||||
ArrayList<RecalData> ByQ = new ArrayList<RecalData>();
|
||||
ArrayList<MeanReportedQuality> ByQReportedQ = new ArrayList<MeanReportedQuality>();
|
||||
file.printf("Qrep,Qemp,Qrep_avg,B,N%n");
|
||||
RecalData All = new RecalData(0,0,readGroup,"");
|
||||
MeanReportedQuality AllReported = new MeanReportedQuality();
|
||||
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE+1; q++) {
|
||||
ByQ.add(new RecalData(-1,q,readGroup,"-"));
|
||||
ByQReportedQ.add(new MeanReportedQuality());
|
||||
}
|
||||
|
||||
for ( RecalData datum: getRecalData(readGroup) ){
|
||||
ByQ.get(datum.qual).inc(datum.N, datum.B);
|
||||
ByQReportedQ.get(datum.qual).inc(datum.qual, datum.N);
|
||||
All.inc(datum.N, datum.B);
|
||||
AllReported.inc(datum.qual, datum.N);
|
||||
//out.printf("%2d%6d%3d %2d %s%n", datum.qual, datum.N, datum.pos, datum.qual, datum.dinuc);
|
||||
}
|
||||
|
||||
for (int q=0; q<QualityUtils.MAX_QUAL_SCORE; q++) {
|
||||
double empiricalQual = -10 * Math.log10((double)ByQ.get(q).B / ByQ.get(q).N);
|
||||
file.printf("%3d, %2.2f, %2.2f, %12d, %12d%n", q, empiricalQual, ByQReportedQ.get(q).result(), ByQ.get(q).B, ByQ.get(q).N);
|
||||
//out.printf("%3d,%s,%3d,%5.1f,%5.1f,%6d,%6d", pos, dinuc, qual, empiricalQual, qual-empiricalQual, N, B); n
|
||||
}
|
||||
//System.out.printf("Emp-Obs: N=%d, B=%d, Qemp=%.1f, ", All.N, All.B, -10 * Math.log10((double)All.B/All.N));
|
||||
//System.out.printf("Qrep=%.1f%n", AllReported.result());
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -367,12 +382,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
Random random_genrator;
|
||||
// Print out data for regression
|
||||
public CovariateCounterWalker() throws FileNotFoundException {
|
||||
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||
}
|
||||
|
||||
/**
|
||||
* Check to see whether this read group should be processed.
|
||||
* @param readGroup
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -13,20 +14,28 @@ import java.util.*;
|
|||
*/
|
||||
public class RecalDataManager {
|
||||
ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
|
||||
RecalData[][][] data = null;
|
||||
boolean trackPos, trackDinuc;
|
||||
String readGroup;
|
||||
int nDinucs, maxReadLen;
|
||||
|
||||
//RecalData[][][] data = null;
|
||||
ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>> data = null;
|
||||
|
||||
public RecalDataManager(String readGroup,
|
||||
int maxReadLen, int maxQual, int nDinucs,
|
||||
//int maxReadLen, int maxQual, int nDinucs,
|
||||
boolean trackPos, boolean trackDinuc) {
|
||||
data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
|
||||
data = new ExpandingArrayList<ExpandingArrayList<ExpandingArrayList<RecalData>>>();
|
||||
//data = new RecalData[maxReadLen+1][maxQual+1][nDinucs];
|
||||
|
||||
this.readGroup = readGroup;
|
||||
this.trackPos = trackPos;
|
||||
this.trackDinuc = trackDinuc;
|
||||
this.maxReadLen = maxReadLen;
|
||||
this.nDinucs = nDinucs;
|
||||
//this.maxReadLen = maxReadLen;
|
||||
//this.nDinucs = nDinucs;
|
||||
}
|
||||
|
||||
public int getMaxReadLen() {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
public int getPosIndex(int pos) {
|
||||
|
|
@ -58,13 +67,11 @@ public class RecalDataManager {
|
|||
if ( getRecalData(datum.pos, datum.qual, datum.dinuc) != null )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s vs. %s", getRecalData(datum.pos, datum.qual, datum.dinuc), datum));
|
||||
|
||||
|
||||
int posIndex = getPosIndex(datum.pos);
|
||||
int internalDinucIndex = getDinucIndex(datum.dinuc);
|
||||
|
||||
if ( internalDinucIndex == -1 ) return;
|
||||
|
||||
data[posIndex][datum.qual][internalDinucIndex] = datum;
|
||||
set(posIndex, datum.qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
||||
|
|
@ -78,18 +85,61 @@ public class RecalDataManager {
|
|||
|
||||
if ( internalDinucIndex == -1 ) return null;
|
||||
|
||||
RecalData datum = data[posIndex][qual][internalDinucIndex];
|
||||
RecalData datum = get(posIndex, qual, internalDinucIndex);
|
||||
if ( datum == null && expandP ) {
|
||||
//System.out.printf("Allocating %s %d %d %d%n", readGroup, pos, qual, dinuc_index);
|
||||
datum = new RecalData(posIndex, qual, readGroup, trackDinuc ? dinuc : "**");
|
||||
data[posIndex][qual][internalDinucIndex] = datum;
|
||||
set(posIndex, qual, internalDinucIndex, datum);
|
||||
flattenData.add(datum);
|
||||
}
|
||||
|
||||
return datum;
|
||||
}
|
||||
|
||||
public List<RecalData> select(int pos, int qual, int dinuc_index ) {
|
||||
/**
|
||||
* Returns the RecalData associated with pos, qual, dinuc, or null if not is bound.
|
||||
* Does not expand the system to corporate a new data point
|
||||
* @param posIndex
|
||||
* @param qual
|
||||
* @param dinucIndex
|
||||
* @return
|
||||
*/
|
||||
private RecalData get(int posIndex, int qual, int dinucIndex) {
|
||||
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
|
||||
if (d2 == null) return null;
|
||||
ExpandingArrayList<RecalData> d3 = d2.get(qual);
|
||||
return d3 == null ? null : d3.get(dinucIndex);
|
||||
}
|
||||
|
||||
//private RecalData get(int posIndex, int qual, int dinucIndex) {
|
||||
// return data[posIndex][qual][dinucIndex];
|
||||
//}
|
||||
|
||||
private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
|
||||
// grow data if necessary
|
||||
ExpandingArrayList<ExpandingArrayList<RecalData>> d2 = data.get(posIndex);
|
||||
if (d2 == null) {
|
||||
d2 = new ExpandingArrayList<ExpandingArrayList<RecalData>>();
|
||||
data.set(posIndex, d2);
|
||||
}
|
||||
|
||||
// Expand d2 if necessary
|
||||
ExpandingArrayList<RecalData> d3 = d2.get(qual);
|
||||
if ( d3 == null ) {
|
||||
d3 = new ExpandingArrayList<RecalData>();
|
||||
d2.set(qual, d3);
|
||||
}
|
||||
|
||||
// set d3 to datum
|
||||
d3.set(dinucIndex, datum);
|
||||
}
|
||||
|
||||
//private void set(int posIndex, int qual, int dinucIndex, RecalData datum) {
|
||||
// data[posIndex][qual][dinucIndex] = datum;
|
||||
//}
|
||||
|
||||
|
||||
/* public List<RecalData> select(int pos, int qual, int dinuc_index ) {
|
||||
List<RecalData> l = new LinkedList<RecalData>();
|
||||
for ( int i = 0; i < data.length; i++ ) {
|
||||
if ( i == pos || pos == -1 || ! trackPos ) {
|
||||
|
|
@ -147,7 +197,7 @@ public class RecalDataManager {
|
|||
|
||||
System.out.printf("getDataByDinuc => %d%n", l.size());
|
||||
return l;
|
||||
}
|
||||
}*/
|
||||
|
||||
public List<RecalData> getAll() {
|
||||
return flattenData;
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
HashMap<String, RecalMapping> cache = new HashMap<String, RecalMapping>();
|
||||
|
||||
//@Argument(shortName="serial", doc="", required=false)
|
||||
boolean serialRecalibration = false;
|
||||
//boolean serialRecalibration = false;
|
||||
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern COLLAPSED_POS_PATTERN = Pattern.compile("^#\\s+collapsed_pos\\s+(\\w+)");
|
||||
|
|
@ -98,7 +98,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// initialize the data structure
|
||||
HashMap<String, RecalDataManager> managers = new HashMap<String, RecalDataManager>();
|
||||
for ( String readGroup : readGroups ) {
|
||||
RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
|
||||
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsedPos, ! collapsedDinuc);
|
||||
//RecalDataManager manager = new RecalDataManager(readGroup, maxPos, maxQReported, dinucs.size(), ! collapsedPos, ! collapsedDinuc);
|
||||
managers.put(readGroup, manager);
|
||||
}
|
||||
|
||||
|
|
@ -111,10 +112,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
for ( String readGroup : readGroups ) {
|
||||
RecalDataManager manager = managers.get(readGroup);
|
||||
RecalMapping mapper = null;
|
||||
if ( serialRecalibration )
|
||||
mapper = new SerialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
else
|
||||
mapper = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
//if ( serialRecalibration )
|
||||
// mapper = new SerialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
//else
|
||||
mapper = new CombinatorialRecalMapping(manager, dinucs, maxPos, maxQReported);
|
||||
cache.put(readGroup, mapper);
|
||||
}
|
||||
}
|
||||
|
|
@ -286,7 +287,7 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
return result;
|
||||
}
|
||||
}*/
|
||||
|
||||
/*
|
||||
class SerialRecalMapping implements RecalMapping {
|
||||
// mapping from dinuc x Q => new Q
|
||||
HashMap<String, byte[]> mappingByDinuc;
|
||||
|
|
@ -338,4 +339,4 @@ class SerialRecalMapping implements RecalMapping {
|
|||
|
||||
return newQual;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
||||
public class ExpandingArrayList<E> extends ArrayList<E> {
|
||||
public ExpandingArrayList() { super(); }
|
||||
public ExpandingArrayList(Collection<? extends E> c) { super(c); }
|
||||
public ExpandingArrayList(int initialCapacity) { super(initialCapacity); }
|
||||
|
||||
/**
|
||||
* Returns the element at the specified position in this list. If index > size,
|
||||
* returns null. Otherwise tries to access the array
|
||||
* @param index
|
||||
* @return
|
||||
* @throws IndexOutOfBoundsException in index < 0
|
||||
*/
|
||||
public E get(int index) throws IndexOutOfBoundsException {
|
||||
if ( index < size() )
|
||||
return super.get(index);
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
public E set(int index, E element) {
|
||||
if ( index >= size() ) {
|
||||
// We need to add null items until we can safely set index to element
|
||||
for ( int i = size(); i <= index; i++ )
|
||||
add(null);
|
||||
}
|
||||
|
||||
return super.set(index, element);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
|
||||
* Basic unit test for RecalData
|
||||
*/
|
||||
public class RecalDataTest extends BaseTest {
|
||||
RecalData datum, starDatum;
|
||||
|
||||
/**
|
||||
* Tests that we got a string parameter in correctly
|
||||
*/
|
||||
@Before
|
||||
public void before() {
|
||||
datum = new RecalData(2, 3, "0", "AA");
|
||||
datum.B = datum.N = 1;
|
||||
starDatum = new RecalData(2, 3, "0", "**");
|
||||
starDatum.B = starDatum.N = 1;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasic() {
|
||||
logger.warn("Executing testIsBetween");
|
||||
|
||||
Assert.assertTrue(datum.N == 1);
|
||||
Assert.assertTrue(datum.B == 1);
|
||||
Assert.assertTrue(datum.pos == 2);
|
||||
Assert.assertTrue(datum.qual == 3);
|
||||
Assert.assertTrue(datum.readGroup.equals("0"));
|
||||
Assert.assertTrue(datum.dinuc.equals("AA"));
|
||||
|
||||
Assert.assertTrue(starDatum.N == 1);
|
||||
Assert.assertTrue(starDatum.B == 1);
|
||||
Assert.assertTrue(starDatum.pos == 2);
|
||||
Assert.assertTrue(starDatum.qual == 3);
|
||||
Assert.assertTrue(starDatum.readGroup.equals("0"));
|
||||
Assert.assertTrue(starDatum.dinuc.equals("**"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInc() {
|
||||
logger.warn("Executing testInc");
|
||||
|
||||
datum.inc(1L, 0L);
|
||||
Assert.assertTrue(datum.N == 2);
|
||||
Assert.assertTrue(datum.B == 1);
|
||||
|
||||
datum.inc('A', 'A');
|
||||
Assert.assertTrue(datum.N == 3);
|
||||
Assert.assertTrue(datum.B == 1);
|
||||
|
||||
datum.inc('A', 'C');
|
||||
Assert.assertTrue(datum.N == 4);
|
||||
Assert.assertTrue(datum.B == 2);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testEmpQual() {
|
||||
logger.warn("Executing testEmpQual");
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 1;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 0.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 0);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 2;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 3.0103, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 3);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 3;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 4.771213, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 5);
|
||||
|
||||
datum.B = 1;
|
||||
datum.B = 2;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 1.760913, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 2);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 10;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 10.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 10);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 100;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 20.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 20);
|
||||
|
||||
datum.B = 1;
|
||||
datum.N = 1000;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), 30.0, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), 30);
|
||||
|
||||
datum.B = 0;
|
||||
datum.N = 1000;
|
||||
Assert.assertEquals(datum.empiricalQualDouble(), QualityUtils.MAX_REASONABLE_Q_SCORE, 1e-5);
|
||||
Assert.assertEquals(datum.empiricalQualByte(), QualityUtils.MAX_REASONABLE_Q_SCORE);
|
||||
}
|
||||
|
||||
|
||||
public void testtoCSVString() {
|
||||
logger.warn("Executing testtoCSVString");
|
||||
|
||||
Assert.assertEquals(datum.toCSVString(false), "0,AA,3,2,1,1,0");
|
||||
Assert.assertEquals(datum.toCSVString(true), "0,AA,3,*,1,1,0");
|
||||
}
|
||||
|
||||
|
||||
public void testFromCSVString() {
|
||||
logger.warn("Executing testFromCSVString");
|
||||
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,AA,3,2,1,1,0").toCSVString(false), datum.toCSVString(false));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,AA,3,*,1,1,0").toCSVString(false), datum.toCSVString(true));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,**,3,2,1,1,0").toCSVString(false), starDatum.toCSVString(false));
|
||||
Assert.assertEquals(RecalData.fromCSVString("0,**,3,*,1,1,0").toCSVString(false), starDatum.toCSVString(true));
|
||||
}
|
||||
|
||||
public void testDinucIndex() {
|
||||
logger.warn("Executing testDinucIndex");
|
||||
|
||||
HashSet<Integer> indices = new HashSet<Integer>();
|
||||
HashSet<Byte> unknownBytes = new HashSet<Byte>();
|
||||
byte bases[] = {'A', 'C', 'G', 'T', '*', 'N'};
|
||||
unknownBytes.add((byte)'*');
|
||||
unknownBytes.add((byte)'N');
|
||||
|
||||
for ( int i = 0; i < bases.length; i++ ) {
|
||||
for ( int j = 0; j < bases.length; j++ ) {
|
||||
byte[] bp = {bases[i], bases[j]};
|
||||
String s = new String(bp);
|
||||
int index = RecalData.dinucIndex(s);
|
||||
indices.add(index);
|
||||
Assert.assertEquals(index, RecalData.dinucIndex(bases[i], bases[j]));
|
||||
if ( index != -1 ) {
|
||||
Assert.assertEquals(RecalData.dinucIndex2bases(index), s);
|
||||
} else {
|
||||
Assert.assertTrue(unknownBytes.contains(bp[0]) || unknownBytes.contains(bp[1]) );
|
||||
}
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(indices.size(), 17);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalData;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Basic unit test for RecalData
|
||||
*/
|
||||
public class ExpandingArrayListTest extends BaseTest {
|
||||
ExpandingArrayList<Integer> empty, initCap10, hasOne, hasTen;
|
||||
|
||||
@Before
|
||||
public void before() {
|
||||
empty = new ExpandingArrayList<Integer>();
|
||||
|
||||
initCap10 = new ExpandingArrayList<Integer>(10);
|
||||
|
||||
hasOne = new ExpandingArrayList<Integer>();
|
||||
hasOne.add(1);
|
||||
|
||||
hasTen = new ExpandingArrayList<Integer>();
|
||||
hasTen.addAll(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicSizes() {
|
||||
logger.warn("Executing testBasicSizes");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
Assert.assertEquals(initCap10.size(), 0);
|
||||
Assert.assertEquals(hasOne.size(), 1);
|
||||
Assert.assertEquals(hasTen.size(), 10);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTenElements() {
|
||||
logger.warn("Executing testTenElements");
|
||||
|
||||
for ( int i = 0; i < 10; i++ ) {
|
||||
Assert.assertEquals((int)hasTen.get(i), i+1);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSettingTenElements() {
|
||||
logger.warn("Executing testSettingTenElements");
|
||||
|
||||
for ( int i = 0; i < 10; i++ ) {
|
||||
Assert.assertEquals((int)hasTen.set(i, 2*i), i+1);
|
||||
}
|
||||
|
||||
Assert.assertEquals(hasTen.size(), 10);
|
||||
for ( int i = 0; i < 10; i++ ) {
|
||||
Assert.assertEquals((int)hasTen.get(i), 2*i);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAdd() {
|
||||
logger.warn("Executing testAdd");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
empty.add(1);
|
||||
Assert.assertEquals(empty.size(), 1);
|
||||
Assert.assertEquals((int)empty.get(0), 1);
|
||||
empty.add(2);
|
||||
Assert.assertEquals(empty.size(), 2);
|
||||
Assert.assertEquals((int)empty.get(1), 2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSet1() {
|
||||
logger.warn("Executing testSet1");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
empty.set(0, 1);
|
||||
Assert.assertEquals(empty.size(), 1);
|
||||
Assert.assertEquals((int)empty.get(0), 1);
|
||||
|
||||
empty.set(1, 2);
|
||||
Assert.assertEquals(empty.size(), 2);
|
||||
Assert.assertEquals((int)empty.get(1), 2);
|
||||
|
||||
// doesn't expand
|
||||
empty.set(0, 3);
|
||||
Assert.assertEquals(empty.size(), 2);
|
||||
Assert.assertEquals((int)empty.get(0), 3);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetExpanding() {
|
||||
logger.warn("Executing testSetExpanding");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
empty.set(3, 1);
|
||||
Assert.assertEquals(empty.size(), 4);
|
||||
Assert.assertEquals(empty.get(0), null);
|
||||
Assert.assertEquals(empty.get(1), null);
|
||||
Assert.assertEquals(empty.get(2), null);
|
||||
Assert.assertEquals((int)empty.get(3), 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetExpandingReset() {
|
||||
logger.warn("Executing testSetExpandingReset");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
empty.set(3, 3);
|
||||
empty.set(2, 2);
|
||||
empty.set(1, 1);
|
||||
empty.set(0, 0);
|
||||
Assert.assertEquals(empty.size(), 4);
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
Assert.assertEquals((int)empty.get(i), i);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetExpandingBig() {
|
||||
logger.warn("Executing testSetExpandingBig");
|
||||
|
||||
Assert.assertEquals(empty.size(), 0);
|
||||
empty.set(1000, 1000);
|
||||
Assert.assertEquals(empty.size(), 1001);
|
||||
for ( int i = 0; i < 1000; i++ )
|
||||
Assert.assertEquals(empty.get(i), null);
|
||||
Assert.assertEquals((int)empty.get(1000), 1000);
|
||||
}
|
||||
|
||||
@Test (expected=IndexOutOfBoundsException.class )
|
||||
public void testSetBadGetNegative() {
|
||||
logger.warn("Executing testSetBadGetNegative");
|
||||
empty.get(-1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetBadGetPost() {
|
||||
logger.warn("Executing testSetBadGetPost");
|
||||
empty.set(1, 1);
|
||||
Assert.assertEquals(empty.get(0), null);
|
||||
Assert.assertEquals((int)empty.get(1), 1);
|
||||
Assert.assertEquals(empty.get(2), null);
|
||||
}
|
||||
}
|
||||
|
|
@ -16,12 +16,17 @@ def geli2dbsnpFile(geli):
|
|||
return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
|
||||
|
||||
def SingleSampleGenotyperCmd(bam, geli, use2bp):
|
||||
naid = bam.split(".")[0]
|
||||
naid = os.path.split(bam)[1].split(".")[0]
|
||||
metrics = geli + '.metrics'
|
||||
gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
|
||||
hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
|
||||
hapmapChipStr = ' '.join(['--hapmap_chip', hapmapChip])
|
||||
if not os.path.exists(hapmapChip):
|
||||
print '*** warning, no hapmap chip resuls for', naid
|
||||
hapmapChipStr = ''
|
||||
|
||||
targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
|
||||
cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO'])
|
||||
cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod', '-metout', metrics, '-varout', geli, '-geli -l INFO ']) + hapmapChipStr
|
||||
return cmd
|
||||
|
||||
def bams2geli(bams):
|
||||
|
|
@ -34,7 +39,7 @@ def bams2geli(bams):
|
|||
else:
|
||||
if not os.path.exists(geli):
|
||||
cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
|
||||
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
|
||||
return geli, jobid
|
||||
calls = map(call1, bams)
|
||||
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ output_root = './'
|
|||
|
||||
# Location of the resource files distributed with the recalibration tool.
|
||||
# If editing, please end this variable with a trailing slash.
|
||||
resources='resources/'
|
||||
resources='recalibratorResources/'
|
||||
#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'
|
||||
|
||||
import getopt,glob,os,sys
|
||||
|
|
@ -30,17 +30,17 @@ if not os.path.isabs(resources):
|
|||
resources = os.path.abspath(application_path) + '/' + resources
|
||||
|
||||
# Where does the reference live?
|
||||
reference_base = resources + 'human_b36_both'
|
||||
reference_base = resources + 'Homo_sapiens_assembly18'
|
||||
reference = reference_base + '.fasta'
|
||||
reference_dict = reference_base + '.dict'
|
||||
reference_fai = reference_base + '.fasta.fai'
|
||||
|
||||
# Where does DBSNP live?
|
||||
dbsnp = resources + 'dbsnp.1kg.rod.out'
|
||||
dbsnp = resources + 'dbsnp_129_hg18.rod'
|
||||
|
||||
# Where are the application files required to run the recalibration?
|
||||
gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||
#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
|
||||
#gatk = resources + 'gatk/GenomeAnalysisTK.jar'
|
||||
gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK_stable/trunk/dist/GenomeAnalysisTK.jar'
|
||||
|
||||
logistic_regression_script = resources + 'logistic_regression.R'
|
||||
empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'
|
||||
|
|
@ -146,7 +146,7 @@ def usage():
|
|||
|
||||
# Try to parse the given command-line arguments.
|
||||
try:
|
||||
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry'])
|
||||
opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry', 'outputRoot='])
|
||||
except getopt.GetoptError, err:
|
||||
usage()
|
||||
|
||||
|
|
@ -168,6 +168,8 @@ for opt,arg in opts:
|
|||
reprocessFiles = True
|
||||
if opt == '--dry':
|
||||
dryRun = True
|
||||
if opt == '--outputRoot':
|
||||
output_root = arg
|
||||
|
||||
# Default to 'recalibrate' unless the user specified only evaluate.
|
||||
do_recalibration = not (evaluate_requested and not recalibrate_requested)
|
||||
|
|
|
|||
|
|
@ -27,20 +27,17 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
|
|||
cmd_str += " \""+cmd_str_from_user + "\""
|
||||
|
||||
print ">>> Farming via "+cmd_str
|
||||
|
||||
#result = 'Job <2542666> is submitted to queue <broad>.'
|
||||
result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0]
|
||||
|
||||
p = re.compile('Job <(\d+)> is submitted to queue')
|
||||
jobid = p.match(result).group(1)
|
||||
|
||||
return jobid
|
||||
else:
|
||||
cmd_str = cmd_str_from_user
|
||||
print ">>> Executing "+cmd_str
|
||||
|
||||
if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands):
|
||||
return -1
|
||||
elif farm_queue:
|
||||
result = subprocess.Popen([cmd_str, ""], shell=True, stdout=subprocess.PIPE).communicate()[0]
|
||||
p = re.compile('Job <(\d+)> is submitted to queue')
|
||||
jobid = p.match(result).group(1)
|
||||
return jobid
|
||||
else:
|
||||
# Actually execute the command if we're not just in debugging output mode
|
||||
status = os.system(cmd_str)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,93 @@
|
|||
#
|
||||
# GATK configuration parser
|
||||
#
|
||||
import ConfigParser
|
||||
import os.path
|
||||
import sys
|
||||
|
||||
defaultRequiredOptions = {}
|
||||
def addRequiredOption(name, type):
|
||||
defaultRequiredOptions[name] = type
|
||||
|
||||
addRequiredOption('jar', 'input_file')
|
||||
addRequiredOption('reference', 'input_file')
|
||||
addRequiredOption('referenceIndex', 'input_file')
|
||||
addRequiredOption('referenceDict', 'input_file')
|
||||
addRequiredOption('java', 'file')
|
||||
addRequiredOption('jvm_args', str)
|
||||
addRequiredOption('args', str)
|
||||
addRequiredOption('tmp', 'output_file')
|
||||
|
||||
class gatkConfigParser(ConfigParser.SafeConfigParser):
|
||||
GATK = 'DEFAULT'
|
||||
|
||||
def __init__(self, configFiles):
|
||||
ConfigParser.SafeConfigParser.__init__(self)
|
||||
files = filter(None, configFiles)
|
||||
print 'Reading configuration file(s):', files
|
||||
print self.read(files)
|
||||
self.validateRequiredOptions()
|
||||
|
||||
def validateRequiredOptions(self):
|
||||
for key, value in defaultRequiredOptions.iteritems():
|
||||
self.validateOption(self.GATK, key, value)
|
||||
|
||||
def validateOption(self, section, name, type = str):
|
||||
v = self.getOption(section, name, type)
|
||||
print ' => Validated option', name, v
|
||||
|
||||
def getGATKOption(self, name, type = str):
|
||||
return self.getOption(self.GATK, name, type)
|
||||
|
||||
def getGATKModeOption(self, name, mode, type = str):
|
||||
return self.getOption(mode, name, type)
|
||||
|
||||
def getOption(self, section, name, typeF = None):
|
||||
if not self.has_option(section, name):
|
||||
raise "Option %s not found in section %s" % (name, section)
|
||||
else:
|
||||
val = self.get(section, name)
|
||||
if typeF == 'input_file' or typeF == 'output_file':
|
||||
path = os.path.abspath(os.path.expanduser(val))
|
||||
if typeF == 'input_file':
|
||||
if not os.path.exists(path):
|
||||
raise "Input file does not exist", path
|
||||
if not os.access(path, os.R_OK):
|
||||
raise "Input file cannot be read", path
|
||||
if typeF == 'output_file':
|
||||
if not os.access(path, os.W_OK):
|
||||
raise "Output file cannot be written", path
|
||||
return path
|
||||
elif type(typeF) == str:
|
||||
return str(val)
|
||||
elif typeF == None:
|
||||
return val
|
||||
else:
|
||||
return typeF(val)
|
||||
|
||||
def java(self): return self.getOption(self.GATK, 'java')
|
||||
def jvm_args(self): return self.getOption(self.GATK, 'jvm_args')
|
||||
def jar(self): return self.getOption(self.GATK, 'jar')
|
||||
def gatk_args(self): return self.getOption(self.GATK, 'args')
|
||||
def reference(self): return self.getOption(self.GATK, 'reference')
|
||||
|
||||
def gatkCmd(self, mode):
|
||||
cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()])
|
||||
cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)])
|
||||
return cmd
|
||||
|
||||
import unittest
|
||||
class TestMergeBAMsUtils(unittest.TestCase):
|
||||
def setUp(self):
|
||||
configFile = os.path.join(os.path.split(sys.argv[0])[0] + "/../testdata/defaultGATKConfig.cfg")
|
||||
self.config = gatkConfigParser(configFile)
|
||||
|
||||
def testValidate(self):
|
||||
self.config.validateRequiredOptions()
|
||||
|
||||
def testCmd(self):
|
||||
s = "java -ea -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:1-10,000,000 -R /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18.fasta -T CountCovariates --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1"
|
||||
self.assertEquals(self.config.gatkCmd('CountCovariates'), s)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
[DEFAULT]
|
||||
jar: ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar
|
||||
referenceRoot: /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18
|
||||
reference: %(referenceRoot)s.fasta
|
||||
referenceIndex: %(reference)s.fai
|
||||
referenceDict: %(referenceRoot)s.dict
|
||||
java: java
|
||||
jvm_args: -ea -Xmx2048m
|
||||
gatkData: /humgen/gsa-scr1/GATK_Data
|
||||
dbsnp: %(gatkData)s/dbsnp_129_hg18.rod
|
||||
tmp: /tmp/
|
||||
args: -l INFO
|
||||
#args: -l INFO -L chr1:1-1,000,000
|
||||
|
||||
[CountCovariates]
|
||||
args: --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s
|
||||
|
||||
[TableRecalibration]
|
||||
args: -compress 1
|
||||
|
||||
[R]
|
||||
Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript
|
||||
PlotQEmpStated: ~andrewk/recalQual_resources/plot_q_emp_stated_hst.R
|
||||
Loading…
Reference in New Issue