Update training data creation in CovariateCounterWalker to output much smaller files by counting the number of occurences of each data point combination rather than outputting a line for each data point (i.e. each base). Also fixed bug in LogisticRecalibrationWalker where a null SAMHeader was being pulled from a function that is now marked deprecated.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@673 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4c12df372c
commit
1518f8f9bf
|
|
@ -3,15 +3,10 @@ package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
|
||||||
import org.broadinstitute.sting.utils.Pair;
|
|
||||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
|
||||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
@ -53,7 +48,7 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
|
||||||
}
|
}
|
||||||
|
|
||||||
public SAMFileWriter reduceInit() {
|
public SAMFileWriter reduceInit() {
|
||||||
if ( outputFilename != null ) { // ! outputFile.equals("") ) {
|
if ( outputFilename != null ) { // ! outputBamFile.equals("") ) {
|
||||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||||
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
||||||
return fact.makeBAMWriter(header, true, new File(outputFilename));
|
return fact.makeBAMWriter(header, true, new File(outputFilename));
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Random;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
|
|
@ -27,6 +28,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
|
||||||
public boolean CREATE_TRAINING_DATA = false;
|
public boolean CREATE_TRAINING_DATA = false;
|
||||||
|
|
||||||
|
@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
|
||||||
|
public float DOWNSAMPLE_FRACTION=1.0f;
|
||||||
|
|
||||||
|
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||||
|
public int MIN_MAPPING_QUALITY = 0;
|
||||||
|
|
||||||
|
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
|
||||||
|
public String READ_GROUP = "none";
|
||||||
|
|
||||||
int NDINUCS = 16;
|
int NDINUCS = 16;
|
||||||
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS];
|
||||||
//RecalData[][][] data = new RecalData;
|
//RecalData[][][] data = new RecalData;
|
||||||
|
|
@ -98,23 +108,27 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
List<SAMRecord> reads = context.getReads();
|
List<SAMRecord> reads = context.getReads();
|
||||||
List<Integer> offsets = context.getOffsets();
|
List<Integer> offsets = context.getOffsets();
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
|
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
int offset = offsets.get(i);
|
|
||||||
int numBases = read.getReadLength();
|
|
||||||
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||||
char base = (char)read.getReadBases()[offset];
|
(read.getMappingQuality() >= MIN_MAPPING_QUALITY) &&
|
||||||
int qual = (int)read.getBaseQualities()[offset];
|
(DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) {
|
||||||
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
//(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION)
|
||||||
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
int offset = offsets.get(i);
|
||||||
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
int numBases = read.getReadLength();
|
||||||
int dinuc_index = bases2dinucIndex(prevBase,base);
|
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||||
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
char base = (char)read.getReadBases()[offset];
|
||||||
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
int qual = (int)read.getBaseQualities()[offset];
|
||||||
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
//out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base));
|
||||||
data[cycle][qual][dinuc_index].inc(base,ref);
|
// previous base is the next base in terms of machine chemistry if this is a negative strand
|
||||||
if (CREATE_TRAINING_DATA)
|
char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)];
|
||||||
dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1);
|
int dinuc_index = bases2dinucIndex(prevBase,base);
|
||||||
|
if (qual > 0 && qual <= MAX_QUAL_SCORE) {
|
||||||
|
// Convert offset into cycle position which means reversing the position of reads on the negative strand
|
||||||
|
int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset;
|
||||||
|
data[cycle][qual][dinuc_index].inc(base,ref);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -134,12 +148,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
qualityDiffVsDinucleotide();
|
qualityDiffVsDinucleotide();
|
||||||
|
|
||||||
// Close dinuc filehandles
|
// Close dinuc filehandles
|
||||||
if (CREATE_TRAINING_DATA)
|
if (CREATE_TRAINING_DATA) writeTrainingData();
|
||||||
for ( PrintStream dinuc_stream : this.dinuc_outs)
|
|
||||||
dinuc_stream.close();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void writeTrainingData() {
|
||||||
|
for ( int i=0; i<NDINUCS; i++) {
|
||||||
|
PrintStream dinuc_out = null;
|
||||||
|
try {
|
||||||
|
dinuc_out = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
||||||
|
dinuc_out.println("logitQ,pos,indicator,count");
|
||||||
|
|
||||||
|
for ( RecalData datum: flattenData ) {
|
||||||
|
if ((datum.N - datum.B) > 0)
|
||||||
|
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||||
|
if (datum.B > 0)
|
||||||
|
dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.B);
|
||||||
|
}
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||||
|
} finally {
|
||||||
|
if (dinuc_out != null)
|
||||||
|
dinuc_out.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class MeanReportedQuality {
|
class MeanReportedQuality {
|
||||||
double Qn = 0;
|
double Qn = 0;
|
||||||
long n = 0;
|
long n = 0;
|
||||||
|
|
@ -249,17 +283,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
num2nuc[2] = 'G';
|
num2nuc[2] = 'G';
|
||||||
num2nuc[3] = 'T';
|
num2nuc[3] = 'T';
|
||||||
}
|
}
|
||||||
|
Random random_genrator;
|
||||||
// Print out data for regression
|
// Print out data for regression
|
||||||
public CovariateCounterWalker() throws FileNotFoundException {
|
public CovariateCounterWalker() throws FileNotFoundException {
|
||||||
if (CREATE_TRAINING_DATA)
|
|
||||||
for ( int i=0; i<NDINUCS; i++) {
|
|
||||||
PrintStream next_dinuc = new PrintStream( dinuc_root+"."+dinucIndex2bases(i)+".csv");
|
|
||||||
next_dinuc.println("logitQ,pos,indicator");
|
|
||||||
dinuc_outs.add(next_dinuc);
|
|
||||||
}
|
|
||||||
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
ByQualFile = new PrintStream(OUTPUT_FILEROOT+".empirical_v_reported_quality.csv");
|
||||||
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
ByCycleFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_cycle.csv");
|
||||||
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
ByDinucFile = new PrintStream(OUTPUT_FILEROOT+".quality_difference_v_dinucleotide.csv");
|
||||||
|
random_genrator = new Random(123454321); // keep same random seed while debugging
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1,10 +1,6 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers;
|
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
@ -21,7 +17,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
public String logisticParamsFile;
|
public String logisticParamsFile;
|
||||||
|
|
||||||
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
|
@Argument(shortName="outputBAM", doc="output BAM file", required=false)
|
||||||
public String outputFile = "";
|
public String outputBamFile = "";
|
||||||
|
|
||||||
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
|
HashMap<String, LogisticRegressor> regressors = new HashMap<String, LogisticRegressor>();
|
||||||
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
|
private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class);
|
||||||
|
|
@ -140,10 +136,10 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
|
||||||
}
|
}
|
||||||
|
|
||||||
public SAMFileWriter reduceInit() {
|
public SAMFileWriter reduceInit() {
|
||||||
if ( outputFile != null ) { // ! outputFile.equals("") ) {
|
if ( outputBamFile != null ) { // ! outputBamFile.equals("") ) {
|
||||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||||
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
|
||||||
return fact.makeBAMWriter(header, true, new File(outputFile));
|
return fact.makeBAMWriter(header, true, new File(outputBamFile));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return null;
|
return null;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue