diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java index b5038b10b..eb2f1d81a 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CombineDuplicatesWalker.java @@ -3,15 +3,10 @@ package org.broadinstitute.sting.playground.gatk.walkers; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.duplicates.DuplicateComp; import org.broadinstitute.sting.utils.duplicates.DupUtils; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.List; -import java.util.ArrayList; -import java.io.PrintStream; import java.io.File; import net.sf.samtools.SAMRecord; @@ -53,7 +48,7 @@ public class CombineDuplicatesWalker extends DuplicateWalker { @Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression") public boolean CREATE_TRAINING_DATA = false; + @Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample") + public float DOWNSAMPLE_FRACTION=1.0f; + + @Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score") + public int MIN_MAPPING_QUALITY = 0; + + @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)") + public String READ_GROUP = "none"; + int NDINUCS = 16; RecalData[][][] data = new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]; //RecalData[][][] data = new RecalData; @@ -98,23 +108,27 @@ public class CovariateCounterWalker extends LocusWalker { List reads = context.getReads(); List offsets = context.getOffsets(); for (int i =0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - int numBases = read.getReadLength(); - if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count - char base = (char)read.getReadBases()[offset]; - int qual = (int)read.getBaseQualities()[offset]; - //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); - // previous base is the next base in terms of machine chemistry if this is a negative strand - char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; - int dinuc_index = bases2dinucIndex(prevBase,base); - if (qual > 0 && qual <= MAX_QUAL_SCORE) { - // Convert offset into cycle position which means reversing the position of reads on the negative strand - int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; - data[cycle][qual][dinuc_index].inc(base,ref); - if (CREATE_TRAINING_DATA) - dinuc_outs.get(dinuc_index).format("%d,%d,%d\n", qual, cycle, nuc2num[base]==nuc2num[ref] ? 0 : 1); + + + if ((READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && + (read.getMappingQuality() >= MIN_MAPPING_QUALITY) && + (DOWNSAMPLE_FRACTION == 1.0 || random_genrator.nextFloat() < DOWNSAMPLE_FRACTION)) { + //(random_genrator.nextFloat() <= DOWNSAMPLE_FRACTION) + int offset = offsets.get(i); + int numBases = read.getReadLength(); + if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count + char base = (char)read.getReadBases()[offset]; + int qual = (int)read.getBaseQualities()[offset]; + //out.printf("%d %d %d\n", offset, qual, bases2dinucIndex(prevBase,base)); + // previous base is the next base in terms of machine chemistry if this is a negative strand + char prevBase = (char)read.getReadBases()[offset + (read.getReadNegativeStrandFlag() ? 1 : -1)]; + int dinuc_index = bases2dinucIndex(prevBase,base); + if (qual > 0 && qual <= MAX_QUAL_SCORE) { + // Convert offset into cycle position which means reversing the position of reads on the negative strand + int cycle = read.getReadNegativeStrandFlag() ? numBases - offset - 1 : offset; + data[cycle][qual][dinuc_index].inc(base,ref); + } } } } @@ -134,12 +148,32 @@ public class CovariateCounterWalker extends LocusWalker { qualityDiffVsDinucleotide(); // Close dinuc filehandles - if (CREATE_TRAINING_DATA) - for ( PrintStream dinuc_stream : this.dinuc_outs) - dinuc_stream.close(); + if (CREATE_TRAINING_DATA) writeTrainingData(); } + void writeTrainingData() { + for ( int i=0; i 0) + dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.N - datum.B); + if (datum.B > 0) + dinuc_out.format("%d,%d,%d,%d\n", datum.qual, datum.pos, 0, datum.B); + } + } catch (FileNotFoundException e) { + System.err.println("FileNotFoundException: " + e.getMessage()); + } finally { + if (dinuc_out != null) + dinuc_out.close(); + } + } + } + class MeanReportedQuality { double Qn = 0; long n = 0; @@ -249,17 +283,12 @@ public class CovariateCounterWalker extends LocusWalker { num2nuc[2] = 'G'; num2nuc[3] = 'T'; } - + Random random_genrator; // Print out data for regression public CovariateCounterWalker() throws FileNotFoundException { - if (CREATE_TRAINING_DATA) - for ( int i=0; i regressors = new HashMap(); private static Logger logger = Logger.getLogger(LogisticRecalibrationWalker.class); @@ -140,10 +136,10 @@ public class LogisticRecalibrationWalker extends ReadWalker