More improvements to the duplicate quality combiner, making progress towards a clean system
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@788 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
04e51c8d1d
commit
30c63daf89
|
|
@ -176,12 +176,12 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
// Send each unique read to the map function
|
||||
for ( SAMRecord unique : uniqueReads ) {
|
||||
List<SAMRecord> l = Arrays.asList(unique);
|
||||
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
|
||||
sum = mapOne(dupWalker, uniqueReads, l, site, refBases, locus, sum);
|
||||
}
|
||||
}
|
||||
|
||||
if ( duplicateReads.size() > 0 )
|
||||
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
|
||||
sum = mapOne(dupWalker, uniqueReads, duplicateReads, site, refBases, locus, sum);
|
||||
|
||||
printProgress("dups", site);
|
||||
|
||||
|
|
@ -231,14 +231,15 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
}
|
||||
|
||||
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
|
||||
List<SAMRecord> readSet,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads,
|
||||
GenomeLoc site,
|
||||
byte[] refBases,
|
||||
LocusContext locus,
|
||||
T sum) {
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet);
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, uniqueReads, duplicateReads);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, refBases, locus, readSet);
|
||||
M x = dupWalker.map(site, refBases, locus, uniqueReads, duplicateReads);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
}
|
||||
return sum;
|
||||
|
|
|
|||
|
|
@ -17,20 +17,23 @@ import net.sf.samtools.SAMRecord;
|
|||
@Requires({DataSource.READS,DataSource.REFERENCE})
|
||||
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
return true; // We are keeping all the reads
|
||||
}
|
||||
|
||||
/**
|
||||
* Called by the traversal engine to decide whether to send non-duplicates as lists of
|
||||
* singleton reads to the map function. By default it's false.
|
||||
*
|
||||
*
|
||||
* @return true if you want to see non duplicates during the traversal
|
||||
*/
|
||||
public boolean mapUniqueReadsTooP() { return false; }
|
||||
|
||||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
|
||||
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads);
|
||||
|
||||
// Given result of map function
|
||||
public abstract ReduceType reduceInit();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,133 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import net.sf.samtools.*;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Random;
|
||||
import java.util.HashMap;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the
|
||||
* read to its home on the reference and noting the mismatch rate. It ignores reads with
|
||||
* indels in them, treats high and low-quality references bases the same, and does not count
|
||||
* ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads
|
||||
* in short order.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class ReplaceQuals extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(shortName="inputQualsBAM",doc="BAM files containing qualities to be replaced",required=true)
|
||||
public String inputQualsBAM;
|
||||
|
||||
@Argument(shortName="outputBAM", required=false, doc="output BAM file for reads with replaced quals")
|
||||
public String outputFilename = null;
|
||||
|
||||
public int MAX_READS_TO_LOAD = -1;
|
||||
|
||||
private HashMap<String, Pair<SAMRecord, SAMRecord>> readNameToPairs;
|
||||
private int READ_PRINT_MOD = 100000;
|
||||
private final boolean DEBUG = false;
|
||||
|
||||
public void initialize() {
|
||||
readNameToPairs = new HashMap<String, Pair<SAMRecord, SAMRecord>>();
|
||||
|
||||
SAMFileReader samReader = new SAMFileReader(new File(inputQualsBAM));
|
||||
samReader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||
|
||||
int nReads = 0;
|
||||
logger.info("Starting to read inputQualsBAM = " + inputQualsBAM);
|
||||
for ( SAMRecord read : samReader ) {
|
||||
//System.out.printf("READ is %s%n", read.format());
|
||||
|
||||
final String name = read.getReadName();
|
||||
Pair<SAMRecord, SAMRecord> binding = readNameToPairs.containsKey(name) ? readNameToPairs.get(name) : new Pair<SAMRecord, SAMRecord>(null, null);
|
||||
if ( read.getFirstOfPairFlag() ) {
|
||||
binding.first = read;
|
||||
} else {
|
||||
binding.second = read;
|
||||
}
|
||||
readNameToPairs.put(name, binding);
|
||||
|
||||
if ( ++nReads % READ_PRINT_MOD == 0 ) {
|
||||
logger.info(String.format(" Read %d reads so far...", nReads));
|
||||
}
|
||||
|
||||
if ( nReads > MAX_READS_TO_LOAD && MAX_READS_TO_LOAD != -1 )
|
||||
break;
|
||||
}
|
||||
|
||||
logger.info("Done reading input BAM");
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public SAMRecord map(char[] ref, SAMRecord read) {
|
||||
final String name = read.getReadName();
|
||||
|
||||
if ( readNameToPairs.containsKey(name) ) {
|
||||
Pair<SAMRecord, SAMRecord> binding = readNameToPairs.get(name);
|
||||
SAMRecord qRead = read.getFirstOfPairFlag() ? binding.first : binding.second;
|
||||
if (qRead != null) {
|
||||
|
||||
if ( DEBUG ) {
|
||||
System.out.printf("Replacing read %s quals with %s%n", read.getReadName(), qRead.getReadName());
|
||||
System.out.printf("%s%n", read.getReadName());
|
||||
System.out.printf("%s%n", qRead.getReadName());
|
||||
System.out.printf("%s%n", read.getReadString());
|
||||
System.out.printf("%s%n", qRead.getReadString());
|
||||
System.out.printf("%s%n", read.getBaseQualityString());
|
||||
System.out.printf("%s%n", qRead.getBaseQualityString());
|
||||
|
||||
//if (! read.getReadString().equals(qRead.getReadString()))
|
||||
// throw new RuntimeException(String.format("BUG: equating %s and %s but bases are different", read.getReadName(), qRead.getReadName()));
|
||||
}
|
||||
|
||||
read.setBaseQualities(qRead.getBaseQualities());
|
||||
}
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// Standard i/o reduce
|
||||
//
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
if ( outputFilename != null ) { // ! outputBamFile.equals("") ) {
|
||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
|
||||
return fact.makeBAMWriter(header, true, new File(outputFilename));
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,9 +48,9 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
|
|||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
if ( outputFilename != null ) { // ! outputBamFile.equals("") ) {
|
||||
if ( outputFilename != null ) {
|
||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
||||
SAMFileHeader header = this.getToolkit().getEngine().getSAMHeader();
|
||||
return fact.makeBAMWriter(header, true, new File(outputFilename));
|
||||
}
|
||||
else {
|
||||
|
|
@ -83,8 +83,11 @@ public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileW
|
|||
* @param duplicateReads
|
||||
* @return
|
||||
*/
|
||||
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
//logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size()));
|
||||
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
|
||||
|
||||
SAMRecord combinedRead = null;
|
||||
|
||||
if ( duplicateReads.size() == 1 && ! duplicateReads.get(0).getDuplicateReadFlag() ) {
|
||||
|
|
@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
|
@ -58,51 +59,63 @@ class QualityTracker {
|
|||
}
|
||||
}
|
||||
|
||||
public void inc(int b1Q, int b2Q, boolean mismatchP) {
|
||||
public void inc(int b1Qi, int b2Qi, boolean mismatchP, boolean orderDependent) {
|
||||
int b1Q = orderDependent ? b1Qi : Math.max(b1Qi, b2Qi);
|
||||
int b2Q = orderDependent ? b2Qi : Math.min(b1Qi, b2Qi);
|
||||
|
||||
if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q);
|
||||
if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q);
|
||||
|
||||
mismatchesByQ[b1Q][b2Q].inc(mismatchP);
|
||||
}
|
||||
|
||||
public void inc(DuplicateComp dc) {
|
||||
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
|
||||
public void inc(DuplicateComp dc, boolean orderDependent) {
|
||||
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP(), orderDependent);
|
||||
}
|
||||
|
||||
public int probMismatchQ1Q2(int q1, int q2) {
|
||||
double e1 = 1 - QualityUtils.qualToProb(q1);
|
||||
double e2 = 1 - QualityUtils.qualToProb(q2);
|
||||
double eMM = e1 * (1 - e2) + (1 - e1) * e2 - 1/3 * e1 * e2;
|
||||
return QualityUtils.probToQual(1 - eMM, 0.0);
|
||||
}
|
||||
|
||||
public void printToStream(PrintStream out, boolean filterUnobserved) {
|
||||
out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString());
|
||||
out.printf("Q1\tQ2\tQmin\t%s%n", mismatchesByQ[0][0].headerString());
|
||||
for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) {
|
||||
for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) {
|
||||
MismatchCounter mc = mismatchesByQ[i][j];
|
||||
//System.out.printf("MC = %s%n", mc);
|
||||
if ( filterUnobserved && mc.nObs == 0 )
|
||||
continue;
|
||||
out.printf("%d\t%d\t%s\t%n", i, j, mc.toString());
|
||||
out.printf("%d\t%d\t%d\t%s\t%n", i, j, probMismatchQ1Q2(i,j), mc.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, QualityTracker> {
|
||||
@Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
|
||||
@Argument(fullName="filterUnobservedQuals", required=false, doc="Show only quality bins with at least one observation in the data")
|
||||
public boolean FILTER_UNOBSERVED_QUALS = false;
|
||||
|
||||
@Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
|
||||
@Argument(fullName="maxPairwiseCompsPerDupSet", required=false, doc="Maximumize number of pairwise comparisons to perform among duplicate read sets")
|
||||
public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET = 100;
|
||||
|
||||
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
|
||||
@Argument(fullName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_QUALS = false;
|
||||
|
||||
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
|
||||
@Argument(fullName="combineAllDups", required=false, doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_ALL_DUPS = false;
|
||||
|
||||
@Argument(fullName="orderDependent", required=false, doc="")
|
||||
public boolean orderDependent = false;
|
||||
|
||||
@Argument(fullName="compareToUniqueReads", required=false, doc="If true, then we will compare only to unique (i.e., non-duplicated molecules) at the same duplicate site")
|
||||
public boolean compareToUniqueReads = false;
|
||||
|
||||
@Argument(fullName="comparePairToSingleton", required=false, doc="If true, then we will compare a combined dup to a random other read in the duplicate set, not a combined pair itself")
|
||||
public boolean comparePairToSingleton = false;
|
||||
|
||||
final boolean DEBUG = false;
|
||||
final private boolean ACTUALLY_DO_WORK = true;
|
||||
|
||||
|
|
@ -116,15 +129,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
|
||||
public QualityTracker reduce(List<DuplicateComp> dupComps, QualityTracker tracker) {
|
||||
for ( DuplicateComp dc : dupComps ) {
|
||||
tracker.inc(dc);
|
||||
tracker.inc(dc, orderDependent);
|
||||
}
|
||||
|
||||
return tracker;
|
||||
}
|
||||
|
||||
// Print out data for regression
|
||||
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
//out.printf("%s has %d duplicates%n", loc, duplicateReads.size());
|
||||
public List<DuplicateComp> map(GenomeLoc loc, byte[] refBases, LocusContext context,
|
||||
List<SAMRecord> uniqueReads,
|
||||
List<SAMRecord> duplicateReads) {
|
||||
//logger.info(String.format("%s has %d duplicates and %d non-duplicates", loc, duplicateReads.size(), uniqueReads.size()));
|
||||
List<DuplicateComp> pairwiseComps = new ArrayList<DuplicateComp>();
|
||||
|
||||
if ( ! ACTUALLY_DO_WORK )
|
||||
|
|
@ -135,15 +150,21 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
if ( combinedReads != null ) {
|
||||
SAMRecord combined1 = combinedReads.first;
|
||||
SAMRecord combined2 = combinedReads.second;
|
||||
addPairwiseMatches( pairwiseComps, combined1, combined2 );
|
||||
|
||||
if ( comparePairToSingleton )
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, duplicateReads.get(2), uniqueReads );
|
||||
else
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, combined1, combined2, uniqueReads );
|
||||
}
|
||||
} else {
|
||||
int nComparisons = 0;
|
||||
for ( SAMRecord read1 : duplicateReads ) {
|
||||
for ( SAMRecord read2 : duplicateReads ) {
|
||||
if ( DupUtils.usableDuplicate(read1, read2) ) {
|
||||
if ( read1.hashCode() < read2.hashCode() && DupUtils.usableDuplicate(read1, read2) ) {
|
||||
// the hashcode insures we don't do A vs. B and B vs. A
|
||||
//System.out.printf("Comparing %s against %s%n", read1, read2);
|
||||
nComparisons++;
|
||||
addPairwiseMatches( pairwiseComps, read1, read2 );
|
||||
pairwiseComps = addPairwiseMatches( pairwiseComps, read1, read2, uniqueReads );
|
||||
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
break;
|
||||
}
|
||||
|
|
@ -155,7 +176,32 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
}
|
||||
|
||||
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
|
||||
SAMRecord read1, SAMRecord read2 ) {
|
||||
SAMRecord read1, SAMRecord read2,
|
||||
List<SAMRecord> uniqueReads ) {
|
||||
if ( compareToUniqueReads ) {
|
||||
// we want to compare to a read in the unique read set
|
||||
if ( uniqueReads.size() > 0 ) { // there's actually something to compare to
|
||||
SAMRecord uniqueRead = uniqueReads.get(0); // might as well get the first one
|
||||
return pairwiseMatches(comps, read1, uniqueRead);
|
||||
} else {
|
||||
return comps;
|
||||
}
|
||||
} else {
|
||||
// default, just do read1 vs. read2
|
||||
return pairwiseMatches(comps, read1, read2);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the pairwise mismatches between reads read1 and read2 and adds the result to the comps list.
|
||||
* Doesn't contain any logic deciding what to compare, just does read1 and read2
|
||||
*
|
||||
* @param comps
|
||||
* @param read1
|
||||
* @param read2
|
||||
* @return
|
||||
*/
|
||||
private List<DuplicateComp> pairwiseMatches(List<DuplicateComp> comps, SAMRecord read1, SAMRecord read2 ) {
|
||||
byte[] read1Bases = read1.getReadBases();
|
||||
byte[] read1Quals = read1.getBaseQualities();
|
||||
byte[] read2Bases = read2.getReadBases();
|
||||
|
|
@ -164,7 +210,7 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
for ( int i = 0; i < read1Bases.length; i++) {
|
||||
byte qual1 = read1Quals[i];
|
||||
byte qual2 = read2Quals[i];
|
||||
boolean mismatchP = read1Bases[i] != read2Bases[i];
|
||||
boolean mismatchP = ! BaseUtils.basesAreEqual(read1Bases[i], read2Bases[i]);
|
||||
DuplicateComp dc = new DuplicateComp(qual1, qual2, mismatchP);
|
||||
comps.add(dc);
|
||||
}
|
||||
Loading…
Reference in New Issue