System for traversing duplicate reads, along with a walker to compute quality scores among duplicates and a smarter method to combine quality scores across duplicates -- v1
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@624 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
71e8f47a6c
commit
2204be43eb
|
|
@ -15,6 +15,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
|
|||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -89,8 +90,9 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
return l;
|
||||
}
|
||||
|
||||
private List<SAMRecord> collectDuplicates(List<SAMRecord> reads) {
|
||||
ArrayList<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||
private Pair<List<SAMRecord>, List<SAMRecord>> splitDuplicates(List<SAMRecord> reads) {
|
||||
List<SAMRecord> uniques = new ArrayList<SAMRecord>();
|
||||
List<SAMRecord> dups = new ArrayList<SAMRecord>();
|
||||
|
||||
// find the first duplicate
|
||||
SAMRecord key = null;
|
||||
|
|
@ -104,7 +106,7 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
}
|
||||
|
||||
// At this point, there are two possibilities, we have found at least one dup or not
|
||||
//System.out.printf("Key is %s%n", key);
|
||||
// if it's a dup, add it to the dups list, otherwise add it to the uniques list
|
||||
if ( key != null ) {
|
||||
final GenomeLoc keyLoc = new GenomeLoc(key);
|
||||
final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart());
|
||||
|
|
@ -117,11 +119,15 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
// we are at the same position as the dup and have the same mat pos, it's a dup
|
||||
if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read));
|
||||
dups.add(read);
|
||||
} else {
|
||||
uniques.add(read);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uniques = reads;
|
||||
}
|
||||
|
||||
return dups;
|
||||
return new Pair<List<SAMRecord>, List<SAMRecord>>(uniques, dups);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -146,9 +152,13 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
for (SAMRecord read: iter) {
|
||||
// get the genome loc from the read
|
||||
GenomeLoc site = new GenomeLoc(read);
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site));
|
||||
List<SAMRecord> reads = readsAtLoc(read, iter);
|
||||
List<SAMRecord> duplicateReads = collectDuplicates(reads);
|
||||
Pair<List<SAMRecord>, List<SAMRecord>> split = splitDuplicates(reads);
|
||||
List<SAMRecord> uniqueReads = split.getFirst();
|
||||
List<SAMRecord> duplicateReads = split.getSecond();
|
||||
|
||||
logger.debug(String.format("*** TraverseDuplicates.traverse at %s has %d unique and %d duplicate reads",
|
||||
site, uniqueReads.size(), duplicateReads.size()));
|
||||
|
||||
// Jump forward in the reference to this locus location
|
||||
LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0));
|
||||
|
|
@ -161,12 +171,17 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
|
||||
byte[] refBases = new byte[0];
|
||||
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, refBases, locus, duplicateReads);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
if ( dupWalker.mapUniqueReadsTooP() ) {
|
||||
// Send each unique read to the map function
|
||||
for ( SAMRecord unique : uniqueReads ) {
|
||||
List<SAMRecord> l = Arrays.asList(unique);
|
||||
sum = mapOne(dupWalker, l, site, refBases, locus, sum);
|
||||
}
|
||||
}
|
||||
|
||||
if ( duplicateReads.size() > 0 )
|
||||
sum = mapOne(dupWalker, duplicateReads, site, refBases, locus, sum);
|
||||
|
||||
printProgress("dups", site);
|
||||
|
||||
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
|
||||
|
|
@ -213,7 +228,20 @@ public class TraverseDuplicates extends TraversalEngine {
|
|||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public <M, T> T mapOne(DuplicateWalker<M, T> dupWalker,
|
||||
List<SAMRecord> readSet,
|
||||
GenomeLoc site,
|
||||
byte[] refBases,
|
||||
LocusContext locus,
|
||||
T sum) {
|
||||
final boolean keepMeP = dupWalker.filter(site, refBases, locus, readSet);
|
||||
if (keepMeP) {
|
||||
M x = dupWalker.map(site, refBases, locus, readSet);
|
||||
sum = dupWalker.reduce(x, sum);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
|
|
@ -29,6 +29,15 @@ public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapTyp
|
|||
public boolean requiresReads() { return true; }
|
||||
public boolean cannotHandleReads() { return false; }
|
||||
|
||||
|
||||
/**
|
||||
* Called by the traversal engine to decide whether to send non-duplicates as lists of
|
||||
* singleton reads to the map function. By default it's false.
|
||||
*
|
||||
* @return true if you want to see non duplicates during the traversal
|
||||
*/
|
||||
public boolean mapUniqueReadsTooP() { return false; }
|
||||
|
||||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||
public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 22, 2009
|
||||
* Time: 2:52:28 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class CombineDuplicatesWalker extends DuplicateWalker<SAMRecord, SAMFileWriter> {
|
||||
@Argument(fullName="outputBAM", shortName="outputBAM", required=false, defaultValue="", doc="BAM File to write combined duplicates to")
|
||||
public String outputFilename;
|
||||
|
||||
@Argument(fullName="includeUniqueReads", shortName="includeUniqueReads", required=false, defaultValue="true", doc="If true, also writes out non-duplicate reads in file")
|
||||
public boolean INCLUDE_UNIQUE_READS;
|
||||
|
||||
@Argument(fullName="maxQ", shortName="maxQ", required=false, defaultValue="50",
|
||||
doc="The maximum Q score allowed for combined reads, reflects the background error rate giving rise to perfect bases that don't correspond to the reference")
|
||||
public int MAX_QUALITY_SCORE;
|
||||
|
||||
final boolean DEBUG = false;
|
||||
|
||||
public boolean mapUniqueReadsTooP() {
|
||||
return INCLUDE_UNIQUE_READS;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// Standard i/o reduce
|
||||
//
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.close();
|
||||
}
|
||||
}
|
||||
|
||||
public SAMFileWriter reduceInit() {
|
||||
if ( outputFilename != null ) { // ! outputFile.equals("") ) {
|
||||
SAMFileWriterFactory fact = new SAMFileWriterFactory();
|
||||
SAMFileHeader header = this.getToolkit().getSamReader().getFileHeader();
|
||||
return fact.makeBAMWriter(header, true, new File(outputFilename));
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if ( output != null ) {
|
||||
output.addAlignment(read);
|
||||
} else {
|
||||
out.println(read.format());
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a combined read given the input list of non-unique reads. If there's just one read in the
|
||||
* set, it's considered unique and returned. If there's more than one, the N-way combine
|
||||
* duplicate function is invoked.
|
||||
*
|
||||
* @param loc
|
||||
* @param refBases
|
||||
* @param context
|
||||
* @param duplicateReads
|
||||
* @return
|
||||
*/
|
||||
public SAMRecord map(GenomeLoc loc, byte[] refBases, LocusContext context, List<SAMRecord> duplicateReads) {
|
||||
//logger.info(String.format("%s has %d duplicates%n", loc, duplicateReads.size()));
|
||||
SAMRecord combinedRead = null;
|
||||
|
||||
if ( duplicateReads.size() == 1 ) {
|
||||
// we are a unique read
|
||||
combinedRead = duplicateReads.get(0);
|
||||
} else {
|
||||
// actually call the combine function
|
||||
//for (SAMRecord read : duplicateReads ) {
|
||||
// System.out.printf("Read %s%n", read.format());
|
||||
//}
|
||||
combinedRead = DupUtils.combineDuplicates(duplicateReads, MAX_QUALITY_SCORE);
|
||||
}
|
||||
|
||||
return combinedRead;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,20 +1,17 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DuplicateWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.duplicates.DuplicateComp;
|
||||
import org.broadinstitute.sting.utils.duplicates.DupUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -68,7 +65,7 @@ class QualityTracker {
|
|||
}
|
||||
|
||||
public void inc(DuplicateComp dc) {
|
||||
inc(dc.qLarger, dc.qSmaller, dc.mismatchP);
|
||||
inc(dc.getQLarger(), dc.getQSmaller(), dc.isMismatchP());
|
||||
}
|
||||
|
||||
public void printToStream(PrintStream out, boolean filterUnobserved) {
|
||||
|
|
@ -85,21 +82,6 @@ class QualityTracker {
|
|||
}
|
||||
}
|
||||
|
||||
class DuplicateComp {
|
||||
int qLarger;
|
||||
int qSmaller;
|
||||
boolean mismatchP;
|
||||
|
||||
public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) {
|
||||
qLarger = Math.max(b1Q, b2Q);
|
||||
qSmaller = Math.min(b1Q, b2Q);
|
||||
this.mismatchP = mismatchP;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%d %d %b", qLarger, qSmaller, mismatchP);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -118,6 +100,9 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
@Argument(fullName="combinedQuals", shortName="combinedQuals", required=false, doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_QUALS = false;
|
||||
|
||||
@Argument(fullName="combineAllDups", shortName="combineAllDups", required=false, defaultValue="false", doc="Combine and assess pairwise base qualities")
|
||||
public boolean COMBINE_ALL_DUPS;
|
||||
|
||||
final boolean DEBUG = false;
|
||||
final private boolean ACTUALLY_DO_WORK = true;
|
||||
|
||||
|
|
@ -146,18 +131,17 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
return pairwiseComps;
|
||||
|
||||
if ( COMBINE_QUALS ) {
|
||||
Pair<SAMRecord, SAMRecord> combinedReads = combinedReadPair( duplicateReads );
|
||||
Pair<SAMRecord, SAMRecord> combinedReads = DupUtils.combinedReadPair( duplicateReads );
|
||||
if ( combinedReads != null ) {
|
||||
SAMRecord combined1 = combinedReads.first;
|
||||
SAMRecord combined2 = combinedReads.second;
|
||||
addPairwiseMatches( pairwiseComps, combined1, combined2 );
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int nComparisons = 0;
|
||||
for ( SAMRecord read1 : duplicateReads ) {
|
||||
for ( SAMRecord read2 : duplicateReads ) {
|
||||
if ( usableDuplicate(read1, read2) ) {
|
||||
if ( DupUtils.usableDuplicate(read1, read2) ) {
|
||||
nComparisons++;
|
||||
addPairwiseMatches( pairwiseComps, read1, read2 );
|
||||
if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET )
|
||||
|
|
@ -169,10 +153,6 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
|
||||
return pairwiseComps;
|
||||
}
|
||||
|
||||
private boolean usableDuplicate( SAMRecord read1, SAMRecord read2 ) {
|
||||
return read1 != read2 && read1.getReadLength() == read2.getReadLength();
|
||||
}
|
||||
|
||||
private List<DuplicateComp> addPairwiseMatches(List<DuplicateComp> comps,
|
||||
SAMRecord read1, SAMRecord read2 ) {
|
||||
|
|
@ -191,94 +171,4 @@ public class DuplicateQualsWalker extends DuplicateWalker<List<DuplicateComp>, Q
|
|||
|
||||
return comps;
|
||||
}
|
||||
|
||||
private Pair<SAMRecord, SAMRecord> combinedReadPair( List<SAMRecord> duplicateReads ) {
|
||||
if ( duplicateReads.size() < 4 )
|
||||
return null;
|
||||
|
||||
SAMRecord c1 = combineDuplicates(duplicateReads.get(0),duplicateReads.get(1));
|
||||
SAMRecord c2 = combineDuplicates(duplicateReads.get(2),duplicateReads.get(3));
|
||||
return new Pair<SAMRecord, SAMRecord>(c1, c2);
|
||||
}
|
||||
|
||||
private SAMRecord sample3rdRead( List<SAMRecord> duplicateReads, SAMRecord read1, SAMRecord read2 ) {
|
||||
if ( duplicateReads.size() <= 2 ) {
|
||||
// no third unique read is available
|
||||
return null;
|
||||
} else {
|
||||
for ( SAMRecord read3 : duplicateReads ) {
|
||||
if ( usableDuplicate(read1, read3) && usableDuplicate(read2, read3) )
|
||||
return read3;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private SAMRecord tmpCopyRead(SAMRecord read) {
|
||||
SAMRecord copy = new SAMRecord(read.getHeader());
|
||||
copy.setReadName(read.getReadName());
|
||||
//copy.setReadString(final String value) {
|
||||
copy.setReadBases(read.getReadBases());
|
||||
copy.setBaseQualities(read.getBaseQualities());
|
||||
copy.setReferenceName(read.getReferenceName());
|
||||
copy.setReferenceIndex(read.getReferenceIndex());
|
||||
copy.setMateReferenceName(read.getMateReferenceName());
|
||||
copy.setMateReferenceIndex(read.getMateReferenceIndex());
|
||||
copy.setAlignmentStart(read.getAlignmentStart());
|
||||
//copy.setAlignmentEnd(read.getAlignmentEnd());
|
||||
copy.setMateAlignmentStart(read.getMateAlignmentStart());
|
||||
copy.setInferredInsertSize(read.getInferredInsertSize());
|
||||
copy.setMappingQuality(read.getMappingQuality());
|
||||
copy.setCigar(read.getCigar());
|
||||
copy.setFlags(copy.getFlags());
|
||||
|
||||
return copy;
|
||||
}
|
||||
|
||||
private SAMRecord combineDuplicates(SAMRecord read1, SAMRecord read2) {
|
||||
byte[] read1Bases = read1.getReadBases();
|
||||
byte[] read1Quals = read1.getBaseQualities();
|
||||
byte[] read2Bases = read2.getReadBases();
|
||||
byte[] read2Quals = read2.getBaseQualities();
|
||||
|
||||
byte[] bases = new byte[read1Bases.length];
|
||||
byte[] quals = new byte[read1Bases.length];
|
||||
|
||||
SAMRecord c = tmpCopyRead(read1);
|
||||
for ( int i = 0; i < read1Bases.length; i++) {
|
||||
byte base1 = read1Bases[i];
|
||||
byte base2 = read2Bases[i];
|
||||
byte qual1 = read1Quals[i];
|
||||
byte qual2 = read2Quals[i];
|
||||
final double p1 = QualityUtils.qualToProb(qual1);
|
||||
final double p2 = QualityUtils.qualToProb(qual2);
|
||||
|
||||
double pc;
|
||||
byte basec;
|
||||
|
||||
if ( base1 == base2 ) {
|
||||
// agreement
|
||||
basec = base1;
|
||||
pc = 1 - (1 - p1) * (1 - p2);
|
||||
} else {
|
||||
// disagreement
|
||||
basec = p1 > p2 ? base1 : base2;
|
||||
pc = p1 > p2 ? p1 : p2;
|
||||
//pc = 0;
|
||||
}
|
||||
|
||||
bases[i] = basec;
|
||||
quals[i] = QualityUtils.probToQual(pc, 0.0);
|
||||
|
||||
if ( DEBUG )
|
||||
logger.debug(String.format("Combining %s (Q%2d) with %s (Q%2d) -> %s (Q%2d)%s%n",
|
||||
(char)base1, qual1, (char)base2, qual2, (char)bases[i], quals[i],
|
||||
base1 == base2 ? "" : " [MISMATCH]"));
|
||||
}
|
||||
c.setReadBases(bases);
|
||||
c.setBaseQualities(quals);
|
||||
|
||||
return c;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue