diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 0879b21d0..5e5b425d6 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -11,9 +11,7 @@ import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.traversals.*; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; @@ -269,9 +267,14 @@ public class GenomeAnalysisTK extends CommandLineProgram { } } else if ( my_walker instanceof LocusWindowWalker ) { this.engine = new TraverseByLocusWindows(INPUT_FILES, REF_FILE_ARG, rods); - } else { + } else if ( my_walker instanceof ReadWalker) { // we're a read walker this.engine = new TraverseByReads(INPUT_FILES, REF_FILE_ARG, rods); + } else if ( my_walker instanceof DuplicateWalker) { + // we're a duplicate walker + this.engine = new TraverseDuplicates(INPUT_FILES, REF_FILE_ARG, rods); + } else { + throw new RuntimeException("Unexpected walker type: " + my_walker); } // Prepare the sort ordering w.r.t. the sequence dictionary diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java new file mode 100755 index 000000000..b8f783c93 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -0,0 +1,289 @@ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.dataSources.providers.LocusContextProvider; +import org.broadinstitute.sting.gatk.dataSources.shards.ReadShard; +import org.broadinstitute.sting.gatk.dataSources.shards.Shard; +import org.broadinstitute.sting.gatk.iterators.BoundedReadIterator; +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; + +import java.io.File; +import java.util.*; + +import edu.mit.broad.picard.filter.FilteringIterator; +import edu.mit.broad.picard.filter.SamRecordFilter; + +/** + * + * User: aaron + * Date: Apr 24, 2009 + * Time: 10:35:22 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author Mark DePristo + * @version 0.1 + * @date Apr 29, 2009 + *

+ * Class TraverseDuplicates + *

+ * This class handles traversing lists of duplicate reads in the new shardable style + */ +public class TraverseDuplicates extends TraversalEngine { + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(TraverseDuplicates.class); + + private final boolean DEBUG = false; + + /** + * Creates a new, uninitialized TraversalEngine + * + * @param reads SAM/BAM file of reads + * @param ref Reference file in FASTA format, assumes a .dict file is also available + * @param rods Array of reference ordered data sets + */ + public TraverseDuplicates(List reads, File ref, List> rods) { + super(reads, ref, rods); + } + + private List readsAtLoc(final SAMRecord read, PushbackIterator iter) + { + GenomeLoc site = new GenomeLoc(read); + ArrayList l = new ArrayList(); + + l.add(read); + for (SAMRecord read2: iter) { + GenomeLoc site2 = new GenomeLoc(read2); + + // the next read starts too late + if ( site2.getStart() != site.getStart() ) { + //System.out.printf("site = %s, site2 = %s%n", site, site2); + iter.pushback(read2); + break; + } else { + //System.out.printf("Read is a duplicate: %s%n", read.format()); + l.add(read2); + } + } + + return l; + } + + private List collectDuplicates(List reads) { + ArrayList dups = new ArrayList(); + + // find the first duplicate + SAMRecord key = null; + for ( SAMRecord read : reads ) { + if ( read.getDuplicateReadFlag() ) { + // this is our key + key = read; + if (DEBUG) logger.debug(String.format("Key %s is a duplicate", read.getReadName())); + break; + } + } + + // At this point, there are two possibilities, we have found at least one dup or not + //System.out.printf("Key is %s%n", key); + if ( key != null ) { + final GenomeLoc keyLoc = new GenomeLoc(key); + final GenomeLoc keyMateLoc = new GenomeLoc(key.getMateReferenceIndex(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); + + for ( SAMRecord read : reads ) { + final GenomeLoc readLoc = new GenomeLoc(read); + final GenomeLoc readMateLoc = new GenomeLoc(read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); + if (DEBUG) logger.debug(String.format("Examining reads at %s vs. %s at %s / %s vs. %s / %s%n", key.getReadName(), read.getReadName(), keyLoc, keyMateLoc, readLoc, readMateLoc)); + if ( readLoc.compareTo(keyLoc) == 0 && readMateLoc.compareTo(keyMateLoc) == 0 ) { + // we are at the same position as the dup and have the same mat pos, it's a dup + if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s%n", read)); + dups.add(read); + } + } + } + + return dups; + } + + /** + * Traverse by reads, given the data and the walker + * @param walker the walker to execute over + * @param shard the shard of data to feed the walker + * @param locusProvider the factory for loci + * @param sum of type T, the return from the walker + * @param the generic type + * @param the return type of the reduce function + * @return + */ + public T actuallyTraverse(DuplicateWalker dupWalker, + Iterator readIter, + T sum) { + // while we still have more reads + // ok, here's the idea. We get all the reads that start at the same position in the genome + // We then split the list of reads into sublists of reads: + // -> those with the same mate pair position, for paired reads + // -> those flagged as unpaired and duplicated but having the same start and end and + PushbackIterator iter = new PushbackIterator(readIter); + for (SAMRecord read: iter) { + // get the genome loc from the read + GenomeLoc site = new GenomeLoc(read); + logger.debug(String.format("*** TraverseDuplicates.traverse at %s", site)); + List reads = readsAtLoc(read, iter); + List duplicateReads = collectDuplicates(reads); + + // Jump forward in the reference to this locus location + LocusContext locus = new LocusContext(site, duplicateReads, Arrays.asList(0)); + + // update the number of duplicate sets we've seen + TraversalStatistics.nRecords++; + + // we still have to fix the locus context provider to take care of this problem with > 1 length contexts + // LocusContext locus = locusProvider.getLocusContext(site); + + byte[] refBases = new byte[0]; + + final boolean keepMeP = dupWalker.filter(site, refBases, locus, duplicateReads); + if (keepMeP) { + M x = dupWalker.map(site, refBases, locus, duplicateReads); + sum = dupWalker.reduce(x, sum); + } + + printProgress("dups", site); + + if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords))); + break; + } + } + + return sum; + } + + /** + * Class to filter out un-handle-able reads from the stream. We currently are skipping + * unmapped reads, non-primary reads, unaligned reads, and duplicate reads. + */ + public static class duplicateStreamFilterFunc implements SamRecordFilter { + SAMRecord lastRead = null; + public boolean filterOut(SAMRecord rec) { + boolean result = false; + String why = ""; + if (rec.getReadUnmappedFlag()) { + TraversalStatistics.nUnmappedReads++; + result = true; + why = "Unmapped"; + } else if (rec.getNotPrimaryAlignmentFlag()) { + TraversalStatistics.nNotPrimary++; + result = true; + why = "Not Primary"; + } else if (rec.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { + TraversalStatistics.nBadAlignments++; + result = true; + why = "No alignment start"; + } + else { + result = false; + } + + if (result) { + TraversalStatistics.nSkippedReads++; + //System.out.printf(" [filter] %s => %b %s", rec.getReadName(), result, why); + } else { + TraversalStatistics.nReads++; + } + return result; + } + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // old style interface to the system + // + // -------------------------------------------------------------------------------------------------------------- + public T traverse(Walker walker, ArrayList locations) { + if ( walker instanceof DuplicateWalker) { + Walker x = walker; + DuplicateWalker dupWalker = (DuplicateWalker)x; + return (T)this.traverseByRead(dupWalker, locations); + } else { + throw new IllegalArgumentException("Walker isn't a duplicate walker!"); + } + } + + /** + * Should we deleted at the soonist possible opportunity + */ + public Object traverseByRead(DuplicateWalker walker, ArrayList locations) { + samReadIter = initializeReads(); + + // Initialize the walker + walker.initialize(); + + // Initialize the sum + FilteringIterator filterIter = new FilteringIterator(samReadIter, new duplicateStreamFilterFunc()); + T sum = actuallyTraverse(walker, filterIter, walker.reduceInit()); + + //printOnTraversalDone("reads", sum); + walker.onTraversalDone(sum); + return sum; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // new style interface to the system + // + // -------------------------------------------------------------------------------------------------------------- + /** + * Traverse by reads, given the data and the walker + * @param walker the walker to execute over + * @param shard the shard of data to feed the walker + * @param locusProvider the factory for loci + * @param sum of type T, the return from the walker + * @param the generic type + * @param the return type of the reduce function + * @return + */ + public T traverse(Walker walker, + Shard shard, + LocusContextProvider locusProvider, + BoundedReadIterator readIter, + T sum) { + + logger.debug(String.format("TraverseDuplicates.traverse Genomic interval is %s", ((ReadShard)shard).getSize())); + + if (!(walker instanceof DuplicateWalker)) + throw new IllegalArgumentException("Walker isn't a duplicate walker!"); + + DuplicateWalker dupWalker = (DuplicateWalker) walker; + + // while we still have more reads + // ok, here's the idea. We get all the reads that start at the same position in the genome + // We then split the list of reads into sublists of reads: + // -> those with the same mate pair position, for paired reads + // -> those flagged as unpaired and duplicated but having the same start and end and + + FilteringIterator filterIter = new FilteringIterator(readIter, new duplicateStreamFilterFunc()); + PushbackIterator iter = new PushbackIterator(filterIter); + return actuallyTraverse(dupWalker, readIter, sum); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java new file mode 100755 index 000000000..1fc494e6e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -0,0 +1,38 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.List; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class DuplicateWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { + return true; // We are keeping all the reads + } + + /** + * These two functions state whether we're don't make any sense without reads (requiresRead()) + * or whether we can't take any reads at all (cannotHandleRead()) + */ + public boolean requiresReads() { return true; } + public boolean cannotHandleReads() { return false; } + + // Map over the org.broadinstitute.sting.gatk.LocusContext + public abstract MapType map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads); + + // Given result of map function + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java new file mode 100755 index 000000000..1da9cab99 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/DuplicateQualsWalker.java @@ -0,0 +1,171 @@ +package org.broadinstitute.sting.playground.gatk.walkers; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.List; +import java.util.ArrayList; +import java.io.PrintStream; +import java.io.FileNotFoundException; + +import net.sf.samtools.SAMRecord; + +class MismatchCounter { + long nObs = 0; + long nMismatches = 0; + + public void inc(long incNObs, long incNMismatches) { + nObs += incNObs; + nMismatches += incNMismatches; + } + + public void inc(boolean mismatchP) { + inc(1, mismatchP ? 1 : 0); + } + + + public double mismatchRate() { + return (double)nMismatches / nObs; + } + + public byte empiricalQualScore() { + return QualityUtils.probToQual(1 - mismatchRate(), 0); + } + + public String headerString() { + return "mismatchRate\tempiricalQ\tnObs\tnMismatches"; + } + + public String toString() { + return String.format("%.10f\t%d\t%d\t%6d", mismatchRate(), empiricalQualScore(), nObs, nMismatches); + } +} + +class QualityTracker { + final private int MAX_QUAL_SCORE = 50; + MismatchCounter[][] mismatchesByQ = new MismatchCounter[MAX_QUAL_SCORE][MAX_QUAL_SCORE]; + + public QualityTracker() { + for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) { + for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) { + mismatchesByQ[i][j] = new MismatchCounter(); + } + } + } + + public void inc(int b1Q, int b2Q, boolean mismatchP) { + if ( b1Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b1Q); + if ( b2Q > MAX_QUAL_SCORE ) throw new RuntimeException("Unexpectedly large base quality " + b2Q); + mismatchesByQ[b1Q][b2Q].inc(mismatchP); + } + + public void inc(DuplicateComp dc) { + inc(dc.qLarger, dc.qSmaller, dc.mismatchP); + } + + public void printToStream(PrintStream out, boolean filterUnobserved) { + out.printf("Q1\tQ2\t%s%n", mismatchesByQ[0][0].headerString()); + for ( int i = 0; i < MAX_QUAL_SCORE; i++ ) { + for ( int j = 0; j < MAX_QUAL_SCORE; j++ ) { + MismatchCounter mc = mismatchesByQ[i][j]; + //System.out.printf("MC = %s%n", mc); + if ( filterUnobserved && mc.nObs == 0 ) + continue; + out.printf("%d\t%d\t%s\t%n", i, j, mc.toString()); + } + } + } +} + +class DuplicateComp { + int qLarger; + int qSmaller; + boolean mismatchP; + + public DuplicateComp( int b1Q, int b2Q, boolean mismatchP ) { + qLarger = Math.max(b1Q, b2Q); + qSmaller = Math.min(b1Q, b2Q); + this.mismatchP = mismatchP; + } + + public String toString() { + return String.format("%d %d %b", qLarger, qSmaller, mismatchP); + } +} + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +public class DuplicateQualsWalker extends DuplicateWalker, QualityTracker> { + @Argument(fullName="filterUnobservedQuals", shortName="filterUnobservedQuals", required=false, defaultValue="false", doc="Show only quality bins with at least one observation in the data") + public boolean FILTER_UNOBSERVED_QUALS; + + @Argument(fullName="maxPairwiseCompsPerDupSet", shortName="maxPairwiseCompsPerDupSet", required=false, defaultValue="100", doc="Maximumize number of pairwise comparisons to perform among duplicate read sets") + public int MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET; + + final private boolean ACTUALLY_DO_WORK = true; + + public void onTraversalDone(QualityTracker result) { + result.printToStream(out, FILTER_UNOBSERVED_QUALS); + } + + public QualityTracker reduceInit() { + return new QualityTracker(); + } + + public QualityTracker reduce(List dupComps, QualityTracker tracker) { + for ( DuplicateComp dc : dupComps ) { + tracker.inc(dc); + } + + return tracker; + } + + // Print out data for regression + public List map(GenomeLoc loc, byte[] refBases, LocusContext context, List duplicateReads) { + //out.printf("%s has %d duplicates%n", loc, duplicateReads.size()); + List comps = new ArrayList(); + + if ( ! ACTUALLY_DO_WORK ) + return comps; + + int nComparisons = 0; + for ( SAMRecord read1 : duplicateReads ) { + byte[] read1Bases = read1.getReadBases(); + byte[] read1Quals = read1.getBaseQualities(); + + for ( SAMRecord read2 : duplicateReads ) { + if ( read1 != read2 && read1.getReadLength() == read2.getReadLength()) { + byte[] read2Bases = read2.getReadBases(); + byte[] read2Quals = read2.getBaseQualities(); + nComparisons++; + + for ( int i = 0; i < read1Bases.length; i++) { + byte read1Q = read1Quals[i]; + byte read2Q = read2Quals[i]; + boolean mismatchP = read1Bases[i] != read2Bases[i]; + DuplicateComp dc = new DuplicateComp(read1Q, read2Q, mismatchP); + + //logger.debug(String.format("dc: %s", dc)); + comps.add(dc); + } + + if ( nComparisons > MAX_PAIRSIZE_COMPS_PER_DUPLICATE_SET ) + break; + } + } + } + + return comps; + } +} \ No newline at end of file