ReduceReads is now scattered by contig

It's no longer safe to scatter/gather by interval because now we don't hard-clip to the intervals anymore.
This commit is contained in:
Mauricio Carneiro 2012-12-06 17:06:32 -05:00
parent bdda63d973
commit 8a115edbaf
3 changed files with 1 additions and 333 deletions

View File

@ -85,7 +85,7 @@ import java.util.*;
*/
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.INTERVAL)
@PartitionBy(PartitionType.CONTIG)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {

View File

@ -1,149 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.PrintStream;
import java.util.*;
/**
* Emits intervals present in either the original or reduced bam but not the other.
*
* <h2>Input</h2>
* <p>
* The original and reduced BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* A list of intervals present in one bam but not the other.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -I:original original.bam \
* -I:reduced reduced.bam \
* -R ref.fasta \
* -T AssessReducedCoverage \
* -o output.intervals
* </pre>
*
* @author ebanks
*/
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
@Hidden
public class AssessReducedCoverage extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
private static final String original = "original";
private static final String reduced = "reduced";
@Output
protected PrintStream out;
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
@Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false)
public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false;
public void initialize() {}
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
Set<String> tags = getAllTags(context.getBasePileup());
return (tags.contains(original) && !tags.contains(reduced)) ||
(OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null;
}
private Set<String> getAllTags(final ReadBackedPileup pileup) {
final Set<String> tags = new HashSet<String>(10);
for ( final PileupElement p : pileup ) {
if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() )
tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags());
}
return tags;
}
public void onTraversalDone(GenomeLoc sum) {
if ( sum != null )
out.println(sum);
}
public GenomeLoc reduceInit() {
return null;
}
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
if ( lhs == null )
return rhs;
if ( rhs == null )
return lhs;
// if contiguous, just merge them
if ( lhs.contiguousP(rhs) )
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
// otherwise, print the lhs and start over with the rhs
out.println(lhs);
return rhs;
}
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
if ( value == null )
return sum;
if ( sum == null )
return value;
// if contiguous, just merge them
if ( sum.contiguousP(value) )
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
// otherwise, print the sum and start over with the value
out.println(sum);
return value;
}
}

View File

@ -1,183 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.io.PrintStream;
import java.util.List;
/**
* Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of
* the reduced bam are above sufficient threshold)
*
* <h2>Input</h2>
* <p>
* The original and reduced BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -I:original original.bam \
* -I:reduced reduced.bam \
* -R ref.fasta \
* -T AssessReducedQuals \
* -o output.intervals
* </pre>
*
* @author ami
*/
public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
private static final String reduced = "reduced";
private static final int originalQualsIndex = 0;
private static final int reducedQualsIndex = 1;
@Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false)
public int sufficientQualSum = 600;
@Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false)
public int qual_epsilon = 0;
@Argument(fullName = "debugLevel", shortName = "debug", doc = "debug level: NO_DEBUG, PRINT_LOCI,PRINT_PILEUPS", required = false)
public DebugLevel debugLevel = DebugLevel.NO_DEBUG;
@Output
protected PrintStream out;
public void initialize() {
if (debugLevel != DebugLevel.NO_DEBUG)
out.println(" Debug mode" +
"Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " +
"Debug:\tqual_epsilon: "+qual_epsilon);
}
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
@Override
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return null;
boolean reportLocus;
final int[] quals = getPileupQuals(context.getBasePileup());
if (debugLevel != DebugLevel.NO_DEBUG)
out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]);
final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon);
final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum);
final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum);
final int OriginalReducedQualDiff = calcOriginalQuals - calcReducedQuals;
reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon;
if(debugLevel != DebugLevel.NO_DEBUG && reportLocus)
out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff);
return reportLocus ? ref.getLocus() : null;
}
private int[] getPileupQuals(final ReadBackedPileup readPileup) {
final int[] quals = new int[2];
String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n",
"Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"};
for( PileupElement p : readPileup ){
final List<String> tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags();
if ( isGoodRead(p) ){
final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount();
final int tagIndex = getTagIndex(tags);
quals[tagIndex] += tempQual;
if(debugLevel == DebugLevel.PRINT_PILEUPS)
printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n";
}
}
if(debugLevel == DebugLevel.PRINT_PILEUPS){
out.println(printPileup[originalQualsIndex]);
out.println(printPileup[reducedQualsIndex]);
}
return quals;
}
private boolean isGoodRead(PileupElement p){
// TODO -- You want to check whether the read itself is a reduced read and only
// TODO -- for them you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff).
return !p.isDeletion() && ((p.getRead().isReducedRead()) || (!p.getRead().isReducedRead() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20));
}
private int getTagIndex(List<String> tags){
return tags.contains(reduced) ? 1 : 0;
}
@Override
public void onTraversalDone(GenomeLoc sum) {
if ( sum != null )
out.println(sum);
}
@Override
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
if ( lhs == null )
return rhs;
if ( rhs == null )
return lhs;
// if contiguous, just merge them
if ( lhs.contiguousP(rhs) )
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
// otherwise, print the lhs and start over with the rhs
out.println(lhs);
return rhs;
}
@Override
public GenomeLoc reduceInit() {
return null;
}
@Override
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
if ( value == null )
return sum;
if ( sum == null )
return value;
// if contiguous, just merge them
if ( sum.contiguousP(value) )
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
// otherwise, print the sum and start over with the value
out.println(sum);
return value;
}
public enum DebugLevel {
NO_DEBUG,
/**
* Print locus level information (such as locus quals) and loci with unmatch quals
*/
PRINT_LOCI,
/**
* Print the pileup infomarion of the reduced bam files and the original bam files
*/
PRINT_PILEUPS
}
}