Optimizations:
1. push the ReadBackedPileup filtering up into the ReadFilters for read-based filters 2. stop querying the cigar for its length (just do it once) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2381 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
36875fca89
commit
bb92e31118
|
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter out reads with low mapping qualities.
|
||||||
|
*
|
||||||
|
* @author ebanks
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class BadMateReadFilter implements SamRecordFilter {
|
||||||
|
|
||||||
|
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
|
||||||
|
public boolean USE_BADLY_MATED_READS = false;
|
||||||
|
|
||||||
|
public boolean filterOut(SAMRecord rec) {
|
||||||
|
return (!USE_BADLY_MATED_READS && rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getMateReferenceIndex() != rec.getReferenceIndex());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter out reads with low mapping qualities.
|
||||||
|
*
|
||||||
|
* @author ebanks
|
||||||
|
* @version 0.1
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class MappingQualityReadFilter implements SamRecordFilter {
|
||||||
|
|
||||||
|
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
|
||||||
|
public int MIN_MAPPING_QUALTY_SCORE = 30;
|
||||||
|
|
||||||
|
public boolean filterOut(SAMRecord rec) {
|
||||||
|
return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -115,7 +115,7 @@ public class DepthOfCoverageWalker extends LocusWalker<DepthOfCoverageWalker.DoC
|
||||||
|
|
||||||
// fill in and print all of the per-locus coverage data, then return it to reduce
|
// fill in and print all of the per-locus coverage data, then return it to reduce
|
||||||
|
|
||||||
ReadBackedPileup pileup = context.getPileup().getBaseAndMappingFilteredPileup(minBaseQ, -1);
|
ReadBackedPileup pileup = context.getPileup().getBaseFilteredPileup(minBaseQ);
|
||||||
|
|
||||||
DoCInfo info = new DoCInfo();
|
DoCInfo info = new DoCInfo();
|
||||||
info.totalCoverage = pileup.size();
|
info.totalCoverage = pileup.size();
|
||||||
|
|
|
||||||
|
|
@ -91,12 +91,6 @@ public class UnifiedArgumentCollection {
|
||||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||||
public int MIN_BASE_QUALTY_SCORE = 20;
|
public int MIN_BASE_QUALTY_SCORE = 20;
|
||||||
|
|
||||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling (MQ0 reads are ignored regardless)", required = false)
|
|
||||||
public int MIN_MAPPING_QUALTY_SCORE = 30;
|
|
||||||
|
|
||||||
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
|
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
|
||||||
public int MAX_MISMATCHES = 3;
|
public int MAX_MISMATCHES = 3;
|
||||||
|
|
||||||
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
|
|
||||||
public boolean USE_BADLY_MATED_READS = false;
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
import org.broadinstitute.sting.gatk.contexts.*;
|
import org.broadinstitute.sting.gatk.contexts.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
|
@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.genotype.*;
|
||||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
|
||||||
import net.sf.samtools.SAMReadGroupRecord;
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -48,7 +49,7 @@ import java.util.*;
|
||||||
* multi-sample, and pooled data. The user can choose from several different incorporated calculation models.
|
* multi-sample, and pooled data. The user can choose from several different incorporated calculation models.
|
||||||
*/
|
*/
|
||||||
@Reference(window=@Window(start=-20,stop=20))
|
@Reference(window=@Window(start=-20,stop=20))
|
||||||
@ReadFilters({ZeroMappingQualityReadFilter.class})
|
@ReadFilters({ZeroMappingQualityReadFilter.class,MappingQualityReadFilter.class,BadMateReadFilter.class})
|
||||||
public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genotype>>, Integer> {
|
public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genotype>>, Integer> {
|
||||||
|
|
||||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||||
|
|
@ -183,7 +184,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
||||||
headerInfo.addAll(VCFGenotypeRecord.getSupportedHeaderStrings());
|
headerInfo.addAll(VCFGenotypeRecord.getSupportedHeaderStrings());
|
||||||
|
|
||||||
// all of the arguments from the argument collection
|
// all of the arguments from the argument collection
|
||||||
Map<String,String> commandLineArgs = CommandLineUtils.getApproximateCommandLineArguments(Collections.<Object>singleton(UAC));
|
Set<Object> x = new HashSet<Object>();
|
||||||
|
x.add(UAC);
|
||||||
|
x.addAll(getToolkit().getFilters());
|
||||||
|
Map<String,String> commandLineArgs = CommandLineUtils.getApproximateCommandLineArguments(x);
|
||||||
for ( Map.Entry<String, String> commandLineArg : commandLineArgs.entrySet() )
|
for ( Map.Entry<String, String> commandLineArg : commandLineArgs.entrySet() )
|
||||||
headerInfo.add(new VCFHeaderLine(String.format("UG_%s", commandLineArg.getKey()), commandLineArg.getValue()));
|
headerInfo.add(new VCFHeaderLine(String.format("UG_%s", commandLineArg.getKey()), commandLineArg.getValue()));
|
||||||
|
|
||||||
|
|
@ -203,10 +207,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
// filter the context based on min base and mapping qualities
|
// filter the context based on min base and mapping qualities
|
||||||
ReadBackedPileup pileup = rawContext.getPileup().getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
ReadBackedPileup pileup = rawContext.getPileup().getBaseFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE);
|
||||||
|
|
||||||
// filter the context based on mismatches and reads with bad mates
|
// filter the context based on mismatches
|
||||||
pileup = filterPileup(pileup, refContext, UAC.MAX_MISMATCHES, UAC.USE_BADLY_MATED_READS);
|
pileup = filterPileup(pileup, refContext, UAC.MAX_MISMATCHES);
|
||||||
|
|
||||||
// an optimization to speed things up when there is no coverage or when overly covered
|
// an optimization to speed things up when there is no coverage or when overly covered
|
||||||
if ( pileup.size() == 0 ||
|
if ( pileup.size() == 0 ||
|
||||||
|
|
@ -241,11 +245,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
|
||||||
}
|
}
|
||||||
|
|
||||||
// filter based on maximum mismatches and bad mates
|
// filter based on maximum mismatches and bad mates
|
||||||
private static ReadBackedPileup filterPileup(ReadBackedPileup pileup, ReferenceContext refContext, int maxMismatches, boolean useBadMates) {
|
private static ReadBackedPileup filterPileup(ReadBackedPileup pileup, ReferenceContext refContext, int maxMismatches) {
|
||||||
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
ArrayList<PileupElement> filteredPileup = new ArrayList<PileupElement>();
|
||||||
for ( PileupElement p : pileup ) {
|
for ( PileupElement p : pileup ) {
|
||||||
if ( (useBadMates || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) &&
|
if ( AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= maxMismatches )
|
||||||
AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= maxMismatches )
|
|
||||||
filteredPileup.add(p);
|
filteredPileup.add(p);
|
||||||
}
|
}
|
||||||
return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
|
return new ReadBackedPileup(pileup.getLocation(), filteredPileup);
|
||||||
|
|
|
||||||
|
|
@ -180,26 +180,28 @@ public class AlignmentUtils {
|
||||||
|
|
||||||
int mismatches = 0;
|
int mismatches = 0;
|
||||||
|
|
||||||
GenomeLoc window = ref.getWindow();
|
int windowStart = (int)ref.getWindow().getStart();
|
||||||
|
int windowStop = (int)ref.getWindow().getStop();
|
||||||
char[] refBases = ref.getBases();
|
char[] refBases = ref.getBases();
|
||||||
byte[] readBases = p.getRead().getReadBases();
|
byte[] readBases = p.getRead().getReadBases();
|
||||||
Cigar c = p.getRead().getCigar();
|
Cigar c = p.getRead().getCigar();
|
||||||
|
|
||||||
int readIndex = 0;
|
int readIndex = 0;
|
||||||
int currentPos = p.getRead().getAlignmentStart();
|
int currentPos = p.getRead().getAlignmentStart();
|
||||||
int refIndex = Math.max(0, currentPos - (int)window.getStart());
|
int refIndex = Math.max(0, currentPos - windowStart);
|
||||||
|
|
||||||
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
for (int i = 0 ; i < c.numCigarElements() ; i++) {
|
||||||
CigarElement ce = c.getCigarElement(i);
|
CigarElement ce = c.getCigarElement(i);
|
||||||
|
int cigarElementLength = ce.getLength();
|
||||||
switch ( ce.getOperator() ) {
|
switch ( ce.getOperator() ) {
|
||||||
case M:
|
case M:
|
||||||
for (int j = 0; j < ce.getLength(); j++, readIndex++, currentPos++) {
|
for (int j = 0; j < cigarElementLength; j++, readIndex++, currentPos++) {
|
||||||
// are we past the ref window?
|
// are we past the ref window?
|
||||||
if ( currentPos > window.getStop() )
|
if ( currentPos > windowStop )
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// are we before the ref window?
|
// are we before the ref window?
|
||||||
if ( currentPos < window.getStart() )
|
if ( currentPos < windowStart )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
char refChr = refBases[refIndex++];
|
char refChr = refBases[refIndex++];
|
||||||
|
|
@ -214,15 +216,15 @@ public class AlignmentUtils {
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case I:
|
case I:
|
||||||
readIndex += ce.getLength();
|
readIndex += cigarElementLength;
|
||||||
break;
|
break;
|
||||||
case D:
|
case D:
|
||||||
currentPos += ce.getLength();
|
currentPos += cigarElementLength;
|
||||||
if ( currentPos > window.getStart() )
|
if ( currentPos > windowStart )
|
||||||
refIndex += Math.min(ce.getLength(), currentPos - window.getStart());
|
refIndex += Math.min(cigarElementLength, currentPos - windowStart);
|
||||||
break;
|
break;
|
||||||
case S:
|
case S:
|
||||||
readIndex += ce.getLength();
|
readIndex += cigarElementLength;
|
||||||
break;
|
break;
|
||||||
default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator());
|
default: throw new StingException("Only M,I,D,S cigar elements are currently supported; there was " + ce.getOperator());
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -176,6 +176,11 @@ public class ReadBackedPileup implements Iterable<PileupElement> {
|
||||||
|
|
||||||
return new ReadBackedPileup(loc, filteredPileup);
|
return new ReadBackedPileup(loc, filteredPileup);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ReadBackedPileup getBaseFilteredPileup( int minBaseQ ) {
|
||||||
|
return getBaseAndMappingFilteredPileup(minBaseQ, -1);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a pileup randomly downsampled to the desiredCoverage.
|
* Returns a pileup randomly downsampled to the desiredCoverage.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue