RealignerTargetCreator is officially live

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2697 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-01-27 03:41:52 +00:00
parent c1f154c31f
commit 476d6f3076
6 changed files with 13 additions and 465 deletions

View File

@ -1,137 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.AlignmentBlock;
import net.sf.samtools.CigarElement;
import java.util.List;
/**
* Emits intervals consisting of indels from the aligned reads.
*/
@WalkerName("IndelIntervals")
@Requires({DataSource.READS, DataSource.REFERENCE})
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval, IndelIntervalWalker.Interval> {
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
boolean allow454 = false;
@Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false)
int minIntervalIndelCount = 1;
public void initialize() {}
public boolean filter(char[] ref, SAMRecord read) {
return ( !read.getReadUnmappedFlag() && // mapped
read.getMappingQuality() != 0 && // positive mapping quality
read.getCigarLength() > 1 && // indel
(allow454 || !Utils.is454Read(read)) );
}
public Interval map(char[] ref, SAMRecord read) {
// List<AlignmentBlock> blocks = read.getAlignmentBlocks();
// long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1;
// long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1;
long indelLeftEdge = -1;
long indelRightEdge = -1;
int lengthOnRef = 0; // length of the event on the reference (this preset value is right for insertion)
int pos = read.getAlignmentStart();
for ( final CigarElement ce : read.getCigar().getCigarElements() ) {
switch( ce.getOperator() ) {
case S :
case H : break; // ignore clipping completely: alignment start is set to the first NON-clipped base
case P : // skip padding and matching bases, advance position on the ref:
case M : pos += ce.getLength(); break;
case D :
lengthOnRef = ce.getLength(); // deletion occupies non-zero number of bases on the ref
case I :
// for insertion we do not have to set lengthOnRef as it was already initialized to 0
if ( indelLeftEdge == -1 ) { // it's the first indel in the current read
indelLeftEdge = pos - 1; // set to the last (matching) base before the indel
}
pos += lengthOnRef;
indelRightEdge = pos; // whether it is the first indel in the read or not, we set right edge immediately after it
break;
default :
throw new StingException("Unrecognized cigar element '"+ce.getOperator()+"' in read "+read.getReadName());
}
}
// at this point indelLeftEdge == -1 or [indelLeftEdge,indelRightEdge] is the interval encompassing ALL indels in the current read:
if ( indelLeftEdge == -1 ) return null;
// we've seen some bam files coming from other centers that have reads (and indels!!) hanging over the contig ends;
// we do not want to deal with this stuff (what is it, anyway??)
if ( indelRightEdge > GenomeLocParser.getContigInfo(read.getReferenceName()).getSequenceLength() ) {
System.out.println("WARNING: read " + read.getReadName()+" contains indel(s) spanning past the contig end (read ignored).");
return null;
}
// if ( indelLeftEdge == 49563377 ) System.out.println("read: " +read.getReadName());
GenomeLoc indelLoc = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge);
GenomeLoc refLoc = GenomeLocParser.createGenomeLoc(read);
// if ( indelLeftEdge == 10313124 || indelLeftEdge == 10313170 ) System.out.println("read: " +read.getReadName() + " ; " + refLoc + " ; " + indelLoc);
return new Interval(refLoc, indelLoc);
}
public Interval reduceInit() {
return null;
}
// Note that the reduce step assumes that reductions occur in order (i.e. no hierarchical reductions),
// although this can easily be changed if necessary.
public Interval reduce(Interval value, Interval sum) {
// if there is no interval to the left, then this is the first one
if ( sum == null )
return value;
if ( value == null ) return sum; // nothing to do, wait for the next
//System.out.println(sum + " ==> " + value);
// if the intervals don't overlap, print out the leftmost one and start a new one
if ( !sum.readLoc.overlapsP(value.readLoc) ) {
if ( sum.indelCount >= minIntervalIndelCount )
out.println(sum);
return value;
}
// otherwise, merge them
return sum.merge(value);
}
public void onTraversalDone(Interval result) {
if ( result != null && result.indelCount >= minIntervalIndelCount )
out.println(result);
}
public class Interval {
public GenomeLoc readLoc = null;
public GenomeLoc indelLoc = null;
public int indelCount;
Interval(GenomeLoc readLoc, GenomeLoc indelLoc) {
this.indelLoc = indelLoc;
this.readLoc = readLoc;
indelCount = 1;
}
public Interval merge(Interval i) {
long indelLeftEdge = Math.min(this.indelLoc.getStart(), i.indelLoc.getStart());
long indelRightEdge = Math.max(this.indelLoc.getStop(), i.indelLoc.getStop());
GenomeLoc mergedIndelLoc = GenomeLocParser.createGenomeLoc(this.indelLoc.getContigIndex(), indelLeftEdge, indelRightEdge);
Interval mergedInterval = new Interval(this.readLoc.merge(i.readLoc), mergedIndelLoc);
mergedInterval.indelCount = this.indelCount + i.indelCount;
return mergedInterval;
}
public String toString() {
return indelLoc.toString();
}
}
}

View File

@ -1,114 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* Merges sets of intervals based on reads which overlap them; no two intervals in the final list
* will have a read that even partially spans them both. A maximum interval size is enforced though.
*/
@WalkerName("IntervalMerger")
@Requires({DataSource.READS, DataSource.REFERENCE})
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class IntervalMergerWalker extends ReadWalker<Integer,Integer> {
@Argument(fullName="intervalsToMerge", shortName="intervals", doc="Intervals to merge", required=true)
List<String> intervalsSource = null;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
boolean allow454 = false;
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false)
int maxIntervalSize = 500;
private LinkedList<GenomeLoc> intervals;
private GenomeLoc currentInterval = null;
private boolean currentIntervalIsUsed = false;
@Override
public void initialize() {
intervals = new LinkedList<GenomeLoc>(GenomeLocParser.parseIntervals(intervalsSource, GenomeAnalysisEngine.instance.getArguments().intervalMerging));
currentInterval = (intervals.size() > 0 ? intervals.removeFirst() : null);
}
@Override
public Integer map(char[] ref, SAMRecord read) {
if ( currentInterval == null ||
(!allow454 && Utils.is454Read(read)) )
return 0;
GenomeLoc loc = GenomeLocParser.createGenomeLoc(read);
// ignore all intervals which we've passed
while ( loc.isPast(currentInterval) ) {
if ( currentIntervalIsUsed && (maxIntervalSize <= 0 || currentInterval.getStop() - currentInterval.getStart() <= maxIntervalSize) ) {
out.println(currentInterval);
currentIntervalIsUsed = false;
}
if ( intervals.size() > 0 ) {
currentInterval = intervals.removeFirst();
} else {
currentInterval = null;
return 0;
}
}
// if we're not yet in the current interval, we're done
if ( !loc.overlapsP(currentInterval))
return 0;
// at this point, we're in the current interval.
// now we can merge any other intervals which we overlap
currentIntervalIsUsed = true;
while ( intervals.size() > 0 && loc.overlapsP(intervals.getFirst()) )
currentInterval = GenomeLocParser.setStop(currentInterval, intervals.removeFirst().getStop());
return 1;
}
@Override
public Integer reduceInit() { return 0; }
@Override
public Integer reduce( Integer value, Integer accum ) {
return accum + value;
}
@Override
public void onTraversalDone( Integer value ) {
if ( currentInterval != null && currentIntervalIsUsed &&
(maxIntervalSize <= 0 || currentInterval.getStop() - currentInterval.getStart() <= maxIntervalSize) )
out.println(currentInterval);
}
}

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* Emits intervals consisting of nearby loci with high mismatch rates.
*/
@WalkerName("MismatchIntervals")
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class MismatchIntervalWalker extends LocusWalker<Pair<GenomeLoc, Boolean>, Pair<LinkedList<Boolean>, GenomeLoc>> {
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy", required=false)
int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of mismatching base qualities threshold", required=false)
double mismatchThreshold = 0.15;
@Argument(fullName="allow454Reads", shortName="454", doc="process 454 reads", required=false)
boolean allow454 = false;
private final int minReadsAtInterval = 4;
public void initialize() {
if ( windowSize < 1)
throw new RuntimeException("Window Size must be a positive integer");
}
public Pair<GenomeLoc, Boolean> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
char upperRef = Character.toUpperCase(ref.getBase());
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
int goodReads = 0, mismatchQualities = 0, totalQualities = 0;
for (int i = 0; i < reads.size(); i++) {
SAMRecord read = reads.get(i);
if ( read.getMappingQuality() == 0 ||
read.getAlignmentBlocks().size() > 1 ||
(!allow454 && Utils.is454Read(read)) )
continue;
goodReads++;
int offset = offsets.get(i);
int quality = (int)read.getBaseQualityString().charAt(offset) - 33;
totalQualities += quality;
char base = Character.toUpperCase((char)read.getReadBases()[offset]);
if ( base != upperRef )
mismatchQualities += quality;
}
boolean flag = false;
if ( goodReads >= minReadsAtInterval && (double)mismatchQualities / (double)totalQualities > mismatchThreshold )
flag = true;
return new Pair<GenomeLoc, Boolean>(context.getLocation(), flag);
}
public void onTraversalDone(Pair<LinkedList<Boolean>, GenomeLoc> sum) {
if (sum.second != null)
out.println(sum.second);
}
public Pair<LinkedList<Boolean>, GenomeLoc> reduceInit() {
return new Pair<LinkedList<Boolean>, GenomeLoc>(new LinkedList<Boolean>(), null);
}
public Pair<LinkedList<Boolean>, GenomeLoc> reduce(Pair<GenomeLoc, Boolean> value, Pair<LinkedList<Boolean>, GenomeLoc> sum) {
// if we hit a new contig, clear the list
if ( sum.second != null && sum.second.getContigIndex() != value.first.getContigIndex() ) {
out.println(sum.second);
sum.first.clear();
sum.second = null;
}
sum.first.addLast(value.second);
if ( sum.first.size() <= windowSize )
return sum;
sum.first.remove();
if ( !value.second )
return sum;
int mismatches = 0;
int firstMismatch = -1;
for (int i = 0; i < windowSize; i++) {
if ( sum.first.get(i) ) {
mismatches++;
if ( firstMismatch == -1 )
firstMismatch = i;
}
}
if ( mismatches > 1 ) {
// if there is no interval to the left, then this is the first one
if ( sum.second == null ) {
sum.second = value.first;
sum.second = GenomeLocParser.setStart(sum.second, sum.second.getStart() - windowSize + firstMismatch + 1);
}
// if the intervals don't overlap, print out the leftmost one and start a new one
else if ( value.first.getStop() - sum.second.getStop() > windowSize ) {
out.println(sum.second);
sum.second = value.first;
sum.second = GenomeLocParser.setStart(sum.second,sum.second.getStart() - windowSize + firstMismatch + 1);
}
// otherwise, merge them
else {
sum.second = GenomeLocParser.setStop(sum.second, value.first.getStop());
}
}
return sum;
}
}

View File

@ -15,30 +15,24 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads.
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 and MQ0 reads.
*/
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class RealignerTargetCreator extends LocusWalker<RealignerTargetCreator.Event, RealignerTargetCreator.Event> {
// mismatch/entropy arguments
// mismatch/entropy/SNP arguments
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
protected int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
protected double mismatchThreshold = 0.15;
// observed indels arguments
@Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false)
int minIntervalIndelCount = 1;
@Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false)
protected int minReadsAtLocus = 4;
// interval merging arguments
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false)
int maxIntervalSize = 500;
private final int minReadsAtInterval = 4;
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false)
protected int maxIntervalSize = 500;
@Override
public boolean generateExtendedEvents() { return true; }
@ -115,7 +109,7 @@ public class RealignerTargetCreator extends LocusWalker<RealignerTargetCreator.E
// make sure we're supposed to look for high entropy
if ( mismatchThreshold > 0.0 &&
mismatchThreshold <= 1.0 &&
pileup.size() >= minReadsAtInterval &&
pileup.size() >= minReadsAtLocus &&
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
hasPointEvent = true;
}

View File

@ -1,68 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariationRod;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.Variation;
/**
* Given a ROD track SNP calls called "snps", emits intervals consisting of clustered SNPs.
*/
@WalkerName("SNPClusters")
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="snps",type= VariationRod.class)})
public class SNPClusterWalker extends RefWalker<GenomeLoc, GenomeLoc> {
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating clusters", required=false)
int windowSize = 10;
public void initialize() {
if ( windowSize < 1)
throw new RuntimeException("Window Size must be a positive integer");
}
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
Variation snp = (Variation)tracker.lookup("snps", null);
return (snp != null && snp.isBiallelic() && snp.isSNP()) ? context.getLocation() : null;
}
public void onTraversalDone(GenomeLoc sum) {
if ( sum != null && sum.getStart() != sum.getStop() )
out.println(sum);
}
public GenomeLoc reduceInit() {
return null;
}
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
// ignore non-SNP variants
if ( value == null )
return sum;
// if we have no previous SNPs start with the new location
if ( sum == null )
return value;
// if we hit a new contig, emit and start with the new location
if ( sum.getContigIndex() != value.getContigIndex() ) {
if ( sum.getStart() != sum.getStop() )
out.println(sum);
return value;
}
// if the last SNP location was within a window, merge them
if ( value.getStart() - sum.getStop() <= windowSize ) {
sum = GenomeLocParser.setStop(sum,value.getStart());
return sum;
}
// otherwise, emit and start with the new location
if ( sum.getStart() != sum.getStop() )
out.println(sum);
return value;
}
}

View File

@ -10,22 +10,15 @@ public class IntervalsIntegrationTest extends WalkerTest {
public void testIntervals() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
"-T IndelIntervals -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -o %s",
"-T RealignerTargetCreator -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("76f97b91921f427ab639b6b8228ac4dc"));
executeTest("testIndelIntervals", spec1);
Arrays.asList("d21e83a8b0d3f63acd9ca3b0b636e515"));
executeTest("test standard", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T MismatchIntervals -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -o %s",
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("31e8b5d4c42f2c63c08b8f6b8e10ac99"));
executeTest("testMismatchIntervals", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
"-T IntervalMerger -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-11,000,000 -intervals " + validationDataLocation + "indelIntervals.test -intervals " + validationDataLocation + "mismatchIntervals.test -o %s",
1,
Arrays.asList("bf1f23667ef0065bbcb9754f50c2d664"));
executeTest("testMergeIntervals", spec3);
Arrays.asList("bfccfa50f62d10ee2fe8cfa68fb70002"));
executeTest("test dbsnp", spec2);
}
}