From b2a18a9d61f3633431d8f4e40302783f232ff9f2 Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 18 Aug 2009 03:04:12 +0000 Subject: [PATCH] - first pass at a basic indel filter (for now, based on size and homopolymer runs) - fix simple indel rod printout git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1431 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/SimpleIndelROD.java | 3 +- .../walkers/indels/IndelFilterWalker.java | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+), 2 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java index acfd505d9..401bb5eff 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java @@ -66,8 +66,7 @@ public class SimpleIndelROD extends TabularROD implements Genotype, AllelicVaria public String toString() { StringBuffer sb = new StringBuffer(); sb.append(getLocation().getContig() + "\t" + getLocation().getStart() + "\t"); - String indel = getFWDAlleles().get(0); - sb.append((indel.length()-1) + "\t" + (isInsertion() ? "I" : "D") + "\t" + indel.substring(1)); + sb.append(length() + "\t" + (isInsertion() ? "I" : "D") + "\t" + getFWDAlleles().get(0)); return sb.toString(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java new file mode 100755 index 000000000..6ec1bbdaa --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.playground.gatk.walkers.indels; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +/** + * filter an indel callset based on given criteria + */ +@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="indels",type=AllelicVariant.class)}) +@Reference(window=@Window(start=-20,stop=20)) +public class IndelFilterWalker extends RefWalker { + @Argument(fullName="homopolymerRunMax", shortName="homopolMax", doc="filter indels within homopolymer runs greater than the given length (max 20)", required=false) + Integer HOMOPOLYMER_MAX = 20; + @Argument(fullName="homopolymerRunMin", shortName="homopolMin", doc="filter indels within homopolymer runs less than the given length", required=false) + Integer HOMOPOLYMER_MIN = 0; + @Argument(fullName="sizeMax", shortName="sizeMax", doc="filter indels greater than a certain size", required=false) + Integer SIZE_MAX = 100; + @Argument(fullName="sizeMin", shortName="sizeMin", doc="filter indels less than a certain size", required=false) + Integer SIZE_MIN = 0; + + public void initialize() { + if ( HOMOPOLYMER_MAX > 20 ) + HOMOPOLYMER_MAX = 20; + } + + public Integer reduceInit() { return 0; } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + AllelicVariant indel = (AllelicVariant)tracker.lookup("indels", null); + + if ( indel == null || !indel.isIndel() ) + return 0; + + if ( indel.length() < SIZE_MIN || indel.length() > SIZE_MAX ) + return 0; + + int homopol = homopolymerRunSize(ref, indel); + if ( homopol < HOMOPOLYMER_MIN || homopol > HOMOPOLYMER_MAX ) + return 0; + + out.println(indel); + return 1; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + out.printf("output %d indels.\n", result); + } + + private int homopolymerRunSize(ReferenceContext ref, AllelicVariant indel) { + char[] bases = ref.getBases(); + GenomeLoc window = ref.getWindow(); + GenomeLoc locus = ref.getLocus(); + + int refBasePos = (int)(locus.getStart() - window.getStart()); + char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAltBasesFWD().charAt(0); + int leftRun = 0; + for ( int i = refBasePos; i >= 0; i--) { + if ( bases[i] != indelBase ) + break; + leftRun++; + } + + indelBase = indel.isDeletion() ? bases[refBasePos+indel.length()] : indel.getAltBasesFWD().charAt(indel.getAltBasesFWD().length()-1); + int rightRun = 0; + for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.length() : 1); i < bases.length; i++) { + if ( bases[i] != indelBase ) + break; + rightRun++; + } + + System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun); + return Math.max(leftRun, rightRun); + } +} \ No newline at end of file