From 0cafd3d6429ac63014adfe62738caa142e7f270e Mon Sep 17 00:00:00 2001 From: aaron Date: Tue, 22 Jun 2010 22:42:38 +0000 Subject: [PATCH] clip VCF alleles for indels: only a single left base, and as many right bases as align before converting to variant context. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3614 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/features/vcf4/VCF4Codec.java | 29 +++++------------- .../refdata/features/vcf4/VCF4UnitTest.java | 30 +++++++++++++++++-- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java index b04f7c8fe..53c885cec 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4Codec.java @@ -409,44 +409,29 @@ public class VCF4Codec implements FeatureCodec { static Pair> clipAlleles(String contig, long position, String ref, List unclippedAlleles) { List newAlleleList = new ArrayList(); - // find the smallest allele - int minimumAllele = (unclippedAlleles.size() > 0) ? unclippedAlleles.get(0).length() : Integer.MAX_VALUE; - for (Allele smallest : unclippedAlleles) - if (smallest.length() < minimumAllele) - minimumAllele = smallest.length(); - - // find the preceeding string common to all alleles and the reference - int forwardClipped = 0; boolean clipping = true; - while (clipping) { - for (Allele a : unclippedAlleles) - if (forwardClipped > ref.length() - 1) - clipping = false; - else if (a.length() <= forwardClipped || (a.getBases()[forwardClipped] != ref.getBytes()[forwardClipped])) { + for (Allele a : unclippedAlleles) + if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) { clipping = false; } - if (clipping) forwardClipped++; - } + int forwardClipping = (clipping) ? 1 : 0; int reverseClipped = 0; clipping = true; while (clipping) { for (Allele a : unclippedAlleles) - if (a.length() - reverseClipped < 0 || a.length() - forwardClipped == 0) + if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0) clipping = false; else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1]) clipping = false; if (clipping) reverseClipped++; } - // check to see if we're about to clip all the bases from the reference, if so back off the front clip a base - //if (forwardClipped + reverseClipped >= ref.length() || forwardClipped + reverseClipped >= minimumAllele) - // forwardClipped--; - + for (Allele a : unclippedAlleles) - newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipped,a.getBases().length-reverseClipped),a.isReference())); - return new Pair>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipped,(position+ref.length()-reverseClipped-1)), + newAlleleList.add(Allele.create(Arrays.copyOfRange(a.getBases(),forwardClipping,a.getBases().length-reverseClipped),a.isReference())); + return new Pair>(GenomeLocParser.createGenomeLoc(contig,position+forwardClipping,(position+ref.length()-reverseClipped-1)), newAlleleList); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java index 032d6fe19..4da25db74 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/features/vcf4/VCF4UnitTest.java @@ -199,8 +199,7 @@ public class VCF4UnitTest extends BaseTest { public void checkLargeVCF() { TestSetup testSetup = new TestSetup().invoke(largeVCF); AsciiLineReader reader = testSetup.getReader(); - VCF4Codec codec = testSetup.getCodec(); - + // now parse the lines String line = null; try { @@ -299,6 +298,30 @@ public class VCF4UnitTest extends BaseTest { Assert.assertTrue(locAndList.second.get(2).toString().equals("-")); } + /** + * test out the clipping of alleles (removing extra context provided by VCF implementation). + */ + @Test + public void testClippingManyPotentialFrontClippedBases() { + String ref = "GGGGTT"; + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create(ref,true)); + alleles.add(Allele.create("GGGGAATT",false)); + alleles.add(Allele.create("GGGT",false)); + + Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,5))); + + // we know the ordering + //System.err.println(locAndList.second.get(0).toString()); + //System.err.println(locAndList.second.get(1).toString()); + //System.err.println(locAndList.second.get(2).toString()); + Assert.assertTrue(locAndList.second.get(0).toString().equals("GGGT*")); + Assert.assertTrue(locAndList.second.get(0).isReference()); + Assert.assertTrue(locAndList.second.get(1).toString().equals("GGGAAT")); + Assert.assertTrue(locAndList.second.get(2).toString().equals("GG")); + } + /** * test out the clipping of alleles (removing extra context provided by VCF implementation). */ @@ -322,6 +345,7 @@ public class VCF4UnitTest extends BaseTest { /** * test out the clipping of alleles (removing extra context provided by VCF implementation). + * TODO - this is kind of a tricky test... we don't know which way clipped the reads, but the position should be accurate */ @Test public void testClippingOfAllelesLongRefRepeatClippable() { @@ -332,7 +356,7 @@ public class VCF4UnitTest extends BaseTest { alleles.add(Allele.create("GGG",false)); Pair> locAndList = VCF4Codec.clipAlleles("1",1,ref,alleles); - Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",3,5))); + Assert.assertTrue(locAndList.first.equals(GenomeLocParser.createGenomeLoc("1",2,4))); // we know the ordering Assert.assertTrue(locAndList.second.get(0).toString().equals("GGG*"));